From b78588d1630aa6643bf021613717bafb705df4ef Mon Sep 17 00:00:00 2001
From: Yujia Zhai <yzhai015@ucr.edu>
Date: Sat, 18 Jan 2025 06:53:07 -0800
Subject: [PATCH] CUTLASS 3.7 (#2045)

* CUTLASS 3.7

* clean up changelog

---------

Co-authored-by: yuzhai <yuzhai@nvidia.com>
Co-authored-by: Haicheng Wu <haichengw@nvidia.com>
---
 CHANGELOG.md                                  |  10 +-
 CMakeLists.txt                                |   2 +-
 CUDA.cmake                                    |   2 +-
 LICENSE.txt                                   |   2 +-
 README.md                                     |  31 +-
 bin2hex.cmake                                 |   2 +-
 cmake/CTestTestfile.configure.cmake           |   2 +-
 cmake/CTestTestfile.test.configure.cmake      |   2 +-
 cmake/NvidiaCutlassPackageConfig.cmake        |   2 +-
 cmake/googletest.cmake                        |   2 +-
 cmake/nop.cu                                  |   2 +-
 cmake/version_extended.h.in                   |   2 +-
 cuBLAS.cmake                                  |   2 +-
 cuDNN.cmake                                   |   2 +-
 examples/00_basic_gemm/CMakeLists.txt         |   2 +-
 examples/00_basic_gemm/basic_gemm.cu          |   2 +-
 examples/01_cutlass_utilities/CMakeLists.txt  |   2 +-
 .../01_cutlass_utilities/cutlass_utilities.cu |   2 +-
 examples/02_dump_reg_shmem/CMakeLists.txt     |   2 +-
 examples/02_dump_reg_shmem/dump_reg_shmem.cu  |   2 +-
 examples/03_visualize_layout/CMakeLists.txt   |   2 +-
 examples/03_visualize_layout/options.h        |   2 +-
 .../03_visualize_layout/register_layout.cu    |   2 +-
 .../03_visualize_layout/register_layout.h     |   2 +-
 .../03_visualize_layout/visualize_layout.cpp  |   2 +-
 .../03_visualize_layout/visualize_layout.h    |   2 +-
 examples/04_tile_iterator/CMakeLists.txt      |   2 +-
 examples/04_tile_iterator/tile_iterator.cu    |   2 +-
 examples/05_batched_gemm/CMakeLists.txt       |   2 +-
 examples/05_batched_gemm/batched_gemm.cu      |   2 +-
 examples/06_splitK_gemm/CMakeLists.txt        |   2 +-
 examples/06_splitK_gemm/splitk_gemm.cu        |   2 +-
 .../07_volta_tensorop_gemm/CMakeLists.txt     |   2 +-
 .../volta_tensorop_gemm.cu                    |   2 +-
 .../08_turing_tensorop_gemm/CMakeLists.txt    |   2 +-
 .../turing_tensorop_gemm.cu                   |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../turing_tensorop_conv2dfprop.cu            |   2 +-
 examples/10_planar_complex/CMakeLists.txt     |   2 +-
 examples/10_planar_complex/planar_complex.cu  |   2 +-
 .../11_planar_complex_array/CMakeLists.txt    |   2 +-
 .../planar_complex_array.cu                   |   2 +-
 examples/12_gemm_bias_relu/CMakeLists.txt     |   2 +-
 examples/12_gemm_bias_relu/gemm_bias_relu.cu  |   2 +-
 .../13_two_tensor_op_fusion/CMakeLists.txt    |   2 +-
 examples/13_two_tensor_op_fusion/README.md    |   2 +-
 .../13_two_tensor_op_fusion/b2b_conv2d_run.h  |   2 +-
 .../13_two_tensor_op_fusion/b2b_gemm_run.h    |   2 +-
 .../b2b_grouped_gemm_run.h                    |   2 +-
 .../b2b_interleaved_conv2d_run.h              |   2 +-
 .../b2b_interleaved_gemm_run.h                |   2 +-
 .../13_two_tensor_op_fusion/device/b2b_gemm.h |   2 +-
 .../device/b2b_implicit_gemm_convolution.h    |   2 +-
 .../fused_two_convs_f16_sm75_rf.cu            |   2 +-
 .../fused_two_convs_f16_sm75_shmem.cu         |   2 +-
 .../fused_two_convs_f16_sm80_rf.cu            |   2 +-
 .../fused_two_convs_f16_sm80_shmem.cu         |   2 +-
 .../fused_two_convs_s8_sm75_rf.cu             |   2 +-
 .../fused_two_convs_s8_sm75_shmem.cu          |   2 +-
 .../fused_two_convs_s8_sm80_rf.cu             |   2 +-
 .../fused_two_convs_s8_sm80_shmem.cu          |   2 +-
 .../fused_two_gemms_f16_sm75_rf.cu            |   2 +-
 .../fused_two_gemms_f16_sm75_shmem.cu         |   2 +-
 .../fused_two_gemms_f16_sm80_rf.cu            |   2 +-
 .../fused_two_gemms_f16_sm80_shmem.cu         |   2 +-
 .../fused_two_gemms_grouped_f16_sm80_rf.cu    |   2 +-
 .../fused_two_gemms_s8_sm75_rf.cu             |   2 +-
 .../fused_two_gemms_s8_sm75_shmem.cu          |   2 +-
 .../fused_two_gemms_s8_sm80_rf.cu             |   2 +-
 .../fused_two_gemms_s8_sm80_shmem.cu          |   2 +-
 .../13_two_tensor_op_fusion/kernel/b2b_gemm.h |   2 +-
 .../kernel/b2b_gemm_grouped_problem_visitor.h |   2 +-
 .../kernel/b2b_implicit_gemm_convolution.h    |   2 +-
 .../kernel/default_b2b_conv2d_fprop.h         |   2 +-
 .../kernel/default_b2b_conv2d_fprop_sm75.h    |   2 +-
 .../kernel/default_b2b_conv2d_fprop_sm80.h    |   2 +-
 ...t_b2b_conv2d_fprop_smem_accumulator_sm75.h |   2 +-
 ...t_b2b_conv2d_fprop_smem_accumulator_sm80.h |   2 +-
 .../kernel/default_b2b_gemm.h                 |   2 +-
 .../default_b2b_gemm_smem_accumulator.h       |   2 +-
 .../13_two_tensor_op_fusion/kernel/grouped.h  |   2 +-
 .../reference/device/tensor_scale_bias.h      |   2 +-
 examples/13_two_tensor_op_fusion/test_run.h   |   2 +-
 .../b2b_implicit_gemm_multistage.h            |   2 +-
 ...mplicit_gemm_multistage_smem_accumulator.h |   2 +-
 .../threadblock/b2b_implicit_gemm_pipelined.h |   2 +-
 ...implicit_gemm_pipelined_smem_accumulator.h |   2 +-
 .../threadblock/b2b_mma_base.h                |   2 +-
 .../b2b_mma_base_smem_accumulator.h           |   2 +-
 .../threadblock/b2b_mma_multistage.h          |   2 +-
 .../b2b_mma_multistage_smem_accumulator.h     |   2 +-
 .../threadblock/b2b_mma_pipelined.h           |   2 +-
 .../b2b_mma_pipelined_smem_accumulator.h      |   2 +-
 .../threadblock/default_b2b_mma.h             |   2 +-
 .../default_b2b_mma_smem_accumulator.h        |   2 +-
 .../threadblock/grouped_threadblock_swizzle.h |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_tf32_tensorop_gemm.cu              |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_sparse_tensorop_gemm.cu            |   2 +-
 .../ampere_sparse_tensorop_gemm_universal.cu  |   2 +-
 ...mpere_sparse_tensorop_gemm_with_visitor.cu |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_tensorop_conv2dfprop.cu            |   2 +-
 .../17_fprop_per_channel_bias/CMakeLists.txt  |   2 +-
 .../fprop_per_channel_bias.cu                 |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_fp64_tensorop_affine2_gemm.cu      |   2 +-
 examples/19_tensorop_canonical/CMakeLists.txt |   2 +-
 .../tensorop_canonical.cu                     |   2 +-
 examples/20_simt_canonical/CMakeLists.txt     |   2 +-
 examples/20_simt_canonical/simt_canonical.cu  |   2 +-
 examples/21_quaternion_gemm/CMakeLists.txt    |   2 +-
 .../21_quaternion_gemm/quaternion_gemm.cu     |   2 +-
 examples/22_quaternion_conv/CMakeLists.txt    |   2 +-
 .../22_quaternion_conv/quaternion_conv.cu     |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_gemm_operand_reduction_fusion.cu   |   2 +-
 examples/24_gemm_grouped/CMakeLists.txt       |   2 +-
 examples/24_gemm_grouped/gemm_grouped.cu      |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_3d_fprop_mainloop_fusion.cu        |   2 +-
 .../ampere_fprop_mainloop_fusion.cu           |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_wgrad_mainloop_fusion.cu           |   2 +-
 ...pere_3xtf32_fast_accurate_tensorop_gemm.cu |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../CMakeLists.txt                            |   2 +-
 ...ere_3xtf32_fast_accurate_tensorop_fprop.cu |   2 +-
 .../29_3xtf32_complex_gemm.cu                 |   2 +-
 .../CMakeLists.txt                            |   2 +-
 examples/30_wgrad_split_k/30_wgrad_split_k.cu |   2 +-
 examples/30_wgrad_split_k/CMakeLists.txt      |   2 +-
 examples/31_basic_syrk/CMakeLists.txt         |   2 +-
 examples/31_basic_syrk/basic_syrk.cu          |   2 +-
 examples/32_basic_trmm/CMakeLists.txt         |   2 +-
 examples/32_basic_trmm/basic_trmm.cu          |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_3xtf32_tensorop_symm.cu            |   2 +-
 .../34_transposed_conv2d.cu                   |   2 +-
 examples/34_transposed_conv2d/CMakeLists.txt  |   2 +-
 examples/35_gemm_softmax/CMakeLists.txt       |   2 +-
 examples/35_gemm_softmax/gemm_softmax.cu      |   2 +-
 .../gemm_with_epilogue_visitor.h              |   2 +-
 examples/35_gemm_softmax/gemm_with_softmax.h  |   2 +-
 .../36_gather_scatter_fusion/CMakeLists.txt   |   2 +-
 .../gather_scatter_fusion.cu                  |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../gemm_layernorm.cu                         |   2 +-
 .../gemm_with_epilogue_visitor.h              |   2 +-
 .../gemm_with_layernorm.h                     |   2 +-
 examples/38_syr2k_grouped/CMakeLists.txt      |   2 +-
 examples/38_syr2k_grouped/syr2k_grouped.cu    |   2 +-
 examples/39_gemm_permute/CMakeLists.txt       |   2 +-
 examples/39_gemm_permute/gemm_permute.cu      |   2 +-
 examples/39_gemm_permute/layouts.h            |   2 +-
 examples/39_gemm_permute/permute_info.h       |   2 +-
 examples/40_cutlass_py/conv2d.py              |   2 +-
 examples/40_cutlass_py/customizable/conv2d.py |   2 +-
 examples/40_cutlass_py/customizable/gemm.py   |   2 +-
 .../customizable/gemm_grouped.py              |   2 +-
 examples/40_cutlass_py/gemm.py                |   2 +-
 examples/40_cutlass_py/gemm_grouped.py        |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../debug_utils.h                             |   2 +-
 .../default_fmha_grouped.h                    |   2 +-
 .../epilogue/epilogue_pipelined.h             |   2 +-
 .../epilogue/epilogue_rescale_output.h        |   2 +-
 .../epilogue_thread_apply_logsumexp.h         |   2 +-
 .../fmha_backward_test.py                     |   2 +-
 .../fmha_grouped.h                            |   2 +-
 .../fmha_grouped_problem_visitor.h            |   2 +-
 .../fused_multi_head_attention_backward.cu    |   2 +-
 .../fused_multihead_attention_fixed_seqlen.cu |   2 +-
 ...sed_multihead_attention_variable_seqlen.cu |   2 +-
 .../gemm/custom_mma.h                         |   2 +-
 .../gemm/custom_mma_base.h                    |   2 +-
 .../gemm/custom_mma_multistage.h              |   2 +-
 .../gemm/custom_mma_pipelined.h               |   2 +-
 .../gemm/find_default_mma.h                   |   2 +-
 .../gemm/mma_accum_lambda_iterator.h          |   2 +-
 .../gemm/mma_from_smem.h                      |   2 +-
 .../gemm_kernel_utils.h                       |   2 +-
 .../default_warp_iterator_from_smem.h         |   2 +-
 .../epilogue_predicated_tile_iterator.h       |   2 +-
 .../iterators/make_residual_last.h            |   2 +-
 ...cated_tile_access_iterator_residual_last.h |   2 +-
 .../predicated_tile_iterator_residual_last.h  |   2 +-
 .../iterators/transpose_warp_iterator.h       |   2 +-
 .../iterators/warp_iterator_from_smem.h       |   2 +-
 .../kernel_backward.h                         |   2 +-
 .../kernel_forward.h                          |   2 +-
 .../piped_subprocess.py                       |   2 +-
 .../transform/tile_smem_loader.h              |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_tensorop_group_conv.cu             |   2 +-
 .../43_ell_block_sparse_gemm/CMakeLists.txt   |   2 +-
 .../ell_block_sparse_gemm.cu                  |   2 +-
 .../44_multi_gemm_ir_and_codegen/README.md    |   2 +-
 .../default_bias_act_epilogue_tensor_op.h     |   2 +-
 ...ault_thread_map_tensor_op_for_fused_bias.h |   2 +-
 .../threadblock/fused_bias_act_epilogue.h     |   2 +-
 .../output_tile_thread_map_for_fused_bias.h   |   2 +-
 ...sed_bias_act_fragment_iterator_tensor_op.h |   2 +-
 ...r_op_fragment_iterator_without_output_op.h |   2 +-
 .../ir_gen/gen_all_code.py                    |   2 +-
 .../ir_gen/gen_cmake.py                       |   2 +-
 .../ir_gen/gen_customized_epilogue.py         |   2 +-
 .../ir_gen/gen_device.py                      |   2 +-
 .../ir_gen/gen_ir.py                          |   2 +-
 .../ir_gen/gen_kernel.py                      |   2 +-
 .../ir_gen/gen_sample.py                      |   2 +-
 .../ir_gen/gen_threadblock.py                 |   2 +-
 .../ir_gen/gen_turing_and_volta.py            |   2 +-
 .../ir_gen/gen_verify.py                      |   2 +-
 .../ir_gen/generate.sh                        |   2 +-
 .../ir_gen/helper.py                          |   2 +-
 .../ir_gen/replace_fix_impl_header.py         |   2 +-
 .../44_multi_gemm_ir_and_codegen/leaky_bias.h |   2 +-
 examples/44_multi_gemm_ir_and_codegen/utils.h |   2 +-
 examples/45_dual_gemm/CMakeLists.txt          |   2 +-
 examples/45_dual_gemm/device/dual_gemm.h      |   2 +-
 examples/45_dual_gemm/dual_gemm.cu            |   2 +-
 examples/45_dual_gemm/dual_gemm_common.h      |   2 +-
 examples/45_dual_gemm/dual_gemm_run.h         |   2 +-
 examples/45_dual_gemm/kernel/dual_gemm.h      |   2 +-
 examples/45_dual_gemm/test_run.h              |   2 +-
 .../45_dual_gemm/thread/left_silu_and_mul.h   |   2 +-
 .../45_dual_gemm/threadblock/dual_epilogue.h  |   2 +-
 .../45_dual_gemm/threadblock/dual_mma_base.h  |   2 +-
 .../threadblock/dual_mma_multistage.h         |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../depthwise_simt_conv2dfprop.cu             |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_gemm_universal_streamk.cu          |   2 +-
 ...ampere_gemm_universal_streamk_broadcast.cu |   2 +-
 .../48_hopper_warp_specialized_gemm.cu        |  12 +-
 .../CMakeLists.txt                            |   2 +-
 .../49_collective_builder.cu                  |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../50_hopper_gemm_with_epilogue_swizzle.cu   |   2 +-
 .../CMakeLists.txt                            |   2 +-
 examples/51_hopper_gett/51_hopper_gett.cu     |   2 +-
 examples/51_hopper_gett/CMakeLists.txt        |   2 +-
 examples/51_hopper_gett/gett_kernel.cuh       |   3 +-
 .../52_hopper_gather_scatter_fusion.cu        |   4 +-
 .../CMakeLists.txt                            |   2 +-
 .../gather_gemm.hpp                           |   2 +-
 .../gather_kernel.cuh                         |   2 +-
 .../scatter_epilogue.hpp                      |   2 +-
 .../53_hopper_gemm_permute.cu                 |   2 +-
 .../53_hopper_gemm_permute/CMakeLists.txt     |   2 +-
 .../53_hopper_gemm_permute/permute_kernel.cuh |   2 +-
 .../53_hopper_gemm_permute/permute_traits.hpp |   2 +-
 .../54_hopper_fp8_warp_specialized_gemm.cu    |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../hopper_fp8_commandline.hpp                |   2 +-
 .../55_hopper_int4_bf16_gemm.cu               |   2 +-
 .../55_hopper_int4_fp8_gemm.cu                |   2 +-
 .../55_hopper_mixed_dtype_gemm.cu             |   2 +-
 .../55_hopper_mixed_dtype_gemm/CMakeLists.txt |   2 +-
 examples/55_hopper_mixed_dtype_gemm/README.md |   7 +-
 .../mixed_dtype_utils.hpp                     |  20 +-
 .../packed_scale.hpp                          |   6 +-
 .../reorder_utils.hpp                         |   2 +-
 .../56_hopper_ptr_array_batched_gemm.cu       |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../57_hopper_grouped_gemm.cu                 |  11 +-
 .../57_hopper_grouped_gemm/CMakeLists.txt     |   2 +-
 examples/58_ada_fp8_gemm/CMakeLists.txt       |   2 +-
 examples/58_ada_fp8_gemm/ada_fp8_gemm.cu      |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../ampere_conv_kernel.h                      |   2 +-
 .../ampere_gather_scatter_conv.cu             |   2 +-
 examples/60_cutlass_import/CMakeLists.txt     |   2 +-
 examples/60_cutlass_import/main.cpp           |   2 +-
 .../61_hopper_gemm_with_topk_and_softmax.cu   |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../62_hopper_sparse_gemm.cu                  |  71 +-
 examples/62_hopper_sparse_gemm/CMakeLists.txt |   2 +-
 .../63_hopper_gemm_with_weight_prefetch.cu    |   2 +-
 .../CMakeLists.txt                            |   2 +-
 .../collective/builder.hpp                    |   2 +-
 .../collective/dispatch_policy_extra.hpp      |   2 +-
 ..._gmma_ss_warpspecialized_with_prefetch.hpp |   2 +-
 .../gemm_with_weight_prefetch_commandline.hpp |   2 +-
 ...gemm_tma_warpspecialized_with_prefetch.hpp |   2 +-
 .../pipeline/prefetch_pipeline_sm90.hpp       |   2 +-
 .../64_ada_fp8_gemm_grouped/CMakeLists.txt    |   2 +-
 .../ada_fp8_gemm_grouped.cu                   |   2 +-
 .../65_distributed_gemm.cu                    | 864 ++++++++++++++++++
 .../CMakeLists.txt                            |   6 +-
 examples/65_distributed_gemm/README.md        |  64 ++
 examples/65_distributed_gemm/REQUIREMENTS.md  |  86 ++
 examples/65_distributed_gemm/util/benchmark.h | 118 +++
 .../65_distributed_gemm/util/device_copy.h    |  84 ++
 ...pecialized_gemm_with_blockwise_scaling.cu} |  14 +-
 .../CMakeLists.txt                            |  32 +
 .../hopper_fp8_commandline.hpp                |   2 +-
 .../host/gemm_with_blockwise_scaling.h        |   4 +-
 examples/CMakeLists.txt                       |   6 +-
 examples/common/gather_tensor.hpp             |   2 +-
 examples/common/helper.h                      |   2 +-
 examples/cute/CMakeLists.txt                  |   2 +-
 examples/cute/tutorial/CMakeLists.txt         |   2 +-
 examples/cute/tutorial/sgemm_1.cu             |   2 +-
 examples/cute/tutorial/sgemm_2.cu             |   2 +-
 examples/cute/tutorial/sgemm_sm70.cu          |   2 +-
 examples/cute/tutorial/sgemm_sm80.cu          |   2 +-
 examples/cute/tutorial/tiled_copy.cu          |   2 +-
 examples/cute/tutorial/wgmma_sm90.cu          |   2 +-
 include/cute/algorithm/axpby.hpp              |   2 +-
 include/cute/algorithm/clear.hpp              |   2 +-
 include/cute/algorithm/cooperative_copy.hpp   |   2 +-
 include/cute/algorithm/cooperative_gemm.hpp   |   2 +-
 include/cute/algorithm/copy.hpp               |   2 +-
 include/cute/algorithm/fill.hpp               |   2 +-
 include/cute/algorithm/functional.hpp         |   2 +-
 include/cute/algorithm/gemm.hpp               |   2 +-
 include/cute/algorithm/prefer.hpp             |   2 +-
 include/cute/algorithm/prefetch.hpp           |   2 +-
 include/cute/algorithm/tensor_algorithms.hpp  |   2 +-
 include/cute/algorithm/tuple_algorithms.hpp   |   2 +-
 include/cute/arch/cluster_sm90.hpp            |   2 +-
 include/cute/arch/config.hpp                  |   2 +-
 include/cute/arch/copy.hpp                    |   2 +-
 include/cute/arch/copy_sm50.hpp               |   2 +-
 include/cute/arch/copy_sm75.hpp               |   2 +-
 include/cute/arch/copy_sm80.hpp               |   2 +-
 include/cute/arch/copy_sm90.hpp               |   2 +-
 include/cute/arch/copy_sm90_desc.hpp          |   2 +-
 include/cute/arch/copy_sm90_tma.hpp           |   2 +-
 include/cute/arch/mma.hpp                     |   2 +-
 include/cute/arch/mma_sm61.hpp                |   2 +-
 include/cute/arch/mma_sm70.hpp                |   2 +-
 include/cute/arch/mma_sm75.hpp                |   2 +-
 include/cute/arch/mma_sm80.hpp                |   2 +-
 include/cute/arch/mma_sm90.hpp                |   2 +-
 include/cute/arch/mma_sm90_desc.hpp           |   2 +-
 include/cute/arch/mma_sm90_gmma.hpp           |   2 +-
 include/cute/arch/mma_sm90_gmma_ext.hpp       |   2 +-
 include/cute/arch/mma_sm90_gmma_sparse.hpp    |   2 +-
 .../cute/arch/mma_sm90_gmma_sparse_ext.hpp    |   2 +-
 include/cute/arch/util.hpp                    |   2 +-
 include/cute/atom/copy_atom.hpp               |   2 +-
 include/cute/atom/copy_traits.hpp             |   2 +-
 include/cute/atom/copy_traits_sm50.hpp        |   2 +-
 include/cute/atom/copy_traits_sm75.hpp        |   2 +-
 include/cute/atom/copy_traits_sm80.hpp        |   2 +-
 include/cute/atom/copy_traits_sm90.hpp        |   2 +-
 include/cute/atom/copy_traits_sm90_im2col.hpp |   2 +-
 include/cute/atom/copy_traits_sm90_tma.hpp    |   2 +-
 .../atom/copy_traits_sm90_tma_swizzle.hpp     |   2 +-
 include/cute/atom/mma_atom.hpp                |   2 +-
 include/cute/atom/mma_traits.hpp              |   2 +-
 include/cute/atom/mma_traits_sm61.hpp         |   2 +-
 include/cute/atom/mma_traits_sm70.hpp         |   2 +-
 include/cute/atom/mma_traits_sm75.hpp         |   2 +-
 include/cute/atom/mma_traits_sm80.hpp         |   2 +-
 include/cute/atom/mma_traits_sm90.hpp         |   2 +-
 include/cute/atom/mma_traits_sm90_gmma.hpp    |   2 +-
 .../cute/atom/mma_traits_sm90_gmma_ext.hpp    |   2 +-
 .../cute/atom/mma_traits_sm90_gmma_sparse.hpp |   2 +-
 .../atom/mma_traits_sm90_gmma_sparse_ext.hpp  |   2 +-
 include/cute/config.hpp                       |   2 +-
 include/cute/container/alignment.hpp          |   2 +-
 include/cute/container/array.hpp              |   2 +-
 include/cute/container/array_aligned.hpp      |   2 +-
 include/cute/container/array_subbyte.hpp      |   2 +-
 include/cute/container/bit_field.hpp          |   2 +-
 include/cute/container/cuda_types.hpp         |   2 +-
 include/cute/container/packed_tuple.hpp       |   2 +-
 include/cute/container/tuple.hpp              |   2 +-
 include/cute/container/type_list.hpp          |   2 +-
 include/cute/int_tuple.hpp                    |   2 +-
 include/cute/layout.hpp                       |   2 +-
 include/cute/layout_composed.hpp              |   2 +-
 include/cute/numeric/arithmetic_tuple.hpp     |   2 +-
 include/cute/numeric/complex.hpp              |   2 +-
 include/cute/numeric/int.hpp                  |   2 +-
 include/cute/numeric/integer_sequence.hpp     |   2 +-
 include/cute/numeric/integral_constant.hpp    |  10 +-
 include/cute/numeric/integral_ratio.hpp       |   2 +-
 include/cute/numeric/math.hpp                 |   2 +-
 include/cute/numeric/numeric_types.hpp        |   2 +-
 include/cute/numeric/real.hpp                 |   2 +-
 include/cute/pointer.hpp                      |   2 +-
 include/cute/pointer_base.hpp                 |   2 +-
 include/cute/pointer_flagged.hpp              |   2 +-
 include/cute/pointer_sparse.hpp               |   2 +-
 include/cute/pointer_swizzle.hpp              |   2 +-
 include/cute/stride.hpp                       |   2 +-
 include/cute/swizzle.hpp                      |   4 +-
 include/cute/swizzle_layout.hpp               |   2 +-
 include/cute/tensor.hpp                       |   2 +-
 include/cute/tensor_impl.hpp                  |   2 +-
 include/cute/tensor_predicate.hpp             |   2 +-
 include/cute/tensor_zip.hpp                   |   2 +-
 include/cute/underscore.hpp                   |   2 +-
 include/cute/util/debug.hpp                   |   2 +-
 include/cute/util/print.hpp                   |   2 +-
 include/cute/util/type_traits.hpp             |   2 +-
 include/cutlass/aligned_buffer.h              |   2 +-
 include/cutlass/arch/arch.h                   |   2 +-
 include/cutlass/arch/barrier.h                |   2 +-
 include/cutlass/arch/cache_operation.h        |   2 +-
 include/cutlass/arch/config.h                 |   2 +-
 .../cutlass/arch/grid_dependency_control.h    |   2 +-
 include/cutlass/arch/memory.h                 |   2 +-
 include/cutlass/arch/memory_sm75.h            |   2 +-
 include/cutlass/arch/memory_sm80.h            |   2 +-
 include/cutlass/arch/mma.h                    |  14 +-
 include/cutlass/arch/mma_sm50.h               |   2 +-
 include/cutlass/arch/mma_sm60.h               |   2 +-
 include/cutlass/arch/mma_sm61.h               |   2 +-
 include/cutlass/arch/mma_sm70.h               |   2 +-
 include/cutlass/arch/mma_sm75.h               |   2 +-
 include/cutlass/arch/mma_sm80.h               |   2 +-
 include/cutlass/arch/mma_sm89.h               |   2 +-
 include/cutlass/arch/mma_sm90.h               |   2 +-
 include/cutlass/arch/mma_sparse_sm80.h        |   2 +-
 include/cutlass/arch/mma_sparse_sm89.h        |   2 +-
 include/cutlass/arch/reg_reconfig.h           |   2 +-
 include/cutlass/arch/simd.h                   |   2 +-
 include/cutlass/arch/simd_sm60.h              |   2 +-
 include/cutlass/arch/simd_sm61.h              |   2 +-
 include/cutlass/arch/synclog.hpp              |   2 +-
 include/cutlass/arch/wmma.h                   |   2 +-
 include/cutlass/arch/wmma_sm70.h              |   2 +-
 include/cutlass/arch/wmma_sm72.h              |   2 +-
 include/cutlass/arch/wmma_sm75.h              |   2 +-
 include/cutlass/array.h                       | 114 ++-
 include/cutlass/array_planar_complex.h        |   2 +-
 include/cutlass/array_subbyte.h               |   2 +-
 include/cutlass/barrier.h                     |   2 +-
 include/cutlass/bfloat16.h                    |   2 +-
 include/cutlass/blas3.h                       |   2 +-
 include/cutlass/blas3_types.h                 |   2 +-
 include/cutlass/block_striped.h               |   2 +-
 include/cutlass/cluster_launch.hpp            |  81 +-
 include/cutlass/complex.h                     |   2 +-
 include/cutlass/constants.h                   |   2 +-
 .../conv/collective/builders/sm90_common.inl  |   2 +-
 .../collective/builders/sm90_gmma_builder.inl |   2 +-
 .../conv/collective/collective_builder.hpp    |   2 +-
 .../conv/collective/collective_conv.hpp       |   2 +-
 include/cutlass/conv/collective/detail.hpp    |   2 +-
 ..._implicit_gemm_gmma_ss_warpspecialized.hpp |  18 +-
 include/cutlass/conv/conv2d_problem_size.h    |   2 +-
 include/cutlass/conv/conv3d_problem_size.h    |   2 +-
 include/cutlass/conv/convnd_problem_shape.hpp |   2 +-
 include/cutlass/conv/convolution.h            |   2 +-
 include/cutlass/conv/detail.hpp               |   2 +-
 .../conv/device/conv_universal_adapter.hpp    |   2 +-
 .../cutlass/conv/device/direct_convolution.h  |   2 +-
 .../conv/device/implicit_gemm_convolution.h   |   2 +-
 .../device/implicit_gemm_convolution_fusion.h |   2 +-
 include/cutlass/conv/dispatch_policy.hpp      |   2 +-
 .../cutlass/conv/kernel/conv_universal.hpp    |   3 +-
 include/cutlass/conv/kernel/default_conv2d.h  |   2 +-
 .../conv/kernel/default_conv2d_dgrad.h        |   2 +-
 .../conv/kernel/default_conv2d_fprop.h        |   2 +-
 .../conv/kernel/default_conv2d_fprop_fusion.h |   2 +-
 .../kernel/default_conv2d_fprop_with_absmax.h |   2 +-
 .../default_conv2d_fprop_with_broadcast.h     |   2 +-
 .../default_conv2d_fprop_with_reduction.h     |   2 +-
 .../conv/kernel/default_conv2d_group_fprop.h  |   2 +-
 .../conv/kernel/default_conv2d_wgrad.h        |   2 +-
 .../conv/kernel/default_conv2d_wgrad_fusion.h |   2 +-
 .../conv/kernel/default_conv3d_dgrad.h        |   2 +-
 .../conv/kernel/default_conv3d_fprop.h        |   2 +-
 .../conv/kernel/default_conv3d_fprop_fusion.h |   2 +-
 .../default_conv3d_fprop_with_broadcast.h     |   2 +-
 .../conv/kernel/default_conv3d_wgrad.h        |   2 +-
 .../cutlass/conv/kernel/default_deconv2d.h    |   2 +-
 .../kernel/default_deconv2d_with_broadcast.h  |   2 +-
 .../cutlass/conv/kernel/default_deconv3d.h    |   2 +-
 .../kernel/default_deconv3d_with_broadcast.h  |   2 +-
 .../conv/kernel/default_depthwise_fprop.h     |   2 +-
 .../cutlass/conv/kernel/direct_convolution.h  |   2 +-
 .../conv/kernel/implicit_gemm_convolution.h   |   2 +-
 .../kernel/implicit_gemm_convolution_fusion.h |   2 +-
 .../implicit_gemm_convolution_strided_dgrad.h |   2 +-
 .../implicit_gemm_convolution_with_absmax.h   |   2 +-
 ...cit_gemm_convolution_with_fused_epilogue.h |   2 +-
 ...sm90_implicit_gemm_tma_warpspecialized.hpp |   2 +-
 include/cutlass/conv/thread/depthwise_mma.h   |   2 +-
 ...rad_filter_tile_access_iterator_analytic.h |   2 +-
 ...ad_filter_tile_access_iterator_optimized.h |   2 +-
 ...t_gradient_tile_access_iterator_analytic.h |   2 +-
 ..._gradient_tile_access_iterator_optimized.h |   2 +-
 ...activation_tile_access_iterator_analytic.h |   2 +-
 ...vation_tile_access_iterator_few_channels.h |   2 +-
 ...tion_tile_access_iterator_fixed_channels.h |   2 +-
 ...ctivation_tile_access_iterator_optimized.h |   2 +-
 ...rop_filter_tile_access_iterator_analytic.h |   2 +-
 ...filter_tile_access_iterator_few_channels.h |   2 +-
 ...lter_tile_access_iterator_fixed_channels.h |   2 +-
 ...op_filter_tile_access_iterator_optimized.h |   2 +-
 .../cutlass/conv/threadblock/conv2d_params.h  |   2 +-
 .../conv/threadblock/conv2d_tile_iterator.h   |   2 +-
 ...activation_tile_access_iterator_analytic.h |   2 +-
 ...ctivation_tile_access_iterator_optimized.h |   2 +-
 ...t_gradient_tile_access_iterator_analytic.h |   2 +-
 ..._gradient_tile_access_iterator_optimized.h |   2 +-
 ...rad_filter_tile_access_iterator_analytic.h |   2 +-
 ...ad_filter_tile_access_iterator_optimized.h |   2 +-
 ...t_gradient_tile_access_iterator_analytic.h |   2 +-
 ..._gradient_tile_access_iterator_optimized.h |   2 +-
 ...activation_tile_access_iterator_analytic.h |   2 +-
 ...ctivation_tile_access_iterator_optimized.h |   2 +-
 ...rop_filter_tile_access_iterator_analytic.h |   2 +-
 ...op_filter_tile_access_iterator_optimized.h |   2 +-
 .../cutlass/conv/threadblock/conv3d_params.h  |   2 +-
 ...activation_tile_access_iterator_analytic.h |   2 +-
 ...ctivation_tile_access_iterator_optimized.h |   2 +-
 ...t_gradient_tile_access_iterator_analytic.h |   2 +-
 ..._gradient_tile_access_iterator_optimized.h |   2 +-
 .../depthwise_direct_conv_params.h            |   2 +-
 ...erator_direct_conv_fixed_stride_dilation.h |   2 +-
 ...le_access_iterator_direct_conv_optimized.h |   2 +-
 .../depthwise_fprop_direct_conv_multistage.h  |   2 +-
 ...le_access_iterator_direct_conv_optimized.h |   2 +-
 .../threadblock/depthwise_fprop_pipelined.h   |   2 +-
 .../conv/threadblock/depthwise_mma_base.h     |   2 +-
 ...depthwise_mma_core_with_lane_access_size.h |   2 +-
 .../implicit_gemm_fprop_fusion_multistage.h   |   2 +-
 .../threadblock/implicit_gemm_multistage.h    |   2 +-
 .../threadblock/implicit_gemm_pipelined.h     |   2 +-
 .../implicit_gemm_wgrad_fusion_multistage.h   |   2 +-
 ...icated_scale_bias_vector_access_iterator.h |   2 +-
 .../predicated_scale_bias_vector_iterator.h   |   2 +-
 .../conv/threadblock/threadblock_swizzle.h    |   2 +-
 .../cutlass/conv/warp/mma_depthwise_simt.h    |   2 +-
 .../warp/mma_depthwise_simt_tile_iterator.h   |   2 +-
 .../conv/warp/scale_bias_relu_transform.h     |   2 +-
 include/cutlass/coord.h                       |   2 +-
 include/cutlass/core_io.h                     |   2 +-
 include/cutlass/cuda_host_adapter.hpp         |   2 +-
 include/cutlass/cutlass.h                     |   2 +-
 include/cutlass/detail/collective.hpp         |   2 +-
 .../detail/collective/mixed_input_utils.hpp   |  13 +-
 include/cutlass/detail/dependent_false.hpp    |   2 +-
 include/cutlass/detail/helper_macros.hpp      |   4 +-
 include/cutlass/detail/layout.hpp             |   2 +-
 .../mainloop_fusion_helper_scale_factor.hpp   |   2 +-
 include/cutlass/detail/mma.hpp                |   2 +-
 include/cutlass/device_kernel.h               |   2 +-
 .../collective/builders/sm90_builder.inl      |   5 +-
 .../collective/builders/sm90_common.inl       |   2 +-
 .../collective/collective_builder.hpp         |   2 +-
 .../collective/collective_epilogue.hpp        |   2 +-
 .../epilogue/collective/default_epilogue.hpp  |   9 +-
 .../collective/default_epilogue_array.hpp     |   9 +-
 .../cutlass/epilogue/collective/detail.hpp    |   2 +-
 .../collective/epilogue_tensor_broadcast.hpp  |   2 +-
 .../collective/sm70_epilogue_vectorized.hpp   |   2 +-
 .../sm70_epilogue_vectorized_array.hpp        |   2 +-
 ...m90_epilogue_array_tma_warpspecialized.hpp |   5 +-
 .../sm90_epilogue_tma_warpspecialized.hpp     |  16 +-
 ...e_tma_warpspecialized_bias_elementwise.hpp |   2 +-
 include/cutlass/epilogue/dispatch_policy.hpp  |   2 +-
 include/cutlass/epilogue/fusion/callbacks.hpp |   2 +-
 .../cutlass/epilogue/fusion/operations.hpp    |  37 +-
 .../sm90_callbacks_tma_warpspecialized.hpp    | 160 ++--
 ...90_visitor_compute_tma_warpspecialized.hpp |  19 +-
 .../sm90_visitor_load_tma_warpspecialized.hpp | 278 ++----
 ...sm90_visitor_store_tma_warpspecialized.hpp |   2 +-
 .../sm90_visitor_tma_warpspecialized.hpp      |  81 +-
 .../fusion/sm90_visitor_topk_softmax.hpp      |   2 +-
 include/cutlass/epilogue/thread/activation.h  |  13 +-
 .../cutlass/epilogue/thread/conversion_op.h   |   2 +-
 include/cutlass/epilogue/thread/detail.hpp    |   2 +-
 .../epilogue/thread/linear_combination.h      |   2 +-
 .../linear_combination_bias_elementwise.h     | 478 +++++++++-
 .../thread/linear_combination_bias_relu.h     |   2 +-
 .../thread/linear_combination_clamp.h         |   2 +-
 .../thread/linear_combination_dgelu.h         |   2 +-
 .../thread/linear_combination_drelu.h         |   2 +-
 .../epilogue/thread/linear_combination_gelu.h |   2 +-
 .../thread/linear_combination_generic.h       |   2 +-
 .../linear_combination_generic_with_scaling.h |   2 +-
 .../thread/linear_combination_hardswish.h     |   2 +-
 .../thread/linear_combination_leaky_relu.h    |   2 +-
 .../thread/linear_combination_params.h        |   2 +-
 .../linear_combination_planar_complex.h       |   2 +-
 .../epilogue/thread/linear_combination_relu.h |   2 +-
 .../thread/linear_combination_relu0.h         |   2 +-
 .../linear_combination_residual_block.h       |   2 +-
 .../thread/linear_combination_sigmoid.h       |   2 +-
 .../epilogue/thread/linear_combination_silu.h |   2 +-
 .../linear_combination_tensor_broadcast.hpp   |   2 +-
 .../linear_combination_with_elementwise.h     |   2 +-
 .../cutlass/epilogue/thread/reduction_op.h    |   2 +-
 include/cutlass/epilogue/thread/scale_type.h  |   2 +-
 .../default_epilogue_complex_tensor_op.h      |   2 +-
 ...default_epilogue_complex_tensor_op_blas3.h |   2 +-
 .../default_epilogue_direct_store.h           |   2 +-
 .../default_epilogue_planar_complex.h         |   2 +-
 .../threadblock/default_epilogue_simt.h       |   2 +-
 .../threadblock/default_epilogue_tensor_op.h  |   2 +-
 .../default_epilogue_tensor_op_blas3.h        |   2 +-
 .../default_epilogue_volta_tensor_op.h        |   2 +-
 .../default_epilogue_with_absmax.h            |   2 +-
 .../default_epilogue_with_broadcast.h         |   2 +-
 .../default_epilogue_with_reduction.h         |   2 +-
 .../default_epilogue_wmma_tensor_op.h         |   2 +-
 .../threadblock/default_thread_map_simt.h     |   2 +-
 .../default_thread_map_tensor_op.h            |   2 +-
 .../default_thread_map_volta_tensor_op.h      |   2 +-
 .../default_thread_map_wmma_tensor_op.h       |   2 +-
 .../direct_store_epilogue_iterator.h          |   2 +-
 .../cutlass/epilogue/threadblock/epilogue.h   |   2 +-
 .../epilogue/threadblock/epilogue_base.h      |   2 +-
 .../threadblock/epilogue_base_streamk.h       |   2 +-
 .../epilogue/threadblock/epilogue_depthwise.h |   2 +-
 .../threadblock/epilogue_direct_store.h       |   2 +-
 .../threadblock/epilogue_gemm_k_reduction.h   |   2 +-
 .../threadblock/epilogue_planar_complex.h     |   2 +-
 .../threadblock/epilogue_smem_accumulator.h   |   2 +-
 .../epilogue_streamk_with_broadcast.h         |   2 +-
 .../epilogue_visitor_with_softmax.h           |   2 +-
 .../threadblock/epilogue_with_absmax.h        |   2 +-
 .../threadblock/epilogue_with_broadcast.h     |   2 +-
 .../threadblock/epilogue_with_reduction.h     |   2 +-
 .../threadblock/epilogue_with_visitor.h       |   2 +-
 .../epilogue_with_visitor_callbacks.h         |   2 +-
 .../epilogue/threadblock/epilogue_workspace.h |   2 +-
 .../threadblock/fusion/visitor_2x.hpp         |   2 +-
 .../threadblock/fusion/visitor_compute.hpp    |   2 +-
 .../threadblock/fusion/visitor_load.hpp       |   2 +-
 .../threadblock/fusion/visitor_store.hpp      |   2 +-
 .../epilogue/threadblock/fusion/visitors.hpp  |   2 +-
 .../threadblock/interleaved_epilogue.h        |   2 +-
 .../threadblock/output_iterator_parameter.h   |   2 +-
 .../threadblock/output_tile_thread_map.h      |   2 +-
 .../threadblock/predicated_tile_iterator.h    |   2 +-
 .../predicated_tile_iterator_affine.h         |   2 +-
 ...cated_tile_iterator_affine_layout_params.h |   2 +-
 .../predicated_tile_iterator_blas3.h          |   2 +-
 .../predicated_tile_iterator_conv.h           |   2 +-
 .../predicated_tile_iterator_direct_conv.h    |   2 +-
 .../predicated_tile_iterator_params.h         |   2 +-
 .../predicated_tile_iterator_predicates.h     |   2 +-
 .../predicated_tile_iterator_strided_dgrad.h  |   2 +-
 .../threadblock/shared_load_iterator.h        |   2 +-
 .../threadblock/shared_load_iterator_mixed.h  |   2 +-
 .../shared_load_iterator_pitch_linear.h       |   2 +-
 .../fragment_iterator_complex_tensor_op.h     |   2 +-
 ...ment_iterator_gaussian_complex_tensor_op.h |   2 +-
 .../epilogue/warp/fragment_iterator_simt.h    |   2 +-
 .../warp/fragment_iterator_tensor_op.h        |   2 +-
 .../warp/fragment_iterator_volta_tensor_op.h  |   2 +-
 .../warp/fragment_iterator_wmma_tensor_op.h   |   2 +-
 include/cutlass/epilogue/warp/simt_policy.h   |   2 +-
 .../cutlass/epilogue/warp/tensor_op_policy.h  |   2 +-
 .../epilogue/warp/tile_iterator_simt.h        |   2 +-
 .../epilogue/warp/tile_iterator_tensor_op.h   |   2 +-
 .../warp/tile_iterator_tensor_op_mixed.h      |   2 +-
 .../warp/tile_iterator_volta_tensor_op.h      |   2 +-
 .../warp/tile_iterator_wmma_tensor_op.h       |   2 +-
 .../epilogue/warp/volta_tensor_op_policy.h    |   2 +-
 .../epilogue/warp/wmma_tensor_op_policy.h     |   2 +-
 .../distributed/device/detail.hpp             | 163 ++++
 .../device/dist_gemm_universal_wrapper.hpp    | 717 +++++++++++++++
 .../distributed/device/full_barrier.hpp       |  74 ++
 .../distributed/kernel/detail.hpp             |  72 ++
 .../kernel/dist_gemm_kernel_wrapper.hpp       | 235 +++++
 .../distributed/kernel/full_barrier.hpp       |  82 ++
 .../schedules/dist_gemm_1d_schedules.hpp      | 324 +++++++
 .../schedules/dist_gemm_base_schedule.hpp     | 538 +++++++++++
 include/cutlass/fast_math.h                   |  14 +-
 include/cutlass/float8.h                      |   2 +-
 include/cutlass/floating_point_nvrtc.h        |   2 +-
 include/cutlass/functional.h                  |   2 +-
 .../gemm/collective/builders/sm90_common.inl  |   4 +-
 .../collective/builders/sm90_gmma_builder.inl |   3 +-
 .../builders/sm90_sparse_config.inl           |   2 +-
 .../builders/sm90_sparse_gmma_builder.inl     |  23 +-
 .../gemm/collective/collective_builder.hpp    |   2 +-
 .../collective/collective_builder_decl.hpp    |   3 +-
 .../gemm/collective/collective_mma.hpp        |   4 +-
 .../gemm/collective/collective_mma_decl.hpp   |   2 +-
 .../gemm/collective/fp8_accumulation.hpp      |   4 +-
 .../gemm/collective/sm70_mma_twostage.hpp     |   2 +-
 .../gemm/collective/sm80_mma_multistage.hpp   |   2 +-
 ...ma_gmma_rs_warpspecialized_mixed_input.hpp |   2 +-
 ..._mma_array_tma_gmma_ss_warpspecialized.hpp |   2 +-
 ...mma_multistage_gmma_rs_warpspecialized.hpp |   2 +-
 ...mma_multistage_gmma_ss_warpspecialized.hpp |   2 +-
 .../sm90_mma_tma_gmma_rs_warpspecialized.hpp  |   2 +-
 ...ma_gmma_rs_warpspecialized_mixed_input.hpp |   4 +-
 .../gemm/collective/sm90_mma_tma_gmma_ss.hpp  |   2 +-
 .../sm90_mma_tma_gmma_ss_warpspecialized.hpp  |   4 +-
 ...90_mma_tma_gmma_ss_warpspecialized_fp8.hpp |   5 +-
 ..._warpspecialized_fp8_blockwise_scaling.hpp |  24 +-
 ...sparse_mma_tma_gmma_ss_warpspecialized.hpp |  82 +-
 ...se_mma_tma_gmma_ss_warpspecialized_fp8.hpp | 775 ++++++++++++++++
 include/cutlass/gemm/device/base_grouped.h    |   2 +-
 .../gemm/device/default_gemm_configuration.h  |   2 +-
 include/cutlass/gemm/device/ell_gemm.h        |   2 +-
 include/cutlass/gemm/device/gemm.h            |   2 +-
 include/cutlass/gemm/device/gemm_array.h      |   2 +-
 include/cutlass/gemm/device/gemm_batched.h    |   2 +-
 include/cutlass/gemm/device/gemm_complex.h    |   2 +-
 include/cutlass/gemm/device/gemm_grouped.h    |   2 +-
 .../device/gemm_layernorm_mainloop_fusion.h   |   2 +-
 include/cutlass/gemm/device/gemm_sparse.h     |   2 +-
 .../gemm/device/gemm_sparse_universal.h       |   2 +-
 .../gemm_sparse_universal_with_absmax.h       |   2 +-
 .../gemm/device/gemm_sparse_with_absmax.h     |   2 +-
 .../gemm/device/gemm_sparse_with_visitor.h    |   2 +-
 .../gemm/device/gemm_splitk_parallel.h        |   2 +-
 include/cutlass/gemm/device/gemm_universal.h  |   2 +-
 .../gemm/device/gemm_universal_adapter.h      |   2 +-
 .../cutlass/gemm/device/gemm_universal_base.h |   2 +-
 .../gemm_universal_streamk_with_broadcast.h   |   2 +-
 .../gemm/device/gemm_universal_with_absmax.h  |   2 +-
 .../device/gemm_universal_with_broadcast.h    |   2 +-
 .../gemm/device/gemm_with_k_reduction.h       |   2 +-
 include/cutlass/gemm/device/gemv.h            |   2 +-
 include/cutlass/gemm/device/rank_2k.h         |   2 +-
 include/cutlass/gemm/device/rank_2k_grouped.h |   2 +-
 include/cutlass/gemm/device/rank_k.h          |   2 +-
 include/cutlass/gemm/device/symm.h            |   2 +-
 include/cutlass/gemm/device/trmm.h            |   2 +-
 include/cutlass/gemm/dispatch_policy.hpp      |  36 +-
 include/cutlass/gemm/gemm.h                   |   2 +-
 include/cutlass/gemm/gemm_enumerated_types.h  |   2 +-
 .../gemm/group_array_problem_shape.hpp        |   2 +-
 .../cutlass/gemm/kernel/default_ell_gemm.h    |   2 +-
 include/cutlass/gemm/kernel/default_gemm.h    |   2 +-
 .../gemm/kernel/default_gemm_complex.h        |   2 +-
 .../gemm/kernel/default_gemm_grouped.h        |   2 +-
 .../default_gemm_grouped_per_group_scale.h    |   2 +-
 ...ult_gemm_grouped_softmax_mainloop_fusion.h |   2 +-
 .../default_gemm_layernorm_mainloop_fusion.h  |   2 +-
 .../default_gemm_planar_complex_universal.h   |   2 +-
 .../cutlass/gemm/kernel/default_gemm_sparse.h |   2 +-
 .../kernel/default_gemm_sparse_universal.h    |   2 +-
 ...efault_gemm_sparse_universal_with_absmax.h |   2 +-
 .../kernel/default_gemm_sparse_with_absmax.h  |   2 +-
 .../kernel/default_gemm_sparse_with_visitor.h |   2 +-
 .../kernel/default_gemm_splitk_parallel.h     |   2 +-
 .../default_gemm_streamk_with_broadcast.h     |   2 +-
 .../gemm/kernel/default_gemm_universal.h      |   2 +-
 .../default_gemm_universal_with_visitor.h     |   2 +-
 .../gemm/kernel/default_gemm_with_absmax.h    |   2 +-
 .../gemm/kernel/default_gemm_with_broadcast.h |   2 +-
 .../kernel/default_gemm_with_k_reduction.h    |   2 +-
 .../gemm/kernel/default_gemm_with_reduction.h |   2 +-
 include/cutlass/gemm/kernel/default_gemv.h    |   2 +-
 include/cutlass/gemm/kernel/default_rank_2k.h |   2 +-
 .../gemm/kernel/default_rank_2k_complex.h     |   2 +-
 .../gemm/kernel/default_rank_2k_grouped.h     |   2 +-
 .../gemm/kernel/default_rank_2k_universal.h   |   2 +-
 include/cutlass/gemm/kernel/default_rank_k.h  |   2 +-
 .../gemm/kernel/default_rank_k_complex.h      |   2 +-
 .../gemm/kernel/default_rank_k_universal.h    |   2 +-
 include/cutlass/gemm/kernel/default_symm.h    |   2 +-
 .../gemm/kernel/default_symm_complex.h        |   2 +-
 .../gemm/kernel/default_symm_universal.h      |   2 +-
 include/cutlass/gemm/kernel/default_trmm.h    |   2 +-
 .../gemm/kernel/default_trmm_complex.h        |   2 +-
 .../gemm/kernel/default_trmm_universal.h      |   2 +-
 include/cutlass/gemm/kernel/ell_gemm.h        |   2 +-
 include/cutlass/gemm/kernel/gemm.h            |   2 +-
 include/cutlass/gemm/kernel/gemm_array.h      |   2 +-
 include/cutlass/gemm/kernel/gemm_batched.h    |   2 +-
 include/cutlass/gemm/kernel/gemm_grouped.h    |   2 +-
 .../kernel/gemm_grouped_per_group_scale.h     |   2 +-
 .../kernel/gemm_grouped_problem_visitor.h     |   2 +-
 .../gemm_grouped_softmax_mainloop_fusion.h    |   2 +-
 .../kernel/gemm_layernorm_mainloop_fusion.h   |   2 +-
 include/cutlass/gemm/kernel/gemm_params.h     |   2 +-
 include/cutlass/gemm/kernel/gemm_pipelined.h  |   2 +-
 .../cutlass/gemm/kernel/gemm_planar_complex.h |   2 +-
 .../gemm/kernel/gemm_planar_complex_array.h   |   2 +-
 .../gemm/kernel/gemm_sparse_universal.h       |   2 +-
 .../gemm_sparse_universal_with_absmax.h       |   2 +-
 .../gemm/kernel/gemm_splitk_parallel.h        |   2 +-
 .../kernel/gemm_streamk_with_fused_epilogue.h |   2 +-
 .../gemm/kernel/gemm_transpose_operands.h     |   2 +-
 include/cutlass/gemm/kernel/gemm_universal.h  |   2 +-
 .../cutlass/gemm/kernel/gemm_universal.hpp    |   2 +-
 .../cutlass/gemm/kernel/gemm_universal_decl.h |   2 +-
 .../gemm/kernel/gemm_universal_streamk.h      |   2 +-
 .../gemm/kernel/gemm_universal_with_visitor.h |   2 +-
 .../gemm_universal_with_visitor_streamk.h     |   2 +-
 .../cutlass/gemm/kernel/gemm_with_absmax.h    |   2 +-
 .../gemm/kernel/gemm_with_fused_epilogue.h    |   2 +-
 .../gemm/kernel/gemm_with_k_reduction.h       |   2 +-
 include/cutlass/gemm/kernel/gemv.h            |   2 +-
 .../gemm/kernel/gemv_batched_strided.h        |   2 +-
 .../gemm/kernel/grouped_problem_visitor.h     |   2 +-
 .../cutlass/gemm/kernel/params_sparse_base.h  |   2 +-
 .../gemm/kernel/params_universal_base.h       |   2 +-
 include/cutlass/gemm/kernel/rank_2k_grouped.h |   2 +-
 .../kernel/rank_2k_grouped_problem_visitor.h  |   2 +-
 .../gemm/kernel/rank_2k_transpose_operands.h  |   2 +-
 .../cutlass/gemm/kernel/rank_2k_universal.h   |   2 +-
 .../cutlass/gemm/kernel/rank_k_universal.h    |   2 +-
 include/cutlass/gemm/kernel/sm70_gemm.hpp     |   2 +-
 ..._array_tma_warpspecialized_cooperative.hpp |  16 +-
 ...emm_array_tma_warpspecialized_pingpong.hpp |  16 +-
 include/cutlass/gemm/kernel/sm90_gemm_tma.hpp |   2 +-
 .../kernel/sm90_gemm_tma_warpspecialized.hpp  |   2 +-
 ...0_gemm_tma_warpspecialized_cooperative.hpp |  16 +-
 ...sm90_gemm_tma_warpspecialized_pingpong.hpp |  17 +-
 .../gemm/kernel/sm90_gemm_warpspecialized.hpp |   2 +-
 .../sm90_gemm_warpspecialized_cooperative.hpp |  17 +-
 .../sm90_gemm_warpspecialized_pingpong.hpp    |  17 +-
 .../gemm/kernel/sm90_tile_scheduler.hpp       |   2 +-
 .../gemm/kernel/sm90_tile_scheduler_group.hpp |   2 +-
 .../kernel/sm90_tile_scheduler_stream_k.hpp   |  45 +-
 include/cutlass/gemm/kernel/sparse_gemm.h     |   2 +-
 .../gemm/kernel/sparse_gemm_with_absmax.h     |   2 +-
 .../gemm/kernel/sparse_gemm_with_visitor.h    |   2 +-
 .../gemm/kernel/static_tile_scheduler.hpp     |   2 +-
 include/cutlass/gemm/kernel/symm_universal.h  |   2 +-
 .../cutlass/gemm/kernel/tile_scheduler.hpp    |   2 +-
 .../gemm/kernel/tile_scheduler_params.h       |  39 +-
 include/cutlass/gemm/kernel/trmm_universal.h  |   2 +-
 include/cutlass/gemm/thread/mma.h             |   2 +-
 include/cutlass/gemm/thread/mma_sm50.h        |   2 +-
 include/cutlass/gemm/thread/mma_sm60.h        |   2 +-
 include/cutlass/gemm/thread/mma_sm61.h        |   2 +-
 .../gemm/threadblock/default_ell_mma.h        |   2 +-
 .../gemm/threadblock/default_gemv_core.h      |   2 +-
 .../cutlass/gemm/threadblock/default_mma.h    |   2 +-
 .../gemm/threadblock/default_mma_core.h       |   2 +-
 .../gemm/threadblock/default_mma_core_simt.h  |   2 +-
 .../gemm/threadblock/default_mma_core_sm70.h  |   2 +-
 .../gemm/threadblock/default_mma_core_sm75.h  |   2 +-
 .../gemm/threadblock/default_mma_core_sm80.h  |   2 +-
 .../default_mma_core_sparse_sm80.h            |   2 +-
 .../default_mma_core_with_access_size.h       |   2 +-
 .../default_mma_core_with_reduction.h         |   2 +-
 .../gemm/threadblock/default_mma_core_wmma.h  |   2 +-
 .../default_mma_layernorm_mainloop_fusion.h   |   2 +-
 .../default_mma_planar_complex_multistage.h   |   2 +-
 .../default_mma_planar_complex_pipelined.h    |   2 +-
 .../default_mma_softmax_mainloop_fusion.h     |   2 +-
 .../threadblock/default_mma_with_reduction.h  |   2 +-
 .../default_multistage_mma_complex.h          |   2 +-
 .../default_multistage_mma_complex_core.h     |   2 +-
 ...default_multistage_mma_complex_core_sm80.h |   2 +-
 .../default_multistage_trmm_complex.h         |   2 +-
 .../gemm/threadblock/default_sparse_mma.h     |   2 +-
 .../cutlass/gemm/threadblock/default_trmm.h   |   2 +-
 .../gemm/threadblock/ell_mma_multistage.h     |   2 +-
 .../gemm/threadblock/ell_mma_pipelined.h      |   2 +-
 include/cutlass/gemm/threadblock/gemv.h       |   2 +-
 .../cutlass/gemm/threadblock/index_remat.h    |   2 +-
 include/cutlass/gemm/threadblock/mma_base.h   |   2 +-
 .../gemm/threadblock/mma_blas3_multistage.h   |   2 +-
 ...mma_layernorm_mainloop_fusion_multistage.h |   2 +-
 .../cutlass/gemm/threadblock/mma_multistage.h |   2 +-
 .../cutlass/gemm/threadblock/mma_pipelined.h  |   2 +-
 .../threadblock/mma_planar_complex_base.h     |   2 +-
 .../mma_planar_complex_multistage.h           |   2 +-
 .../mma_planar_complex_pipelined.h            |   2 +-
 .../gemm/threadblock/mma_singlestage.h        |   2 +-
 .../mma_softmax_mainloop_fusion_multistage.h  |   2 +-
 .../gemm/threadblock/mma_sparse_base.h        |   2 +-
 .../gemm/threadblock/mma_sparse_multistage.h  |   2 +-
 .../mma_with_reduction_multistage.h           |   2 +-
 .../gemm/threadblock/threadblock_swizzle.h    |   2 +-
 .../threadblock/threadblock_swizzle_streamk.h |   2 +-
 .../gemm/warp/default_mma_complex_tensor_op.h |   2 +-
 .../gemm/warp/default_mma_sparse_tensor_op.h  |   2 +-
 .../cutlass/gemm/warp/default_mma_tensor_op.h |   2 +-
 .../gemm/warp/default_mma_tensor_op_sm80.h    |   2 +-
 .../default_mma_with_reduction_tensor_op.h    |   2 +-
 .../gemm/warp/default_mma_wmma_tensor_op.h    |   2 +-
 .../warp/layernorm_scale_bias_transform.h     |   2 +-
 include/cutlass/gemm/warp/mma.h               |   2 +-
 .../cutlass/gemm/warp/mma_complex_tensor_op.h |   2 +-
 .../warp/mma_complex_tensor_op_fast_f32.h     |   2 +-
 ...mma_complex_tensor_op_tile_iterator_sm80.h |   2 +-
 .../warp/mma_gaussian_complex_tensor_op.h     |   2 +-
 ...ian_complex_tensor_op_tile_iterator_sm80.h |   2 +-
 .../gemm/warp/mma_mixed_input_tensor_op.h     |   2 +-
 .../cutlass/gemm/warp/mma_planar_complex.h    |   2 +-
 include/cutlass/gemm/warp/mma_simt.h          |   2 +-
 include/cutlass/gemm/warp/mma_simt_policy.h   |   2 +-
 .../gemm/warp/mma_simt_tile_iterator.h        |   2 +-
 .../cutlass/gemm/warp/mma_sparse_tensor_op.h  |   2 +-
 include/cutlass/gemm/warp/mma_tensor_op.h     |   2 +-
 .../gemm/warp/mma_tensor_op_fast_f32.h        |   2 +-
 .../warp/mma_tensor_op_fragment_iterator.h    |   2 +-
 .../cutlass/gemm/warp/mma_tensor_op_policy.h  |   2 +-
 .../cutlass/gemm/warp/mma_tensor_op_sm70.h    |   2 +-
 .../warp/mma_tensor_op_tile_access_iterator.h |   2 +-
 .../gemm/warp/mma_tensor_op_tile_iterator.h   |   2 +-
 .../warp/mma_tensor_op_tile_iterator_sm70.h   |   2 +-
 .../warp/mma_tensor_op_tile_iterator_sm80.h   |   2 +-
 .../warp/mma_tensor_op_tile_iterator_sparse.h |   2 +-
 .../warp/mma_tensor_op_tile_iterator_wmma.h   |   2 +-
 .../cutlass/gemm/warp/mma_tensor_op_wmma.h    |   2 +-
 .../gemm/warp/mma_with_reduction_tensor_op.h  |   2 +-
 .../gemm/warp/scale_bias_tile_iterator.h      |   2 +-
 .../gemm/warp/softmax_scale_bias_transform.h  |   2 +-
 .../gemm/warp/tile_iterator_planar_complex.h  |   2 +-
 include/cutlass/gemm_coord.h                  |   2 +-
 include/cutlass/gemm_coord.hpp                |   2 +-
 include/cutlass/half.h                        |   2 +-
 include/cutlass/integer_subbyte.h             |   2 +-
 include/cutlass/kernel_hardware_info.h        |  62 +-
 include/cutlass/kernel_hardware_info.hpp      |   2 +-
 include/cutlass/kernel_launch.h               |   2 +-
 include/cutlass/layout/layout.h               |   2 +-
 include/cutlass/layout/matrix.h               |   2 +-
 include/cutlass/layout/permute.h              |   2 +-
 include/cutlass/layout/pitch_linear.h         |   2 +-
 include/cutlass/layout/tensor.h               |   2 +-
 .../layout/tensor_op_multiplicand_sm70.h      |   2 +-
 .../layout/tensor_op_multiplicand_sm75.h      |   2 +-
 .../layout/tensor_op_multiplicand_sm80.h      |   2 +-
 include/cutlass/layout/vector.h               |   2 +-
 include/cutlass/matrix.h                      |   2 +-
 include/cutlass/matrix_coord.h                |   2 +-
 include/cutlass/matrix_shape.h                |   2 +-
 include/cutlass/numeric_conversion.h          |   2 +-
 include/cutlass/numeric_size.h                |   2 +-
 include/cutlass/numeric_types.h               |   2 +-
 include/cutlass/pipeline/pipeline.hpp         |   2 +-
 include/cutlass/pipeline/sm90_pipeline.hpp    |   2 +-
 include/cutlass/pitch_linear_coord.h          |   2 +-
 include/cutlass/platform/platform.h           |   2 +-
 include/cutlass/predicate_vector.h            |   2 +-
 include/cutlass/quaternion.h                  |   2 +-
 include/cutlass/real.h                        |   2 +-
 .../cutlass/reduction/device/reduce_split_k.h |   2 +-
 .../cutlass/reduction/device/tensor_reduce.h  |   2 +-
 .../device/tensor_reduce_affine_contiguous.h  |   2 +-
 .../device/tensor_reduce_affine_strided.h     |   2 +-
 .../reduction/kernel/reduce_softmax_final.h   |   2 +-
 .../cutlass/reduction/kernel/reduce_split_k.h |   2 +-
 .../kernel/tensor_reduce_affine_contiguous.h  |   2 +-
 .../kernel/tensor_reduce_affine_strided.h     |   2 +-
 include/cutlass/reduction/thread/reduce.h     |   2 +-
 .../reduction/thread/reduction_operators.h    |   2 +-
 .../cutlass/reduction/threadblock_swizzle.h   |   2 +-
 include/cutlass/relatively_equal.h            |   2 +-
 include/cutlass/semaphore.h                   |   2 +-
 include/cutlass/subbyte_reference.h           |   2 +-
 include/cutlass/tensor_coord.h                |   2 +-
 include/cutlass/tensor_ref.h                  |   2 +-
 include/cutlass/tensor_ref_planar_complex.h   |   2 +-
 include/cutlass/tensor_view.h                 |   2 +-
 include/cutlass/tensor_view_planar_complex.h  |   2 +-
 include/cutlass/tfloat32.h                    |   2 +-
 include/cutlass/thread/matrix.h               |   2 +-
 include/cutlass/trace.h                       |   2 +-
 .../collective/sm90_wgmma_transpose.hpp       |   2 +-
 .../device/transform_universal_adapter.hpp    |   2 +-
 .../kernel/filter_format_transformer.hpp      |   2 +-
 .../kernel/sm90_sparse_gemm_compressor.hpp    |   2 +-
 .../kernel/sparse_gemm_compressor.hpp         |   2 +-
 .../transform/pitch_linear_thread_map.h       |   2 +-
 include/cutlass/transform/thread/transpose.h  |   2 +-
 include/cutlass/transform/thread/unary_op.h   |   2 +-
 .../transform/threadblock/ell_iterator.h      |   2 +-
 .../ell_predicated_tile_access_iterator.h     |   2 +-
 .../ell_predicated_tile_iterator.h            |   2 +-
 ...icated_scale_bias_vector_access_iterator.h |   2 +-
 .../predicated_scale_bias_vector_iterator.h   |   2 +-
 .../predicated_tile_access_iterator.h         |   2 +-
 ...icated_tile_access_iterator_2dthreadtile.h |   2 +-
 .../predicated_tile_access_iterator_params.h  |   2 +-
 ...d_tile_access_iterator_triangular_matrix.h |   2 +-
 .../threadblock/predicated_tile_iterator.h    |   2 +-
 .../predicated_tile_iterator_2dthreadtile.h   |   2 +-
 ...edicated_tile_iterator_triangular_matrix.h |   2 +-
 .../predicated_vector_access_iterator.h       |   2 +-
 ...egular_scale_bias_vector_access_iterator.h |   2 +-
 .../regular_tile_access_iterator.h            |   2 +-
 ...egular_tile_access_iterator_pitch_linear.h |   2 +-
 ...access_iterator_pitch_linear_direct_conv.h |   2 +-
 .../regular_tile_access_iterator_tensor_op.h  |   2 +-
 ...ular_tile_access_iterator_tensor_op_sm80.h |   2 +-
 .../threadblock/regular_tile_iterator.h       |   2 +-
 .../regular_tile_iterator_pitch_linear.h      |   2 +-
 ..._tile_iterator_pitch_linear_2dthreadtile.h |   2 +-
 .../regular_tile_iterator_tensor_op.h         |   2 +-
 .../regular_tile_iterator_tensor_op_sm70.h    |   2 +-
 .../transform/threadblock/vector_iterator.h   |   2 +-
 .../transform/warp/vector_fragment_iterator.h |   2 +-
 include/cutlass/uint128.h                     |   2 +-
 include/cutlass/version.h                     |   4 +-
 include/cutlass/wmma_array.h                  |   2 +-
 include/cutlass/workspace.h                   |   2 +-
 media/docs/code_organization.md               |   2 +-
 .../cutlass_3x_backwards_compatibility.md     |   2 +-
 media/docs/doxygen_mainpage.md                |   2 +-
 media/docs/efficient_gemm.md                  |   2 +-
 media/docs/functionality.md                   |   2 +-
 media/docs/fundamental_types.md               |   2 +-
 media/docs/gemm_api.md                        |   2 +-
 media/docs/gemm_api_3x.md                     |   3 +-
 media/docs/implicit_gemm_convolution.md       |   2 +-
 media/docs/layout.md                          |   2 +-
 media/docs/pipeline.md                        |   2 +-
 media/docs/profiler.md                        |  14 +-
 media/docs/programming_guidelines.md          |   2 +-
 media/docs/quickstart.md                      |   2 +-
 media/docs/terminology.md                     |   2 +-
 media/docs/tile_iterator_concept.md           |   2 +-
 media/docs/utilities.md                       |   2 +-
 pyproject.toml                                |   2 +-
 python/LICENSE.txt                            |   2 +-
 python/README.md                              |   2 +-
 python/cutlass/__init__.py                    |   4 +-
 python/cutlass/backend/__init__.py            |   2 +-
 python/cutlass/backend/arguments.py           |   2 +-
 python/cutlass/backend/c_types.py             |   2 +-
 python/cutlass/backend/compiler.py            |   2 +-
 python/cutlass/backend/conv2d_operation.py    |   2 +-
 python/cutlass/backend/epilogue.py            |   2 +-
 python/cutlass/backend/evt/__init__.py        |   2 +-
 .../cutlass/backend/evt/backend/__init__.py   |   2 +-
 .../backend/evt/backend/emitter_base.py       |   2 +-
 .../backend/evt/backend/sm80_emitter.py       |   2 +-
 .../cutlass/backend/evt/backend/sm80_nodes.py |   2 +-
 .../backend/evt/backend/sm90_emitter.py       |   2 +-
 .../cutlass/backend/evt/backend/sm90_nodes.py |   2 +-
 python/cutlass/backend/evt/epilogue.py        |   2 +-
 .../cutlass/backend/evt/frontend/__init__.py  |   2 +-
 .../backend/evt/frontend/frontend_base.py     |   2 +-
 .../backend/evt/frontend/python_ast.py        |   2 +-
 python/cutlass/backend/evt/ir/__init__.py     |   2 +-
 .../cutlass/backend/evt/ir/compute_nodes.py   |   2 +-
 python/cutlass/backend/evt/ir/dag_ir.py       |   2 +-
 .../backend/evt/ir/layout_algorithm.py        |   2 +-
 python/cutlass/backend/evt/ir/layout_nodes.py |   2 +-
 python/cutlass/backend/evt/ir/load_nodes.py   |   2 +-
 python/cutlass/backend/evt/ir/node.py         |   2 +-
 python/cutlass/backend/evt/ir/store_nodes.py  |   2 +-
 python/cutlass/backend/evt/ir/tensor.py       |   2 +-
 python/cutlass/backend/evt/passes/__init__.py |   2 +-
 .../backend/evt/passes/graph_drawer.py        |   2 +-
 .../backend/evt/passes/pass_argument_type.py  |   2 +-
 .../backend/evt/passes/pass_dag_2_tree.py     |   2 +-
 .../backend/evt/passes/pass_fix_element_d.py  |   2 +-
 .../backend/evt/passes/pass_get_impl.py       |   2 +-
 .../evt/passes/pass_layout_elimination.py     |   2 +-
 .../backend/evt/passes/pass_manager.py        |   2 +-
 .../evt/passes/pass_no_op_elimination.py      |   2 +-
 .../backend/evt/passes/pass_preprocess_red.py |   2 +-
 .../evt/passes/pass_shape_type_propagation.py |   2 +-
 .../evt/passes/smem_size_calculator.py        |   2 +-
 python/cutlass/backend/evt/passes/util.py     |   2 +-
 python/cutlass/backend/frontend.py            |   2 +-
 python/cutlass/backend/gemm_operation.py      |   2 +-
 python/cutlass/backend/library.py             |   2 +-
 python/cutlass/backend/memory_manager.py      |   2 +-
 python/cutlass/backend/operation.py           |   2 +-
 python/cutlass/backend/reduction_operation.py |   2 +-
 python/cutlass/backend/type_hint.py           |   2 +-
 python/cutlass/backend/utils/__init__.py      |   2 +-
 python/cutlass/backend/utils/device.py        |   2 +-
 python/cutlass/emit/__init__.py               |   2 +-
 python/cutlass/emit/common.py                 |   2 +-
 python/cutlass/emit/pytorch.py                |   2 +-
 python/cutlass/epilogue/__init__.py           |   2 +-
 python/cutlass/epilogue/epilogue.py           |   2 +-
 python/cutlass/epilogue/evt_ops.py            |   2 +-
 python/cutlass/library_defaults.py            |   2 +-
 python/cutlass/op/__init__.py                 |   2 +-
 python/cutlass/op/conv.py                     |   2 +-
 python/cutlass/op/gemm.py                     |   2 +-
 python/cutlass/op/gemm_grouped.py             |   2 +-
 python/cutlass/op/op.py                       |   2 +-
 python/cutlass/shape.py                       |   2 +-
 python/cutlass/swizzle.py                     |   2 +-
 python/cutlass/utils/__init__.py              |   2 +-
 python/cutlass/utils/check.py                 |   2 +-
 python/cutlass/utils/datatypes.py             |   2 +-
 python/cutlass/utils/profiler.py              |   2 +-
 python/cutlass_library/__init__.py            |   2 +-
 python/cutlass_library/conv2d_operation.py    |   2 +-
 python/cutlass_library/conv3d_operation.py    |   2 +-
 python/cutlass_library/conv3x_emitter.py      |   2 +-
 python/cutlass_library/gemm_operation.py      |   2 +-
 python/cutlass_library/generator.py           |  26 +-
 python/cutlass_library/library.py             |   2 +-
 python/cutlass_library/manifest.py            |  26 +-
 python/cutlass_library/rank_2k_operation.py   |   2 +-
 python/cutlass_library/rank_k_operation.py    |   2 +-
 python/cutlass_library/sm90_shapes.py         |   2 +-
 python/cutlass_library/sm90_utils.py          |  99 +-
 python/cutlass_library/symm_operation.py      |   2 +-
 python/cutlass_library/trmm_operation.py      |   2 +-
 python/docs_src/source/conf.py                |   2 +-
 python/pycute/__init__.py                     |   2 +-
 python/pycute/int_tuple.py                    |   2 +-
 python/pycute/layout.py                       |   2 +-
 python/pycute/swizzle.py                      |   2 +-
 python/pycute/typing.py                       |   2 +-
 python/setup_cutlass.py                       |   2 +-
 python/setup_library.py                       |   4 +-
 python/setup_pycute.py                        |   4 +-
 test/CMakeLists.txt                           |   2 +-
 .../cutlass/conv2d/conv2d_problem_sizes.py    |   2 +-
 test/python/cutlass/conv2d/conv2d_sm80.py     |   2 +-
 .../cutlass/conv2d/conv2d_test_utils.py       |   2 +-
 test/python/cutlass/conv2d/run_all_tests.py   |   2 +-
 test/python/cutlass/emit/pytorch.py           |   2 +-
 .../python/cutlass/evt/evt_compute_sm80_90.py |   2 +-
 test/python/cutlass/evt/evt_layout_sm80_90.py |   2 +-
 test/python/cutlass/evt/evt_load_sm80_90.py   |   2 +-
 test/python/cutlass/evt/evt_mixed_sm80_90.py  |   2 +-
 test/python/cutlass/evt/evt_store_sm80_90.py  |   2 +-
 test/python/cutlass/evt/run_all_tests.py      |   2 +-
 test/python/cutlass/evt/utils/evt_testbed.py  |   2 +-
 test/python/cutlass/gemm/gemm_batched.py      |   2 +-
 test/python/cutlass/gemm/gemm_f16_sm80.py     |   2 +-
 test/python/cutlass/gemm/gemm_f16_sm90.py     |   2 +-
 test/python/cutlass/gemm/gemm_f32_sm80.py     |   2 +-
 test/python/cutlass/gemm/gemm_f64_sm80.py     |   2 +-
 test/python/cutlass/gemm/gemm_f64_sm90.py     |   2 +-
 test/python/cutlass/gemm/gemm_f8_sm90.py      |   2 +-
 test/python/cutlass/gemm/gemm_mixed_sm80.py   |   2 +-
 test/python/cutlass/gemm/gemm_s8_sm80.py      |   2 +-
 test/python/cutlass/gemm/gemm_s8_sm90.py      |   2 +-
 test/python/cutlass/gemm/gemm_testbed.py      |   2 +-
 test/python/cutlass/gemm/run_all_tests.py     |   2 +-
 test/python/cutlass/gemm/utils.py             |   2 +-
 test/python/cutlass/installation.py           |   2 +-
 .../cutlass/interface/conv2d_interface.py     |   2 +-
 .../python/cutlass/interface/evt_interface.py |   2 +-
 .../cutlass/interface/gemm_interface.py       |   2 +-
 test/python/cutlass/interface/utils.py        |   2 +-
 test/python/pycute/run_all_tests.py           |   2 +-
 test/python/pycute/test_coalesce.py           |   2 +-
 test/python/pycute/test_complement.py         |   2 +-
 test/python/pycute/test_composition.py        |   2 +-
 test/python/pycute/test_int_tuple.py          |   2 +-
 test/python/pycute/test_left_inverse.py       |   2 +-
 test/python/pycute/test_right_inverse.py      |   2 +-
 test/python/pycute/test_typing.py             |   2 +-
 test/self_contained_includes/CMakeLists.txt   |   2 +-
 test/unit/CMakeLists.txt                      |   2 +-
 test/unit/cluster_launch/CMakeLists.txt       |   2 +-
 test/unit/cluster_launch/cluster_launch.cu    |   2 +-
 test/unit/common/cutlass_unit_test.h          |   2 +-
 test/unit/common/filter_architecture.cpp      |   2 +-
 test/unit/conv/CMakeLists.txt                 |   2 +-
 test/unit/conv/cache_testbed_output.h         |   2 +-
 test/unit/conv/device/CMakeLists.txt          |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   2 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   2 +-
 ...m_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu |   2 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   2 +-
 ...f8nhwc_f8nhwc_f8nhwc_tensor_op_f32_sm89.cu |   2 +-
 ...f32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu |   2 +-
 ...wx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu |   2 +-
 ...wx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu |   2 +-
 ...4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu |   2 +-
 ...4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu |   2 +-
 ...wx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu |   2 +-
 ...wx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu |   2 +-
 ...8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu |   2 +-
 ...8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu |   2 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 .../conv2d_fprop_with_broadcast_simt_sm80.cu  |   2 +-
 .../conv2d_fprop_with_broadcast_sm70.cu       |   2 +-
 .../conv2d_fprop_with_broadcast_sm75.cu       |   2 +-
 .../conv2d_fprop_with_reduction_sm75.cu       |   2 +-
 test/unit/conv/device/conv2d_problems.h       |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...ded_dgrad_implicit_gemm_swizzling4_sm80.cu |   2 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 test/unit/conv/device/conv2d_testbed.h        |   2 +-
 .../conv/device/conv2d_testbed_interleaved.h  |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu |   2 +-
 ...f32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu |   2 +-
 ...nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   2 +-
 ...hwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu |   2 +-
 .../conv/device/conv2d_with_absmax_testbed.h  |   2 +-
 .../device/conv2d_with_broadcast_testbed.h    |   2 +-
 .../device/conv2d_with_reduction_testbed.h    |   2 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 ...32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu |   2 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu |   2 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 ...32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu |   2 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 .../conv3d_fprop_with_broadcast_simt_sm80.cu  |   2 +-
 test/unit/conv/device/conv3d_problems.h       |   2 +-
 test/unit/conv/device/conv3d_testbed.h        |   2 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu |   2 +-
 ...wc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 ...32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu |   2 +-
 ...c_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu |   2 +-
 .../device/conv3d_with_broadcast_testbed.h    |   2 +-
 ...m_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu |   2 +-
 .../deconv2d_with_broadcast_simt_sm80.cu      |   2 +-
 ...32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu |   2 +-
 .../deconv3d_with_broadcast_simt_sm80.cu      |   2 +-
 .../depthwise_conv2d_direct_conv_testbed.h    |   2 +-
 ...v_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu |   2 +-
 ...n_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu |   2 +-
 ...m_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu |   2 +-
 ...nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu |   2 +-
 test/unit/conv/device_3x/CMakeLists.txt       |   2 +-
 .../conv/device_3x/conv_problem_sizes.hpp     |  74 +-
 test/unit/conv/device_3x/dgrad/CMakeLists.txt |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 test/unit/conv/device_3x/fprop/CMakeLists.txt |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ...op_implicit_gemm_s8_s8_s32_tensorop_s32.cu |   2 +-
 ...mplicit_gemm_tf32_tf32_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ...op_implicit_gemm_s8_s8_s32_tensorop_s32.cu |   2 +-
 ...mplicit_gemm_tf32_tf32_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ...op_implicit_gemm_s8_s8_s32_tensorop_s32.cu |   2 +-
 ...mplicit_gemm_tf32_tf32_f32_tensorop_f32.cu |   2 +-
 test/unit/conv/device_3x/testbed_conv.hpp     |  85 +-
 test/unit/conv/device_3x/wgrad/CMakeLists.txt |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f16.cu |   2 +-
 ..._implicit_gemm_f16_f16_f32_tensorop_f32.cu |   2 +-
 test/unit/core/CMakeLists.txt                 |   2 +-
 test/unit/core/array.cu                       |   2 +-
 test/unit/core/bfloat16.cu                    |   2 +-
 test/unit/core/complex.cu                     |   2 +-
 test/unit/core/fast_numeric_conversion.cu     |   2 +-
 test/unit/core/float8.cu                      |   2 +-
 test/unit/core/functional.cu                  |   2 +-
 test/unit/core/half.cu                        |   2 +-
 test/unit/core/matrix.cu                      |   2 +-
 test/unit/core/matrix_coord.cu                |   2 +-
 test/unit/core/numeric_conversion.cu          |   2 +-
 test/unit/core/numeric_conversion_subbyte.cu  |   2 +-
 test/unit/core/predicate_vector.cu            |   2 +-
 test/unit/core/quaternion.cu                  |   2 +-
 test/unit/core/tensor_ref.cu                  |   2 +-
 test/unit/core/tensor_view.cu                 |   2 +-
 test/unit/core/test_unit_core.cpp             |   2 +-
 test/unit/core/tfloat32.cu                    |   2 +-
 test/unit/core/uint128.cu                     |   2 +-
 test/unit/cute/CMakeLists.txt                 |   2 +-
 test/unit/cute/ampere/CMakeLists.txt          |   2 +-
 test/unit/cute/ampere/cooperative_copy.cu     |   2 +-
 test/unit/cute/ampere/cooperative_gemm.cu     |   2 +-
 test/unit/cute/ampere/cp_sync.cu              |   2 +-
 test/unit/cute/ampere/ldsm.cu                 |   2 +-
 test/unit/cute/ampere/tiled_cp_async.cu       |   2 +-
 .../cute/ampere/tiled_cp_async_testbed.hpp    |   2 +-
 test/unit/cute/cooperative_gemm_common.hpp    |   2 +-
 test/unit/cute/core/CMakeLists.txt            |   2 +-
 test/unit/cute/core/array_subbyte.cpp         |   2 +-
 test/unit/cute/core/bitfield.cpp              |   2 +-
 test/unit/cute/core/coalesce.cpp              |   2 +-
 test/unit/cute/core/compact_xmajor.cpp        |   2 +-
 test/unit/cute/core/compare.cpp               |   2 +-
 test/unit/cute/core/complement.cpp            |   2 +-
 test/unit/cute/core/composition.cpp           |   2 +-
 test/unit/cute/core/constants.cpp             |   2 +-
 test/unit/cute/core/core_unit.cpp             |   2 +-
 test/unit/cute/core/domain_distribute.cpp     |   2 +-
 test/unit/cute/core/int_tuple.cpp             |   2 +-
 test/unit/cute/core/inverse_left.cpp          |   2 +-
 test/unit/cute/core/inverse_right.cpp         |   2 +-
 test/unit/cute/core/logical_divide.cpp        |   2 +-
 test/unit/cute/core/logical_product.cpp       |   2 +-
 test/unit/cute/core/math.cpp                  |   2 +-
 test/unit/cute/core/mixedbits.cpp             |   2 +-
 test/unit/cute/core/nullspace.cpp             |   2 +-
 test/unit/cute/core/packed_tuple.cpp          |   2 +-
 test/unit/cute/core/pointer.cpp               |   2 +-
 test/unit/cute/core/reverse.cpp               |   2 +-
 test/unit/cute/core/swizzle_layout.cpp        |   2 +-
 test/unit/cute/core/transform.cpp             |   2 +-
 test/unit/cute/core/tuple.cpp                 |   2 +-
 test/unit/cute/core/tuple_find.cpp            |   2 +-
 test/unit/cute/hopper/CMakeLists.txt          |   2 +-
 test/unit/cute/hopper/bulk_load.cu            |   2 +-
 test/unit/cute/hopper/bulk_store.cu           |   2 +-
 test/unit/cute/hopper/cooperative_gemm.cu     |   2 +-
 test/unit/cute/hopper/stsm.cu                 |   2 +-
 test/unit/cute/hopper/tma_load.cu             |   2 +-
 test/unit/cute/hopper/tma_load_testbed.hpp    |   2 +-
 test/unit/cute/hopper/tma_mcast_load.cu       |   2 +-
 .../cute/hopper/tma_mcast_load_testbed.hpp    |   2 +-
 test/unit/cute/hopper/tma_store.cu            |   2 +-
 test/unit/cute/hopper/tma_store_testbed.hpp   |   2 +-
 test/unit/cute/layout/CMakeLists.txt          |   2 +-
 test/unit/cute/layout/layout_operator.cu      |   2 +-
 .../unit/cute/msvc_compilation/CMakeLists.txt |   2 +-
 test/unit/cute/msvc_compilation/tuple.cpp     |   2 +-
 test/unit/cute/turing/CMakeLists.txt          |   2 +-
 test/unit/cute/turing/cooperative_gemm.cu     |   2 +-
 test/unit/cute/volta/CMakeLists.txt           |   2 +-
 test/unit/cute/volta/cooperative_gemm.cu      |   2 +-
 test/unit/cute/volta/vectorization_auto.cu    |   2 +-
 test/unit/epilogue/CMakeLists.txt             |   2 +-
 test/unit/epilogue/thread/CMakeLists.txt      |   2 +-
 test/unit/epilogue/thread/activation.cu       |   2 +-
 .../epilogue/thread/linear_combination.cu     |   2 +-
 .../linear_combination_planar_complex.cu      |   2 +-
 test/unit/epilogue/threadblock/CMakeLists.txt |   2 +-
 .../threadblock/epilogue_planar_complex.cu    |   2 +-
 .../epilogue/threadblock/epilogue_simt.cu     |   2 +-
 .../threadblock/epilogue_simt_sm60.cu         |   2 +-
 .../threadblock/epilogue_simt_sm61.cu         |   2 +-
 .../threadblock/epilogue_tensor_op.cu         |   2 +-
 .../threadblock/epilogue_volta_tensor_op.cu   |   2 +-
 .../epilogue_with_reduction_tensor_op.cu      |   2 +-
 .../epilogue_with_reduction_testbed.h         |   2 +-
 .../epilogue_wmma_tensor_op_sm70.cu           |   2 +-
 .../threadblock/output_tile_threadmap.cu      |   2 +-
 .../threadblock/predicated_tile_iterator.cu   |   2 +-
 test/unit/epilogue/threadblock/testbed.h      |   2 +-
 .../threadblock/testbed_planar_complex.h      |   2 +-
 test/unit/epilogue/warp/CMakeLists.txt        |   2 +-
 .../warp/fragment_iterator_tensor_op.cu       |   2 +-
 .../warp/fragment_iterator_volta_tensor_op.cu |   2 +-
 .../warp/fragment_iterator_wmma_tensor_op.cu  |   2 +-
 test/unit/gemm/CMakeLists.txt                 |   2 +-
 test/unit/gemm/device/CMakeLists.txt          |   2 +-
 .../device/default_gemm_configuration.hpp     |  18 +-
 .../gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu   |   2 +-
 ...mm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu |   2 +-
 .../gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu   |   2 +-
 ...mm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu |   2 +-
 ...emm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...mm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu |   2 +-
 ...32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu |   2 +-
 ...32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu |   2 +-
 ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu |   2 +-
 ...cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu |   2 +-
 ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu |   2 +-
 ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu |   2 +-
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |   2 +-
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu |   2 +-
 ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu |   2 +-
 ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu |   2 +-
 ...6n_f16n_direct_store_tensor_op_f32_sm80.cu |   2 +-
 ..._f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu |   2 +-
 ..._f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu |   2 +-
 ..._f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu |   2 +-
 ..._f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu |   2 +-
 ...6n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu |   2 +-
 ...6n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu |   2 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu |   2 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu |   2 +-
 ...16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu |   2 +-
 .../gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu |   2 +-
 ...f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu |   2 +-
 ..._f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ...16n_singlestage_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu |   2 +-
 ...16t_singlestage_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16n_f16t_tensor_op_f16_broadcast_sm80.cu |   2 +-
 ...6t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu |   2 +-
 ...6t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu |   2 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu |   2 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu |   2 +-
 ...16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu |   2 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu |   2 +-
 ...f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu |   2 +-
 ...32t_singlestage_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu |   2 +-
 ..._f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu |   2 +-
 ..._f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu |   2 +-
 ..._f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu |   2 +-
 .../gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu |   2 +-
 .../gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu |   2 +-
 ..._f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu |   2 +-
 ..._f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu |   2 +-
 .../gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 ...32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu |   2 +-
 .../gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu |   2 +-
 .../gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu |   2 +-
 .../gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu |   2 +-
 .../gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu |   2 +-
 .../gemm_f8t_f8n_f32t_tensor_op_f32_sm89.cu   |   2 +-
 ..._f8t_f8n_f32t_tensor_op_f32_sparse_sm89.cu |   2 +-
 .../gemm_f8t_f8n_f8t_tensor_op_f32_sm89.cu    |   2 +-
 ...m_f8t_f8n_f8t_tensor_op_f32_sparse_sm89.cu |   2 +-
 .../device/gemm_grouped_scheduler_sm80.cu     |   2 +-
 test/unit/gemm/device/gemm_grouped_sm80.cu    |   2 +-
 ...anar_complex_f16_f16_f32_tensor_op_sm70.cu |   2 +-
 ...anar_complex_f16_f16_f32_tensor_op_sm75.cu |   2 +-
 ...anar_complex_f16_f16_f32_tensor_op_sm80.cu |   2 +-
 .../gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu    |   2 +-
 .../gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu   |   2 +-
 ...mm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu |   2 +-
 .../gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu   |   2 +-
 ..._s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu |   2 +-
 ...mm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu |   2 +-
 .../gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu    |   2 +-
 .../gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu    |   2 +-
 .../gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu    |   2 +-
 .../gemm_s8t_s8n_f16t_tensor_op_s32_sm80.cu   |   2 +-
 .../gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu   |   2 +-
 ...mm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu |   2 +-
 .../gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu   |   2 +-
 .../gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu   |   2 +-
 ..._s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu |   2 +-
 ...mm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu |   2 +-
 .../gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu    |   2 +-
 ...emm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu |   2 +-
 .../gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu    |   2 +-
 .../gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu    |   2 +-
 ...emm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu |   2 +-
 .../gemm_splitk_serial_tensor_op_sm75.cu      |   2 +-
 .../unit/gemm/device/gemm_splitk_simt_sm50.cu |   2 +-
 .../gemm/device/gemm_splitk_tensor_op_sm70.cu |   2 +-
 .../gemm/device/gemm_splitk_tensor_op_sm75.cu |   2 +-
 test/unit/gemm/device/gemm_testbed_3x.hpp     |  10 +-
 test/unit/gemm/device/gemm_testbed_3x_evt.hpp |   2 +-
 .../gemm/device/gemm_testbed_3x_ptr_array.hpp | 142 ++-
 .../gemm_testbed_3x_tensor_broadcast.hpp      |   2 +-
 ...emm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...emm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...emm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...emm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu |   2 +-
 ...mm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu |   2 +-
 ...8n_bf16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...s8n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...8n_bf16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...u8n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...al_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu |   2 +-
 ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu |   2 +-
 ...al_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu |   2 +-
 ...ersal_f16n_f16t_f32n_tensor_op_f32_sm75.cu |   2 +-
 ...ersal_f16n_f16t_f32t_tensor_op_f32_sm75.cu |   2 +-
 ...s8n_f16t_mixed_input_tensor_op_f16_sm80.cu |   2 +-
 ...s8n_f16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...s8n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...u8n_f16t_mixed_input_tensor_op_f16_sm80.cu |   2 +-
 ...u8n_f16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...u8n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...s8n_s32t_mixed_input_tensor_op_s32_sm80.cu |   2 +-
 ..._s8n_s8t_mixed_input_tensor_op_s32_sm80.cu |   2 +-
 ...6n_bf16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f16t_mixed_input_tensor_op_f16_sm80.cu |   2 +-
 ...16n_f16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...s4n_s32t_mixed_input_tensor_op_s32_sm80.cu |   2 +-
 ..._s4n_s8t_mixed_input_tensor_op_s32_sm80.cu |   2 +-
 ...6n_bf16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f16t_mixed_input_tensor_op_f16_sm80.cu |   2 +-
 ...16n_f16t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...16n_f32t_mixed_input_tensor_op_f32_sm80.cu |   2 +-
 ...adcast_f16n_f16n_f16n_tensorop_f32_sm75.cu |   2 +-
 ...uction_f16n_f16n_f16n_tensorop_f32_sm75.cu |   2 +-
 ...uction_f16t_f16n_f16n_tensorop_f32_sm80.cu |   2 +-
 test/unit/gemm/device/gemv.cu                 |   2 +-
 .../hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu |   2 +-
 .../hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu |   2 +-
 ..._cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 ..._cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu |   2 +-
 .../hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu |   2 +-
 ...4n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu |   2 +-
 ...cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu |   2 +-
 ...cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu |   2 +-
 .../her2k_cf32h_cf32n_tensor_op_f32_sm80.cu   |   2 +-
 ...r2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu |   2 +-
 .../her2k_cf64_cf64_tensor_op_f64_sm90.cu     |   2 +-
 ..._cf64h_cf64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 ..._cf64n_cf64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../her2k_cf64n_cf64n_tensor_op_f64_sm80.cu   |   2 +-
 .../her2k_cf64n_cf64t_tensor_op_f64_sm80.cu   |   2 +-
 .../herk_cf32h_cf32n_tensor_op_f32_sm80.cu    |   2 +-
 ...erk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu |   2 +-
 .../herk_cf64_cf64_tensor_op_f64_sm90.cu      |   2 +-
 .../herk_cf64h_cf64n_tensor_op_f64_sm80.cu    |   2 +-
 test/unit/gemm/device/multistage_testbed.h    |   2 +-
 .../device/multistage_testbed_interleaved.h   |   2 +-
 .../device/rank_2k_grouped_scheduler_sm80.cu  |   2 +-
 test/unit/gemm/device/simt_cgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_cgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_cgemm_nt_sm80.cu   |   2 +-
 test/unit/gemm/device/simt_cgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_cgemm_tn_sm80.cu   |   2 +-
 test/unit/gemm/device/simt_cgemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_dgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_dgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_dgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_dgemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_f8gemm_tn_sm50.cu  |   2 +-
 test/unit/gemm/device/simt_hgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_hgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_hgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_hgemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_igemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_igemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_igemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_igemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_int8_igemm_sm61.cu |   2 +-
 .../gemm/device/simt_int8_igemm_sm61_perf.cu  |   2 +-
 .../device/simt_int8_igemm_sm61_sliced_k.cu   |   2 +-
 test/unit/gemm/device/simt_qgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_qgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_qgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_qgemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_nt_sm80.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_tn_sm80.cu   |   2 +-
 test/unit/gemm/device/simt_sgemm_tt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_sm50.py            |   4 +-
 test/unit/gemm/device/simt_zgemm_nn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_zgemm_nt_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_zgemm_tn_sm50.cu   |   2 +-
 test/unit/gemm/device/simt_zgemm_tt_sm50.cu   |   2 +-
 .../gemm/device/sm50_gemm_f32_f32_f32_simt.cu |   2 +-
 .../gemm/device/sm50_gemm_f64_f64_f64_simt.cu |   2 +-
 .../gemm/device/sm61_gemm_s8_s8_s32_simt.cu   |   2 +-
 .../sm80_gemm_f16_f16_f32_tensor_op_f32.cu    |   2 +-
 .../gemm/device/sm80_gemm_f32_f32_f32_simt.cu |   2 +-
 .../gemm/device/sm80_gemm_f64_f64_f64_simt.cu |   2 +-
 .../sm80_gemm_f64_f64_f64_tensor_op_f64.cu    |   2 +-
 .../device/sm80_gemm_s8_s8_s32_tensor_op.cu   |   2 +-
 .../sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu  |   2 +-
 test/unit/gemm/device/sm90_evt_operations.hpp |   2 +-
 ...emm_bf16_bf16_bf16_alignx_tensor_op_f32.cu |   2 +-
 ...16_alignx_tensor_op_f32_warpspecialized.cu |   2 +-
 ...nsor_op_f32_warpspecialized_cooperative.cu |   2 +-
 ..._tensor_op_f32_warpspecialized_pingpong.cu |   2 +-
 .../sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu |   2 +-
 ...0_gemm_f16_f16_f16_alignx_tensor_op_f32.cu |   2 +-
 ...16_alignx_tensor_op_f32_warpspecialized.cu |   2 +-
 ...nsor_op_f32_warpspecialized_cooperative.cu |   2 +-
 ..._tensor_op_f32_warpspecialized_pingpong.cu |   2 +-
 .../device/sm90_gemm_f16_f16_f16_tensor_op.cu |   2 +-
 ...f16_tensor_op_f32_cluster_unspecialized.cu |   2 +-
 ...6_tensor_op_f32_cluster_warpspecialized.cu |   2 +-
 ...f32_cluster_warpspecialized_cooperative.cu |   2 +-
 ...er_warpspecialized_cooperative_aux_load.cu |   2 +-
 ...r_warpspecialized_cooperative_aux_store.cu |   2 +-
 ...pecialized_cooperative_bias_elementwise.cu |   2 +-
 ...cluster_warpspecialized_cooperative_dag.cu |   2 +-
 ...ster_warpspecialized_cooperative_reduce.cu |   2 +-
 ...rpspecialized_cooperative_row_broadcast.cu |   2 +-
 ...op_f32_cluster_warpspecialized_pingpong.cu |   2 +-
 ...uster_warpspecialized_pingpong_aux_load.cu |   2 +-
 ...rpspecialized_pingpong_bias_elementwise.cu |   2 +-
 ...32_cluster_warpspecialized_pingpong_dag.cu |   2 +-
 ...cluster_warpspecialized_pingpong_reduce.cu |   2 +-
 ..._warpspecialized_pingpong_row_broadcast.cu |   2 +-
 ..._f16_tensor_op_f32_cooperative_stream_k.cu |   2 +-
 ...mm_f16_f16_f16_tensor_op_f32_group_gemm.cu |   2 +-
 ...6_f16_tensor_op_f32_group_gemm_pingpong.cu |   2 +-
 ...emm_f16_f16_f16_tensor_op_f32_ptr_array.cu |   2 +-
 ...16_f16_tensor_op_f32_ptr_array_pingpong.cu |   2 +-
 ..._f16_f16_tensor_op_f32_tensor_broadcast.cu |   2 +-
 ..._rs_cluster_warpspecialized_cooperative.cu |   2 +-
 .../sm90_gemm_f32_f32_f32_tensor_op_f32.cu    |   2 +-
 ..._f32_f32_tensor_op_f32_tensor_broadcast.cu |   2 +-
 .../sm90_gemm_f8_f8_bf16_tensor_op_fp32.cu    |   2 +-
 ...sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu |   2 +-
 ...f32_cluster_warpspecialized_cooperative.cu |   2 +-
 ...cluster_warpspecialized_cooperative_evt.cu |   2 +-
 ..._f32_tensor_op_f32_cooperative_stream_k.cu |   2 +-
 ..._rs_cluster_warpspecialized_cooperative.cu |   2 +-
 .../sm90_gemm_f8_f8_f32_tensor_op_fp32.cu     |   2 +-
 .../sm90_gemm_f8_f8_f8_tensor_op_fp32.cu      |   2 +-
 .../sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu  |   2 +-
 ...sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu |   2 +-
 ...s8_alignx_tensor_op_s32_warpspecialized.cu |   2 +-
 ...nsor_op_s32_warpspecialized_cooperative.cu |   2 +-
 ..._tensor_op_s32_warpspecialized_pingpong.cu |   2 +-
 .../sm90_gemm_s8_s8_s8_tensor_op_s32.cu       |   2 +-
 ...s8_s8_s8_tensor_op_s32_tensor_broadcast.cu |   2 +-
 .../device/sm90_gemm_stream_k_scheduler.cu    |   2 +-
 ...gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu |   2 +-
 ...32_alignx_tensor_op_f32_warpspecialized.cu |   2 +-
 ...nsor_op_f32_warpspecialized_cooperative.cu |   2 +-
 ..._tensor_op_f32_warpspecialized_pingpong.cu |   2 +-
 .../sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu  |   2 +-
 ..._op_f32_gmma_rs_cluster_warpspecialized.cu |   2 +-
 .../device/sm90_gett_f16_f16_f16_tensor_op.cu |   2 +-
 ...0_sparse_gemm_f16_f16_f32_tensor_op_f32.cu |   2 +-
 ...m90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu | 119 ++-
 ...m90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu |   2 +-
 ...sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu |   2 +-
 .../symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu |   2 +-
 .../symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu |   2 +-
 ..._cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 ..._cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu |   2 +-
 .../symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu |   2 +-
 ...4n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu |   2 +-
 ...cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu |   2 +-
 ...cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu |   2 +-
 ...mm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 ...mm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu |   2 +-
 ...mm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 .../device/symm_f64_f64_tensor_op_f64_sm90.cu |   2 +-
 .../symm_f64n_f64n_tensor_op_f64_ls_sm80.cu   |   2 +-
 .../symm_f64n_f64n_tensor_op_f64_rs_sm80.cu   |   2 +-
 .../symm_f64n_f64t_tensor_op_f64_ls_sm80.cu   |   2 +-
 .../symm_f64n_f64t_tensor_op_f64_rs_sm80.cu   |   2 +-
 .../symm_f64t_f64n_tensor_op_f64_ls_sm80.cu   |   2 +-
 .../symm_f64t_f64n_tensor_op_f64_rs_sm80.cu   |   2 +-
 .../symm_f64t_f64t_tensor_op_f64_ls_sm80.cu   |   2 +-
 .../symm_f64t_f64t_tensor_op_f64_rs_sm80.cu   |   2 +-
 .../symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu  |   2 +-
 .../symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu  |   2 +-
 .../symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu  |   2 +-
 .../syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu   |   2 +-
 ...r2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu   |   2 +-
 ...r2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syr2k_cf64_cf64_tensor_op_f64_sm90.cu     |   2 +-
 ..._cf64n_cf64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu   |   2 +-
 ..._cf64n_cf64t_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu   |   2 +-
 ..._cf64t_cf64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 ..._cf64t_cf64t_tensor_op_f64_grouped_sm80.cu |   2 +-
 ...syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu |   2 +-
 ...syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syr2k_f64_f64_tensor_op_f64_sm90.cu       |   2 +-
 ...2k_f64n_f64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_f64n_f64n_tensor_op_f64_sm80.cu     |   2 +-
 ...2k_f64n_f64t_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_f64n_f64t_tensor_op_f64_sm80.cu     |   2 +-
 ...2k_f64t_f64n_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_f64t_f64n_tensor_op_f64_sm80.cu     |   2 +-
 ...2k_f64t_f64t_tensor_op_f64_grouped_sm80.cu |   2 +-
 .../syr2k_tf32n_f32n_tensor_op_f32_sm80.cu    |   2 +-
 .../syr2k_tf32t_f32n_tensor_op_f32_sm80.cu    |   2 +-
 .../syrk_cf32n_cf32n_tensor_op_f32_sm80.cu    |   2 +-
 ...yrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syrk_cf32n_cf32t_tensor_op_f32_sm80.cu    |   2 +-
 ...yrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syrk_cf64_cf64_tensor_op_f64_sm90.cu      |   2 +-
 .../syrk_cf64n_cf64n_tensor_op_f64_sm80.cu    |   2 +-
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |   2 +-
 .../syrk_cf64n_cf64t_tensor_op_f64_sm80.cu    |   2 +-
 .../syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu |   2 +-
 .../syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu |   2 +-
 .../device/syrk_f64_f64_tensor_op_f64_sm90.cu |   2 +-
 .../syrk_f64n_f64t_tensor_op_f64_sm80.cu      |   2 +-
 .../syrk_f64t_f64n_tensor_op_f64_sm80.cu      |   2 +-
 .../syrk_tf32n_f32t_tensor_op_f32_sm80.cu     |   2 +-
 .../syrk_tf32t_f32t_tensor_op_f32_sm80.cu     |   2 +-
 test/unit/gemm/device/testbed.h               |   2 +-
 test/unit/gemm/device/testbed_complex.h       |   2 +-
 .../gemm/device/testbed_gemm_with_broadcast.h |   2 +-
 .../gemm/device/testbed_gemm_with_reduction.h |   2 +-
 test/unit/gemm/device/testbed_grouped.h       |   2 +-
 .../gemm/device/testbed_grouped_rank_2k.h     |   2 +-
 .../testbed_grouped_rank_2k_scheduler.h       |   2 +-
 .../gemm/device/testbed_grouped_scheduler.h   |   2 +-
 test/unit/gemm/device/testbed_interleaved.h   |   2 +-
 .../unit/gemm/device/testbed_planar_complex.h |   2 +-
 .../gemm/device/testbed_rank2k_universal.h    |   2 +-
 .../gemm/device/testbed_rank_k_universal.h    |   2 +-
 test/unit/gemm/device/testbed_sanity.h        |   2 +-
 test/unit/gemm/device/testbed_sparse.h        |   2 +-
 test/unit/gemm/device/testbed_splitk.h        |   2 +-
 .../unit/gemm/device/testbed_symm_universal.h |   2 +-
 .../unit/gemm/device/testbed_trmm_universal.h |   2 +-
 test/unit/gemm/device/testbed_universal.h     |   2 +-
 test/unit/gemm/device/testbed_utils.h         |   2 +-
 test/unit/gemm/device/testbed_with_absmax.h   |   2 +-
 ...mm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu |   2 +-
 ...32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu |   2 +-
 .../trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu |   2 +-
 ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu |   2 +-
 ...mm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu |   2 +-
 ...2n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 ...2n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu |   2 +-
 ...2t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 ...2t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu |   2 +-
 .../trmm_f64_f64_f64_tensor_op_f64_sm90.cu    |   2 +-
 ...mm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu |   2 +-
 ...mm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu |   2 +-
 ...mm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu |   2 +-
 ...mm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu |   2 +-
 ...mm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu |   2 +-
 ..._tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu |   2 +-
 ..._tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu |   2 +-
 ..._tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu |   2 +-
 ..._tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu |   2 +-
 test/unit/gemm/kernel/batched_gemv.cu         |   2 +-
 test/unit/gemm/kernel/testbed_gemv.h          |   2 +-
 test/unit/gemm/thread/CMakeLists.txt          |   2 +-
 test/unit/gemm/thread/gemm_sm50.cu            |   2 +-
 test/unit/gemm/thread/gemm_sm60.cu            |   2 +-
 test/unit/gemm/thread/gemm_sm61.cu            |   2 +-
 test/unit/gemm/thread/host/CMakeLists.txt     |   2 +-
 test/unit/gemm/thread/host/gemm_sm60_host.cu  |   2 +-
 test/unit/gemm/thread/host/testbed_host.h     |   2 +-
 test/unit/gemm/thread/testbed.h               |   2 +-
 test/unit/gemm/threadblock/CMakeLists.txt     |   2 +-
 test/unit/gemm/threadblock/batched_gemv.cu    |   2 +-
 .../gemm/threadblock/epilogue_workspace.cu    |   2 +-
 test/unit/gemm/threadblock/mma_multistage.cu  |   2 +-
 .../threadblock/mma_multistage_slicedk.cu     |   2 +-
 .../gemm/threadblock/mma_multistage_sparse.cu |   2 +-
 .../mma_multistage_sparse_testbed.h           |   2 +-
 .../gemm/threadblock/mma_multistage_testbed.h |   2 +-
 .../mma_multistage_testbed_slicedk.h          |   2 +-
 .../gemm/threadblock/mma_pipelined_simt.cu    |   2 +-
 .../gemm/threadblock/mma_pipelined_slicedk.cu |   2 +-
 .../gemm/threadblock/mma_pipelined_sm70.cu    |   2 +-
 .../gemm/threadblock/mma_pipelined_sm75.cu    |   2 +-
 .../gemm/threadblock/mma_pipelined_sm80.cu    |   2 +-
 .../gemm/threadblock/mma_pipelined_testbed.h  |   2 +-
 .../mma_pipelined_testbed_slicedk.h           |   2 +-
 .../threadblock/mma_pipelined_wmma_sm70.cu    |   2 +-
 .../threadblock/mma_pipelined_wmma_sm75.cu    |   2 +-
 .../threadblock/mma_planar_complex_sm80.cu    |   2 +-
 .../threadblock/mma_planar_complex_testbed.h  |   2 +-
 .../threadblock/mma_singlestage_wmma_sm70.cu  |   2 +-
 .../threadblock/mma_singlestage_wmma_sm75.cu  |   2 +-
 test/unit/gemm/warp/CMakeLists.txt            |   2 +-
 test/unit/gemm/warp/gemm_complex_sm80.cu      |   2 +-
 test/unit/gemm/warp/gemm_complex_sm90.cu      |   2 +-
 .../gemm/warp/gemm_gaussian_complex_sm80.cu   |   2 +-
 test/unit/gemm/warp/gemm_mixed_input_sm80.cu  |   2 +-
 test/unit/gemm/warp/gemm_sm50.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm60.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm61.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm70.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm75.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm80.cu              |   2 +-
 test/unit/gemm/warp/gemm_sm90.cu              |   2 +-
 test/unit/gemm/warp/gemm_sparse_sm80.cu       |   2 +-
 test/unit/gemm/warp/testbed.h                 |   2 +-
 test/unit/gemm/warp/wmma_sm70.cu              |   2 +-
 test/unit/gemm/warp/wmma_sm72.cu              |   2 +-
 test/unit/gemm/warp/wmma_sm75.cu              |   2 +-
 test/unit/layout/CMakeLists.txt               |   2 +-
 test/unit/layout/matrix.cu                    |   2 +-
 test/unit/layout/tensor.cu                    |   2 +-
 test/unit/layout/tensor_nhwc.cu               |   2 +-
 test/unit/nvrtc/CMakeLists.txt                |   2 +-
 test/unit/nvrtc/cutlass/nvrtc/environment.h   |   2 +-
 test/unit/nvrtc/kernel/thread/contraction.hpp |   4 +-
 .../unit/nvrtc/kernel/thread/testbed_kernel.h |   2 +-
 test/unit/nvrtc/stdlib/assert.h               |   2 +-
 test/unit/nvrtc/stdlib/stdint.h               |   2 +-
 test/unit/nvrtc/thread/CMakeLists.txt         |   2 +-
 test/unit/nvrtc/thread/nvrtc_contraction.cu   |   2 +-
 test/unit/nvrtc/thread/nvrtc_gemm.cu          |   2 +-
 test/unit/nvrtc/thread/testbed.h              |   2 +-
 test/unit/pipeline/CMakeLists.txt             |   2 +-
 test/unit/pipeline/pipeline_async.cu          |   2 +-
 test/unit/pipeline/pipeline_tma_async.cu      |   2 +-
 .../pipeline_tma_async_warp_specialized.cu    |   2 +-
 ...e_tma_async_warp_specialized_persistent.cu |   2 +-
 test/unit/pipeline/sequence_barrier.cu        |   2 +-
 test/unit/pipeline/testbed.h                  |   2 +-
 test/unit/reduction/CMakeLists.txt            |   2 +-
 test/unit/reduction/device/CMakeLists.txt     |   2 +-
 .../device/tensor_reduce_contiguous.cu        |   2 +-
 .../reduction/device/tensor_reduce_strided.cu |   2 +-
 test/unit/reduction/kernel/CMakeLists.txt     |   2 +-
 test/unit/reduction/kernel/reduce_splitk.cu   |   2 +-
 .../reduction/kernel/reduce_splitk_testbed.h  |   2 +-
 test/unit/reduction/thread/CMakeLists.txt     |   2 +-
 .../unit/reduction/thread/reduction_thread.cu |   2 +-
 test/unit/reduction/thread/testbed.h          |   2 +-
 test/unit/substrate/CMakeLists.txt            |   2 +-
 test/unit/substrate/dependent_false.cpp       |   2 +-
 test/unit/test_unit.cpp                       |   2 +-
 test/unit/transform/CMakeLists.txt            |   2 +-
 test/unit/transform/device/CMakeLists.txt     |   2 +-
 .../device/sm90_sparse_gemm_compressor_f16.cu |   2 +-
 .../device/sm90_sparse_gemm_compressor_f32.cu |   2 +-
 .../device/sm90_sparse_gemm_compressor_f8.cu  |   2 +-
 .../sm90_sparse_gemm_compressor_legacy.hpp    |   2 +-
 .../device/testbed_sparse_gemm_compressor.hpp |   2 +-
 test/unit/transform/kernel/CMakeLists.txt     |   2 +-
 .../kernel/filter_format_transformer.cu       |   2 +-
 .../unit/transform/threadblock/CMakeLists.txt |   2 +-
 .../threadblock/predicated_tile_iterator.cu   |   2 +-
 .../regular_tile_iterator_tensor_op.cu        |   2 +-
 test/unit/util/CMakeLists.txt                 |   2 +-
 test/unit/util/cutlass_test_levels.cu         |   2 +-
 test/unit/util/rms_norm.cu                    |   2 +-
 test/unit/util/tensor_reduce.cu               |   2 +-
 tools/CMakeLists.txt                          |   2 +-
 tools/library/CMakeLists.txt                  |   4 +-
 .../include/cutlass/library/arch_mappings.h   |   2 +-
 .../include/cutlass/library/descriptions.h    |   2 +-
 .../library/include/cutlass/library/handle.h  |   2 +-
 .../library/include/cutlass/library/library.h |  24 +-
 .../include/cutlass/library/manifest.h        |   2 +-
 .../include/cutlass/library/operation_table.h |   2 +-
 .../include/cutlass/library/singleton.h       |   2 +-
 tools/library/include/cutlass/library/types.h |   2 +-
 tools/library/include/cutlass/library/util.h  |   9 +-
 tools/library/src/conv2d_operation.h          |  24 +-
 tools/library/src/conv3d_operation.h          |  13 +-
 tools/library/src/conv_operation_3x.hpp       |   7 +-
 tools/library/src/gemm_operation.h            |  67 +-
 tools/library/src/gemm_operation_3x.hpp       |  32 +-
 tools/library/src/handle.cu                   |   2 +-
 tools/library/src/library_internal.h          |   2 +-
 tools/library/src/manifest.cpp                |   2 +-
 tools/library/src/operation_table.cu          |   2 +-
 tools/library/src/rank_2k_operation.h         |  13 +-
 tools/library/src/rank_k_operation.h          |  13 +-
 .../reduction/init_reduction_operations.cu    |   2 +-
 .../library/src/reduction/reduction_device.cu |   2 +-
 .../src/reduction/reduction_operation.h       |  15 +-
 tools/library/src/reference/conv2d.cu         |   2 +-
 tools/library/src/reference/conv3d.cu         |   2 +-
 .../src/reference/conv_reference_operation.h  |   9 +-
 .../src/reference/gemm_e4m3a_e4m3out.cu       |   2 +-
 .../src/reference/gemm_e4m3a_e5m2out.cu       |   2 +-
 .../src/reference/gemm_e5m2a_e4m3out.cu       |   2 +-
 .../src/reference/gemm_e5m2a_e5m2out.cu       |   2 +-
 tools/library/src/reference/gemm_fp32out.cu   |   2 +-
 .../src/reference/gemm_fp8in_bf16out.cu       |   2 +-
 .../src/reference/gemm_fp8in_fp16out.cu       |   2 +-
 .../src/reference/gemm_fp8in_fp32out.cu       |   2 +-
 .../src/reference/gemm_fp_mixed_input.cu      |   2 +-
 tools/library/src/reference/gemm_fp_other.cu  |   2 +-
 tools/library/src/reference/gemm_int4.cu      |   2 +-
 .../src/reference/gemm_int8_interleaved_32.cu |   2 +-
 .../src/reference/gemm_int8_interleaved_64.cu |   2 +-
 .../src/reference/gemm_int_mixed_input.cu     |   2 +-
 .../src/reference/gemm_reference_operation.h  |   9 +-
 tools/library/src/reference/gemm_s8_s8_s32.cu |   2 +-
 tools/library/src/reference/gemm_u8_u8_s32.cu |   2 +-
 .../initialize_reference_operations.cu        |   2 +-
 tools/library/src/singleton.cu                |   2 +-
 .../library/src/sparse_gemm_operation_3x.hpp  |   8 +-
 tools/library/src/symm_operation.h            |  14 +-
 tools/library/src/trmm_operation.h            |  13 +-
 tools/library/src/util.cu                     |  49 +-
 tools/profiler/CMakeLists.txt                 |   2 +-
 .../profiler/conv2d_operation_profiler.h      |   2 +-
 .../profiler/conv3d_operation_profiler.h      |   2 +-
 .../include/cutlass/profiler/cublas_helpers.h |   2 +-
 .../include/cutlass/profiler/cudnn_helpers.h  |   2 +-
 .../cutlass/profiler/cutlass_profiler.h       |   2 +-
 .../profiler/include/cutlass/profiler/debug.h |   2 +-
 .../cutlass/profiler/device_allocation.h      |   2 +-
 .../include/cutlass/profiler/device_context.h |   2 +-
 .../cutlass/profiler/enumerated_types.h       |   2 +-
 .../profiler/gemm_operation_profiler.h        |   4 +-
 .../include/cutlass/profiler/gpu_timer.h      |   2 +-
 .../cutlass/profiler/operation_profiler.h     |  17 +-
 .../include/cutlass/profiler/options.h        |   9 +-
 .../cutlass/profiler/performance_report.h     |   2 +-
 .../cutlass/profiler/performance_result.h     |   2 +-
 .../include/cutlass/profiler/problem_space.h  |   9 +-
 .../profiler/rank_2k_operation_profiler.h     |   2 +-
 .../profiler/rank_k_operation_profiler.h      |   2 +-
 .../profiler/reduction_operation_profiler.h   |   2 +-
 .../profiler/sparse_gemm_operation_profiler.h |   2 +-
 .../profiler/symm_operation_profiler.h        |   2 +-
 .../profiler/trmm_operation_profiler.h        |   2 +-
 .../profiler/src/conv2d_operation_profiler.cu |  94 +-
 .../profiler/src/conv3d_operation_profiler.cu |  80 +-
 tools/profiler/src/cublas_helpers.cu          |   2 +-
 tools/profiler/src/cudnn_helpers.cpp          |   2 +-
 tools/profiler/src/cutlass_profiler.cu        |   2 +-
 tools/profiler/src/device_allocation.cu       |   2 +-
 tools/profiler/src/device_context.cu          |   2 +-
 tools/profiler/src/enumerated_types.cpp       |   2 +-
 tools/profiler/src/gemm_operation_profiler.cu | 230 ++---
 tools/profiler/src/gpu_timer.cpp              |   2 +-
 tools/profiler/src/main.cpp                   |   2 +-
 tools/profiler/src/operation_profiler.cu      | 281 ++++--
 tools/profiler/src/options.cu                 |  16 +-
 tools/profiler/src/performance_report.cpp     |   2 +-
 tools/profiler/src/performance_result.cu      |   2 +-
 tools/profiler/src/problem_space.cpp          |  40 +-
 .../src/rank_2k_operation_profiler.cu         |   6 +-
 .../profiler/src/rank_k_operation_profiler.cu |   6 +-
 .../src/sparse_gemm_operation_profiler.cu     |   6 +-
 tools/profiler/src/symm_operation_profiler.cu |   6 +-
 tools/profiler/src/trmm_operation_profiler.cu |   6 +-
 tools/util/CMakeLists.txt                     |   2 +-
 tools/util/include/cutlass/util/GPU_Clock.hpp |   2 +-
 .../util/include/cutlass/util/command_line.h  |   2 +-
 .../include/cutlass/util/cublas_wrappers.hpp  |   2 +-
 tools/util/include/cutlass/util/debug.h       |   2 +-
 tools/util/include/cutlass/util/device_dump.h |   2 +-
 .../include/cutlass/util/device_groupnorm.h   |   2 +-
 .../include/cutlass/util/device_layernorm.h   |   2 +-
 .../util/include/cutlass/util/device_memory.h |   2 +-
 .../cutlass/util/device_nchw_to_nhwc.h        |   2 +-
 .../cutlass/util/device_nhwc_padding.h        |   2 +-
 .../cutlass/util/device_nhwc_pooling.h        |   2 +-
 .../cutlass/util/device_nhwc_to_nchw.h        |   2 +-
 .../include/cutlass/util/device_rmsnorm.h     |   2 +-
 .../util/include/cutlass/util/device_utils.h  |   2 +-
 .../util/include/cutlass/util/distribution.h  |   2 +-
 tools/util/include/cutlass/util/exceptions.h  |   2 +-
 .../include/cutlass/util/gett_commandline.hpp |   2 +-
 .../util/include/cutlass/util/helper_cuda.hpp |   2 +-
 .../util/include/cutlass/util/host_reorder.h  |   2 +-
 tools/util/include/cutlass/util/host_tensor.h |   2 +-
 .../cutlass/util/host_tensor_planar_complex.h |   2 +-
 .../include/cutlass/util/host_uncompress.h    |   2 +-
 .../include/cutlass/util/index_sequence.h     |   2 +-
 .../include/cutlass/util/packed_stride.hpp    |   2 +-
 .../util/include/cutlass/util/print_error.hpp |   2 +-
 .../util/reference/detail/inner_product.h     |   2 +-
 .../reference/detail/linear_to_coordinate.h   |   2 +-
 .../util/reference/device/convolution.h       |   2 +-
 .../cutlass/util/reference/device/gemm.h      |   2 +-
 .../util/reference/device/gemm_complex.h      |   2 +-
 .../reference/device/gemm_planar_complex.h    |   2 +-
 .../cutlass/util/reference/device/gett.hpp    |   2 +-
 .../util/reference/device/kernel/gemm.h       |   2 +-
 .../device/kernel/tensor_elementwise.h        |   2 +-
 .../reference/device/kernel/tensor_foreach.h  |   2 +-
 .../util/reference/device/rank_2k_complex.h   |   2 +-
 .../util/reference/device/tensor_compare.h    |   2 +-
 .../util/reference/device/tensor_fill.h       |   2 +-
 .../util/reference/device/tensor_foreach.h    |   2 +-
 .../util/reference/device/tensor_reduce.h     |   2 +-
 .../util/reference/device/tensor_relu.h       |   2 +-
 .../util/reference/device/thread/gemm.h       |   2 +-
 .../cutlass/util/reference/host/conv.hpp      | 574 ++++++------
 .../cutlass/util/reference/host/convolution.h |   2 +-
 .../util/reference/host/error_metrics.h       |   2 +-
 .../cutlass/util/reference/host/gemm.h        |   2 +-
 .../util/reference/host/gemm_complex.h        |   2 +-
 .../util/reference/host/gemm_planar_complex.h |   2 +-
 .../cutlass/util/reference/host/gett.hpp      |   2 +-
 .../cutlass/util/reference/host/rank_2k.h     |   2 +-
 .../util/reference/host/rank_2k_complex.h     |   2 +-
 .../util/reference/host/rank_k_complex.h      |   2 +-
 .../cutlass/util/reference/host/symm.h        |   2 +-
 .../util/reference/host/symm_complex.h        |   2 +-
 .../util/reference/host/tensor_compare.h      |   2 +-
 .../util/reference/host/tensor_compare.hpp    |   2 +-
 .../cutlass/util/reference/host/tensor_copy.h |   2 +-
 .../util/reference/host/tensor_elementwise.h  |   2 +-
 .../cutlass/util/reference/host/tensor_fill.h |   2 +-
 .../util/reference/host/tensor_fill.hpp       |   2 +-
 .../util/reference/host/tensor_foreach.h      |   2 +-
 .../cutlass/util/reference/host/tensor_norm.h |   2 +-
 .../util/reference/host/tensor_reduce.h       |   2 +-
 .../util/reference/host/tensor_reduce.hpp     |   2 +-
 .../cutlass/util/reference/host/trmm.h        |   2 +-
 .../util/reference/host/trmm_complex.h        |   2 +-
 .../include/cutlass/util/tensor_view_io.h     |   2 +-
 tools/util/include/cutlass/util/type_traits.h |   2 +-
 2030 files changed, 8947 insertions(+), 3475 deletions(-)
 create mode 100644 examples/65_distributed_gemm/65_distributed_gemm.cu
 rename examples/{65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling => 65_distributed_gemm}/CMakeLists.txt (89%)
 create mode 100644 examples/65_distributed_gemm/README.md
 create mode 100644 examples/65_distributed_gemm/REQUIREMENTS.md
 create mode 100644 examples/65_distributed_gemm/util/benchmark.h
 create mode 100644 examples/65_distributed_gemm/util/device_copy.h
 rename examples/{65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu => 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu} (99%)
 create mode 100644 examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt
 rename examples/{65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling => 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling}/hopper_fp8_commandline.hpp (98%)
 rename examples/{65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling => 67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling}/reference/host/gemm_with_blockwise_scaling.h (99%)
 create mode 100644 include/cutlass/experimental/distributed/device/detail.hpp
 create mode 100644 include/cutlass/experimental/distributed/device/dist_gemm_universal_wrapper.hpp
 create mode 100644 include/cutlass/experimental/distributed/device/full_barrier.hpp
 create mode 100644 include/cutlass/experimental/distributed/kernel/detail.hpp
 create mode 100644 include/cutlass/experimental/distributed/kernel/dist_gemm_kernel_wrapper.hpp
 create mode 100644 include/cutlass/experimental/distributed/kernel/full_barrier.hpp
 create mode 100644 include/cutlass/experimental/distributed/schedules/dist_gemm_1d_schedules.hpp
 create mode 100644 include/cutlass/experimental/distributed/schedules/dist_gemm_base_schedule.hpp
 create mode 100644 include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized_fp8.hpp

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c105cf8..f673e331 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,12 @@
 # NVIDIA CUTLASS Changelog
+## [3.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.7.0) (2025-01-11)
+- [Hopper blockwise scaling FP8 GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses a 2D scaling tensor, assigning one value per threadblock.  This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop.  Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439).
+- [Distributed GEMM](./examples/65_distributed_gemm/65_distributed_gemm.cu) is a new (experimental) API which can turn existing CUTLASS GEMM kernels into pipelined Tensor Parallel GEMMs that run efficiently on an NVLink-based network of GPUs. Its pipelining schedules can hide most of the communication behind computation, and rely on point-to-point communication, which can simply use the CUDA runtime's peer device access feature. It also utilizes remote TMA loads and memcopies with CUDA graphs to handle communication primarily through the Copy Engine, leaving all SMs free for Hopper's persistent kernels.  For more details you can refer to the [DistGEMM blog post](https://blog.shi-labs.com/distributed-gemm-88be6a481e2b).
+- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu).
+- Enabled high precision accumulation for Hopper FP8 Sparse GEMM.
+- Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs!
+- Optimal code generation with CUDA toolkit version 12.6.
+
 ## [3.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.6.0) (2024-10-03)
 
 - [Hopper structured sparse GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu).
@@ -440,7 +448,7 @@
 
 ## Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9c501bc..e50fd76e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/CUDA.cmake b/CUDA.cmake
index 7e91adb8..cde1e995 100644
--- a/CUDA.cmake
+++ b/CUDA.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/LICENSE.txt b/LICENSE.txt
index 52550084..47016fa7 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
index e61335f2..616b02d5 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition")
 
-# CUTLASS 3.6.0
+# CUTLASS 3.7.0
 
-_CUTLASS 3.6.0 - October 2024_
+_CUTLASS 3.7.0 - January 2025_
 
 CUTLASS is a collection of CUDA C++ template abstractions for implementing
 high-performance matrix-matrix multiplication (GEMM) and related computations at all levels 
@@ -41,27 +41,14 @@ and improves code composability and readability. More documentation specific to
 
 In addition to GEMMs, CUTLASS implements high-performance convolution via the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components.
 
+# What's New in CUTLASS 3.7
 
-# What's New in CUTLASS 3.6
+CUTLASS 3.7.0 is an update to CUTLASS adding:
 
-CUTLASS 3.6.0 is an update to CUTLASS adding:
-
-- [Hopper structured sparse GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu).
-  + [FP16](./test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu)
-  + [FP8](./test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu)
-  + [INT8](./test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu)
-  + [TF32](./test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu)
-- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](./include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API.
-- [An improved mixed input GEMM](./examples/55_hopper_mixed_dtype_gemm/README.md) and a [lookup table implementation](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode.
-- [EVT nodes for Top-K selection and softmax](./include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](./examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu).
-- [Programmatic Dependent Launch](./include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/dependent_kernel_launch.md).
-- [A new debugging tool, synclog](./include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details.
-- A new TMA-enabled [epilogue](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support.
-- A SIMT-enabled pointer-array [epilogue](./include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp).
-- A new [Ping-Pong kernel schedule for Grouped GEMM](./include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations.
-- [A new instantiation strategy for CUTLASS profiler kernels](./python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/profiler.md#instantiating-more-kernels-with-hopper).
-- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](./include/cutlass/bfloat16.h)
-- Fixed use of isnan on Windows for [`half_t`](./test/unit/core/functional.cu).
+- A new [Hopper blockwise scaling FP8 GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) where the operands and block scaling tensor are staged via shared memory.
+- [Distributed GEMM](./examples/65_distributed_gemm/65_distributed_gemm.cu) is an experimental pipelined Tensor Parallelism implementation utilizing existing CUTLASS kernels and CUDA runtime features, which can hide most of the communication behind computation.
+- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu).
+- Enabled high precision accumulation for Hopper FP8 Sparse GEMM.
 
 Minimum requirements:
 
@@ -540,7 +527,7 @@ The official list of CUTLASS developers and contributors is available here: [CON
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/bin2hex.cmake b/bin2hex.cmake
index b34e0284..c03cdf78 100644
--- a/bin2hex.cmake
+++ b/bin2hex.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2019 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cmake/CTestTestfile.configure.cmake b/cmake/CTestTestfile.configure.cmake
index 611b3d18..0bd42de3 100644
--- a/cmake/CTestTestfile.configure.cmake
+++ b/cmake/CTestTestfile.configure.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cmake/CTestTestfile.test.configure.cmake b/cmake/CTestTestfile.test.configure.cmake
index 31dba544..c31f354a 100644
--- a/cmake/CTestTestfile.test.configure.cmake
+++ b/cmake/CTestTestfile.test.configure.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cmake/NvidiaCutlassPackageConfig.cmake b/cmake/NvidiaCutlassPackageConfig.cmake
index 364fba7a..c9663644 100644
--- a/cmake/NvidiaCutlassPackageConfig.cmake
+++ b/cmake/NvidiaCutlassPackageConfig.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cmake/googletest.cmake b/cmake/googletest.cmake
index d220cfad..4983d46c 100644
--- a/cmake/googletest.cmake
+++ b/cmake/googletest.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cmake/nop.cu b/cmake/nop.cu
index be2b1588..25b9e74c 100644
--- a/cmake/nop.cu
+++ b/cmake/nop.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cmake/version_extended.h.in b/cmake/version_extended.h.in
index 36130630..b94ed6c8 100644
--- a/cmake/version_extended.h.in
+++ b/cmake/version_extended.h.in
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/cuBLAS.cmake b/cuBLAS.cmake
index 383871fd..5ff21cd0 100644
--- a/cuBLAS.cmake
+++ b/cuBLAS.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/cuDNN.cmake b/cuDNN.cmake
index 0b37ff7c..30b58581 100644
--- a/cuDNN.cmake
+++ b/cuDNN.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt
index 9002aad9..8fcc9467 100644
--- a/examples/00_basic_gemm/CMakeLists.txt
+++ b/examples/00_basic_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu
index c867112f..df8009e0 100644
--- a/examples/00_basic_gemm/basic_gemm.cu
+++ b/examples/00_basic_gemm/basic_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt
index bf37d18a..e95fc292 100644
--- a/examples/01_cutlass_utilities/CMakeLists.txt
+++ b/examples/01_cutlass_utilities/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu
index 43a3d46d..1f3b4fc7 100644
--- a/examples/01_cutlass_utilities/cutlass_utilities.cu
+++ b/examples/01_cutlass_utilities/cutlass_utilities.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt
index 0216f2b4..fe80acc5 100644
--- a/examples/02_dump_reg_shmem/CMakeLists.txt
+++ b/examples/02_dump_reg_shmem/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
index 3db7821f..4f04914f 100644
--- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu
+++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt
index be8c7436..3244e502 100644
--- a/examples/03_visualize_layout/CMakeLists.txt
+++ b/examples/03_visualize_layout/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h
index d4224668..9cfb284e 100644
--- a/examples/03_visualize_layout/options.h
+++ b/examples/03_visualize_layout/options.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu
index d20c893a..64269c95 100644
--- a/examples/03_visualize_layout/register_layout.cu
+++ b/examples/03_visualize_layout/register_layout.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h
index 0375f325..c840c90e 100644
--- a/examples/03_visualize_layout/register_layout.h
+++ b/examples/03_visualize_layout/register_layout.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp
index 1edf830d..ebc211a7 100644
--- a/examples/03_visualize_layout/visualize_layout.cpp
+++ b/examples/03_visualize_layout/visualize_layout.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h
index f070bad2..13318a05 100644
--- a/examples/03_visualize_layout/visualize_layout.h
+++ b/examples/03_visualize_layout/visualize_layout.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt
index 55482729..49399cc9 100644
--- a/examples/04_tile_iterator/CMakeLists.txt
+++ b/examples/04_tile_iterator/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu
index b9441a56..fdfaaac9 100644
--- a/examples/04_tile_iterator/tile_iterator.cu
+++ b/examples/04_tile_iterator/tile_iterator.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt
index cd69403a..c81579cc 100644
--- a/examples/05_batched_gemm/CMakeLists.txt
+++ b/examples/05_batched_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu
index 5fb7518f..8b8d64c2 100644
--- a/examples/05_batched_gemm/batched_gemm.cu
+++ b/examples/05_batched_gemm/batched_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt
index e0d11d0c..9523bfec 100644
--- a/examples/06_splitK_gemm/CMakeLists.txt
+++ b/examples/06_splitK_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu
index 1a559b83..4a4e93ef 100644
--- a/examples/06_splitK_gemm/splitk_gemm.cu
+++ b/examples/06_splitK_gemm/splitk_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt
index 2503cd3d..fb525ff6 100644
--- a/examples/07_volta_tensorop_gemm/CMakeLists.txt
+++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
index 23c2d9f4..d92d423b 100644
--- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
+++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt
index 2e0a5481..38d6e790 100644
--- a/examples/08_turing_tensorop_gemm/CMakeLists.txt
+++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
index 34f682de..cdb6c679 100644
--- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
+++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
index 673064ed..b0d1e8c2 100644
--- a/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/09_turing_tensorop_conv2dfprop/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
index adca0568..cdb3c310 100644
--- a/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
+++ b/examples/09_turing_tensorop_conv2dfprop/turing_tensorop_conv2dfprop.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/10_planar_complex/CMakeLists.txt b/examples/10_planar_complex/CMakeLists.txt
index ebe78d6b..d6ff4d57 100644
--- a/examples/10_planar_complex/CMakeLists.txt
+++ b/examples/10_planar_complex/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu
index 2d7ee95e..4324e680 100644
--- a/examples/10_planar_complex/planar_complex.cu
+++ b/examples/10_planar_complex/planar_complex.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/11_planar_complex_array/CMakeLists.txt b/examples/11_planar_complex_array/CMakeLists.txt
index 0e3fc9e9..23504fed 100644
--- a/examples/11_planar_complex_array/CMakeLists.txt
+++ b/examples/11_planar_complex_array/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu
index 0df6e572..aa5a8f02 100644
--- a/examples/11_planar_complex_array/planar_complex_array.cu
+++ b/examples/11_planar_complex_array/planar_complex_array.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt
index e3e428df..f9d58b83 100644
--- a/examples/12_gemm_bias_relu/CMakeLists.txt
+++ b/examples/12_gemm_bias_relu/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
index bca8e0ac..14432e57 100644
--- a/examples/12_gemm_bias_relu/gemm_bias_relu.cu
+++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/CMakeLists.txt b/examples/13_two_tensor_op_fusion/CMakeLists.txt
index 6819a976..18293033 100644
--- a/examples/13_two_tensor_op_fusion/CMakeLists.txt
+++ b/examples/13_two_tensor_op_fusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/README.md b/examples/13_two_tensor_op_fusion/README.md
index 4b9cb6d1..9fa8297d 100644
--- a/examples/13_two_tensor_op_fusion/README.md
+++ b/examples/13_two_tensor_op_fusion/README.md
@@ -86,7 +86,7 @@ threadblock. Typically this requires the 2nd Convolution uses 1x1 filter without
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
index 03ae75c6..df4cb76a 100644
--- a/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_conv2d_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
index 8e828d1f..f0e85cda 100644
--- a/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_gemm_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
index 2206bac0..b6267a15 100644
--- a/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_grouped_gemm_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
index f70c21af..4693e864 100644
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_conv2d_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
index 43a33b12..453f44cd 100644
--- a/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
+++ b/examples/13_two_tensor_op_fusion/b2b_interleaved_gemm_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
index 33809076..f9b2f49c 100644
--- a/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
index 5d6a0e94..37f81374 100644
--- a/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/device/b2b_implicit_gemm_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
index 9f5b89e5..43bbee00 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
index cf7133ee..ad00ddd5 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm75_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
index be6d7d54..50bb1030 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
index 50c886d1..313c7df5 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_f16_sm80_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
index 5e94c748..a3a6a34e 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
index aeea07f2..41f85ab6 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm75_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
index d91df2a6..56114219 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
index 2b865e6b..19b50d6a 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_convs_s8_sm80_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
index 44243c55..3af3ea70 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
index e4709be4..dd37b5cc 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm75_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
index 0e64d401..363eb956 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
index 9f6a2a08..852bc9cb 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_f16_sm80_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu
index 87331d04..f4df3e1d 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_grouped_f16_sm80_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
index a7f39d2d..14cd448f 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
index 671f48b7..515a788c 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm75_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
index b2f12b45..734c15f7 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_rf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
index 84354221..36406134 100644
--- a/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
+++ b/examples/13_two_tensor_op_fusion/fused_two_gemms_s8_sm80_shmem.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
index fca87a1d..6070f86a 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h
index 13faadf0..cc35d91b 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_gemm_grouped_problem_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
index d249a2c2..6794fcc1 100644
--- a/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
+++ b/examples/13_two_tensor_op_fusion/kernel/b2b_implicit_gemm_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
index 1b604c04..46254b53 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
index 0168637b..dbb21aec 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
index d76fe812..feb238cf 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
index 462ad1ef..35a5681d 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
index e9535676..eca1c611 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_conv2d_fprop_smem_accumulator_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
index 2ad3d7f3..e2cc9437 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
index ad548bc9..0a4530f6 100644
--- a/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/kernel/default_b2b_gemm_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/kernel/grouped.h b/examples/13_two_tensor_op_fusion/kernel/grouped.h
index 2698a281..0ac841d4 100644
--- a/examples/13_two_tensor_op_fusion/kernel/grouped.h
+++ b/examples/13_two_tensor_op_fusion/kernel/grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h b/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h
index e1ba6c56..4bf3c532 100644
--- a/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h
+++ b/examples/13_two_tensor_op_fusion/reference/device/tensor_scale_bias.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/test_run.h b/examples/13_two_tensor_op_fusion/test_run.h
index 2bd6c720..1fba44d6 100644
--- a/examples/13_two_tensor_op_fusion/test_run.h
+++ b/examples/13_two_tensor_op_fusion/test_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
index 574b123d..1feb71cf 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
index e7c7ad12..64181870 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_multistage_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
index 8313cef8..97466a1c 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
index 9775c19e..e5d91b12 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_implicit_gemm_pipelined_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
index 55a41be1..c845f202 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
index 2d5f6163..c0356df4 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_base_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
index 3fb684ce..b9388a73 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
index 35c4f5cc..b089bdba 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_multistage_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
index d5f16294..d8a9d4c6 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
index c3393e0c..eb23879b 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/b2b_mma_pipelined_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
index 2ea38ceb..b7aa1ffe 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
index 7a97ce03..a848f5c4 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/default_b2b_mma_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h
index c79b7c77..e0331380 100644
--- a/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h
+++ b/examples/13_two_tensor_op_fusion/threadblock/grouped_threadblock_swizzle.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
index 3e0b870f..79d45375 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
+++ b/examples/14_ampere_tf32_tensorop_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
index 99d3cdb1..895b52b1 100644
--- a/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
+++ b/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
index 02d32058..e4303f24 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
+++ b/examples/15_ampere_sparse_tensorop_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
index e92b717c..8cbb832d 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
+++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu
index dcab5ac1..2dcd09ce 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu
+++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_universal.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu
index 90aa4452..d18db16c 100644
--- a/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu
+++ b/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
index cdc3f11b..c98e4449 100644
--- a/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
+++ b/examples/16_ampere_tensorop_conv2dfprop/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
index c0395f58..86e3a966 100644
--- a/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
+++ b/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/17_fprop_per_channel_bias/CMakeLists.txt b/examples/17_fprop_per_channel_bias/CMakeLists.txt
index 350a2799..667783d8 100644
--- a/examples/17_fprop_per_channel_bias/CMakeLists.txt
+++ b/examples/17_fprop_per_channel_bias/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
index f1658c0f..72db5657 100644
--- a/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
+++ b/examples/17_fprop_per_channel_bias/fprop_per_channel_bias.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
index 5f4541c3..8d40e7f3 100644
--- a/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
+++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
index 1595dd60..a5a94c85 100644
--- a/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
+++ b/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/19_tensorop_canonical/CMakeLists.txt b/examples/19_tensorop_canonical/CMakeLists.txt
index 140f51bf..b5727948 100644
--- a/examples/19_tensorop_canonical/CMakeLists.txt
+++ b/examples/19_tensorop_canonical/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/19_tensorop_canonical/tensorop_canonical.cu b/examples/19_tensorop_canonical/tensorop_canonical.cu
index 1f0aa932..473e7ff4 100644
--- a/examples/19_tensorop_canonical/tensorop_canonical.cu
+++ b/examples/19_tensorop_canonical/tensorop_canonical.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/20_simt_canonical/CMakeLists.txt b/examples/20_simt_canonical/CMakeLists.txt
index 36dcda7a..ba4a2049 100644
--- a/examples/20_simt_canonical/CMakeLists.txt
+++ b/examples/20_simt_canonical/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/20_simt_canonical/simt_canonical.cu b/examples/20_simt_canonical/simt_canonical.cu
index 8f2fbc4d..bec2c04d 100644
--- a/examples/20_simt_canonical/simt_canonical.cu
+++ b/examples/20_simt_canonical/simt_canonical.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/21_quaternion_gemm/CMakeLists.txt b/examples/21_quaternion_gemm/CMakeLists.txt
index 742a9e78..3ebbaa16 100644
--- a/examples/21_quaternion_gemm/CMakeLists.txt
+++ b/examples/21_quaternion_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/21_quaternion_gemm/quaternion_gemm.cu b/examples/21_quaternion_gemm/quaternion_gemm.cu
index 025a9c97..dd2476a4 100644
--- a/examples/21_quaternion_gemm/quaternion_gemm.cu
+++ b/examples/21_quaternion_gemm/quaternion_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/22_quaternion_conv/CMakeLists.txt b/examples/22_quaternion_conv/CMakeLists.txt
index 52e17279..0881bbf0 100644
--- a/examples/22_quaternion_conv/CMakeLists.txt
+++ b/examples/22_quaternion_conv/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/22_quaternion_conv/quaternion_conv.cu b/examples/22_quaternion_conv/quaternion_conv.cu
index bc7173d1..170f978b 100644
--- a/examples/22_quaternion_conv/quaternion_conv.cu
+++ b/examples/22_quaternion_conv/quaternion_conv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
index e5b4ec03..ddaa2a64 100644
--- a/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
+++ b/examples/23_ampere_gemm_operand_reduction_fusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
index 4e5fca1a..d0b0aa06 100644
--- a/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
+++ b/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/24_gemm_grouped/CMakeLists.txt b/examples/24_gemm_grouped/CMakeLists.txt
index 32614a07..e321423f 100644
--- a/examples/24_gemm_grouped/CMakeLists.txt
+++ b/examples/24_gemm_grouped/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/24_gemm_grouped/gemm_grouped.cu b/examples/24_gemm_grouped/gemm_grouped.cu
index 993d554f..9dbe03b1 100644
--- a/examples/24_gemm_grouped/gemm_grouped.cu
+++ b/examples/24_gemm_grouped/gemm_grouped.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
index ce9a0bd0..3cd40fa7 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
+++ b/examples/25_ampere_fprop_mainloop_fusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
index a1ca2b07..285a0afa 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
+++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_3d_fprop_mainloop_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
index 87ed21c0..db337646 100644
--- a/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
+++ b/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
index e96050c3..0e4bf434 100644
--- a/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
+++ b/examples/26_ampere_wgrad_mainloop_fusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
index abb66b52..e983e855 100644
--- a/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
+++ b/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
index 9e561cb6..e30718fe 100644
--- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
+++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
index 5b38de6e..d6db6573 100644
--- a/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
+++ b/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
index 50a7c9e6..85840d36 100644
--- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
+++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
index d2a3b4c6..7eec75e5 100644
--- a/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
+++ b/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
index 0a995bf9..2b7d7bef 100644
--- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
+++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
index e406a7ed..ab0ed1bc 100644
--- a/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
+++ b/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/30_wgrad_split_k/30_wgrad_split_k.cu b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
index 822a7a55..d1f7417f 100644
--- a/examples/30_wgrad_split_k/30_wgrad_split_k.cu
+++ b/examples/30_wgrad_split_k/30_wgrad_split_k.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/30_wgrad_split_k/CMakeLists.txt b/examples/30_wgrad_split_k/CMakeLists.txt
index 98eda791..ca8e4f31 100644
--- a/examples/30_wgrad_split_k/CMakeLists.txt
+++ b/examples/30_wgrad_split_k/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/31_basic_syrk/CMakeLists.txt b/examples/31_basic_syrk/CMakeLists.txt
index 8d5571d2..b6b19a51 100644
--- a/examples/31_basic_syrk/CMakeLists.txt
+++ b/examples/31_basic_syrk/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu
index 9f9cd93a..adb1855a 100644
--- a/examples/31_basic_syrk/basic_syrk.cu
+++ b/examples/31_basic_syrk/basic_syrk.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/32_basic_trmm/CMakeLists.txt b/examples/32_basic_trmm/CMakeLists.txt
index 459dbe8f..d12c6f90 100644
--- a/examples/32_basic_trmm/CMakeLists.txt
+++ b/examples/32_basic_trmm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu
index d2eda76a..b41d0700 100644
--- a/examples/32_basic_trmm/basic_trmm.cu
+++ b/examples/32_basic_trmm/basic_trmm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
index 11504039..1937743a 100644
--- a/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
+++ b/examples/33_ampere_3xtf32_tensorop_symm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
index 22cb3286..7bc761c9 100644
--- a/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
+++ b/examples/33_ampere_3xtf32_tensorop_symm/ampere_3xtf32_tensorop_symm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/34_transposed_conv2d/34_transposed_conv2d.cu b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
index f3393c7c..f3be99b9 100644
--- a/examples/34_transposed_conv2d/34_transposed_conv2d.cu
+++ b/examples/34_transposed_conv2d/34_transposed_conv2d.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/34_transposed_conv2d/CMakeLists.txt b/examples/34_transposed_conv2d/CMakeLists.txt
index 414b011a..6225c2ba 100644
--- a/examples/34_transposed_conv2d/CMakeLists.txt
+++ b/examples/34_transposed_conv2d/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/35_gemm_softmax/CMakeLists.txt b/examples/35_gemm_softmax/CMakeLists.txt
index b7ecd99f..05e8b986 100644
--- a/examples/35_gemm_softmax/CMakeLists.txt
+++ b/examples/35_gemm_softmax/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/35_gemm_softmax/gemm_softmax.cu b/examples/35_gemm_softmax/gemm_softmax.cu
index 731e37b4..0e221452 100644
--- a/examples/35_gemm_softmax/gemm_softmax.cu
+++ b/examples/35_gemm_softmax/gemm_softmax.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
index 43208150..fc8f96c1 100644
--- a/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
+++ b/examples/35_gemm_softmax/gemm_with_epilogue_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/35_gemm_softmax/gemm_with_softmax.h b/examples/35_gemm_softmax/gemm_with_softmax.h
index 748905d9..31b2b769 100644
--- a/examples/35_gemm_softmax/gemm_with_softmax.h
+++ b/examples/35_gemm_softmax/gemm_with_softmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/36_gather_scatter_fusion/CMakeLists.txt b/examples/36_gather_scatter_fusion/CMakeLists.txt
index b54ea9ff..8698db84 100644
--- a/examples/36_gather_scatter_fusion/CMakeLists.txt
+++ b/examples/36_gather_scatter_fusion/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
index 55852730..badde725 100644
--- a/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
+++ b/examples/36_gather_scatter_fusion/gather_scatter_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt b/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt
index 334ec381..6a2ba65f 100644
--- a/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt
+++ b/examples/37_gemm_layernorm_gemm_fusion/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
index b5a0a1dc..95bda990 100644
--- a/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
+++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_layernorm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h
index 666f3cb5..5b36be05 100644
--- a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h
+++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_epilogue_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h
index b33954ec..8411807d 100644
--- a/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h
+++ b/examples/37_gemm_layernorm_gemm_fusion/gemm_with_layernorm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/38_syr2k_grouped/CMakeLists.txt b/examples/38_syr2k_grouped/CMakeLists.txt
index 461619ed..586d9cd9 100644
--- a/examples/38_syr2k_grouped/CMakeLists.txt
+++ b/examples/38_syr2k_grouped/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/38_syr2k_grouped/syr2k_grouped.cu b/examples/38_syr2k_grouped/syr2k_grouped.cu
index c1fb82e8..168f99e4 100644
--- a/examples/38_syr2k_grouped/syr2k_grouped.cu
+++ b/examples/38_syr2k_grouped/syr2k_grouped.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/39_gemm_permute/CMakeLists.txt b/examples/39_gemm_permute/CMakeLists.txt
index dd916fdf..8125572d 100644
--- a/examples/39_gemm_permute/CMakeLists.txt
+++ b/examples/39_gemm_permute/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/39_gemm_permute/gemm_permute.cu b/examples/39_gemm_permute/gemm_permute.cu
index 3651b9c5..40540a1b 100644
--- a/examples/39_gemm_permute/gemm_permute.cu
+++ b/examples/39_gemm_permute/gemm_permute.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/39_gemm_permute/layouts.h b/examples/39_gemm_permute/layouts.h
index 3632ec0a..5ffb04fd 100644
--- a/examples/39_gemm_permute/layouts.h
+++ b/examples/39_gemm_permute/layouts.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/39_gemm_permute/permute_info.h b/examples/39_gemm_permute/permute_info.h
index 57672e7c..6baf635a 100644
--- a/examples/39_gemm_permute/permute_info.h
+++ b/examples/39_gemm_permute/permute_info.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/conv2d.py b/examples/40_cutlass_py/conv2d.py
index 71e94259..cd11c74b 100644
--- a/examples/40_cutlass_py/conv2d.py
+++ b/examples/40_cutlass_py/conv2d.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/customizable/conv2d.py b/examples/40_cutlass_py/customizable/conv2d.py
index c6cbf87a..e03e6dba 100644
--- a/examples/40_cutlass_py/customizable/conv2d.py
+++ b/examples/40_cutlass_py/customizable/conv2d.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/customizable/gemm.py b/examples/40_cutlass_py/customizable/gemm.py
index 670294ad..8e0013f3 100644
--- a/examples/40_cutlass_py/customizable/gemm.py
+++ b/examples/40_cutlass_py/customizable/gemm.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/customizable/gemm_grouped.py b/examples/40_cutlass_py/customizable/gemm_grouped.py
index ac2adefa..a3319e60 100644
--- a/examples/40_cutlass_py/customizable/gemm_grouped.py
+++ b/examples/40_cutlass_py/customizable/gemm_grouped.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/gemm.py b/examples/40_cutlass_py/gemm.py
index 076f7582..dfd4113a 100644
--- a/examples/40_cutlass_py/gemm.py
+++ b/examples/40_cutlass_py/gemm.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/40_cutlass_py/gemm_grouped.py b/examples/40_cutlass_py/gemm_grouped.py
index 9ba2fa31..508b0894 100644
--- a/examples/40_cutlass_py/gemm_grouped.py
+++ b/examples/40_cutlass_py/gemm_grouped.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/CMakeLists.txt b/examples/41_fused_multi_head_attention/CMakeLists.txt
index 8ed62270..4087c3a8 100644
--- a/examples/41_fused_multi_head_attention/CMakeLists.txt
+++ b/examples/41_fused_multi_head_attention/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/debug_utils.h b/examples/41_fused_multi_head_attention/debug_utils.h
index efca4f13..a22f12b7 100644
--- a/examples/41_fused_multi_head_attention/debug_utils.h
+++ b/examples/41_fused_multi_head_attention/debug_utils.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/default_fmha_grouped.h b/examples/41_fused_multi_head_attention/default_fmha_grouped.h
index 54e537c9..14604f10 100644
--- a/examples/41_fused_multi_head_attention/default_fmha_grouped.h
+++ b/examples/41_fused_multi_head_attention/default_fmha_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h
index e166af4d..9ed17f4b 100644
--- a/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h
+++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h
index 6860ee9e..973ec345 100644
--- a/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h
+++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_rescale_output.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h b/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h
index bc2a28c0..b110abec 100644
--- a/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h
+++ b/examples/41_fused_multi_head_attention/epilogue/epilogue_thread_apply_logsumexp.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fmha_backward_test.py b/examples/41_fused_multi_head_attention/fmha_backward_test.py
index cdea9ded..8bc25462 100644
--- a/examples/41_fused_multi_head_attention/fmha_backward_test.py
+++ b/examples/41_fused_multi_head_attention/fmha_backward_test.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fmha_grouped.h b/examples/41_fused_multi_head_attention/fmha_grouped.h
index 5a2f928a..afc25e43 100644
--- a/examples/41_fused_multi_head_attention/fmha_grouped.h
+++ b/examples/41_fused_multi_head_attention/fmha_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h b/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h
index 38695d5a..f8821930 100644
--- a/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h
+++ b/examples/41_fused_multi_head_attention/fmha_grouped_problem_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu b/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu
index 544e400f..e9154887 100644
--- a/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu
+++ b/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
index cf02a7b9..e21839c6 100644
--- a/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
+++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_fixed_seqlen.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
index 49d8699a..3383ff17 100644
--- a/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
+++ b/examples/41_fused_multi_head_attention/fused_multihead_attention_variable_seqlen.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma.h b/examples/41_fused_multi_head_attention/gemm/custom_mma.h
index 80f5d4ea..f3a1d4cb 100644
--- a/examples/41_fused_multi_head_attention/gemm/custom_mma.h
+++ b/examples/41_fused_multi_head_attention/gemm/custom_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h
index be25f79c..66c099d1 100644
--- a/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h
+++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h
index eedcb637..145315e4 100644
--- a/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h
+++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h
index fd527a17..b967b86c 100644
--- a/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h
+++ b/examples/41_fused_multi_head_attention/gemm/custom_mma_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/find_default_mma.h b/examples/41_fused_multi_head_attention/gemm/find_default_mma.h
index ee7d3d60..0e38a203 100644
--- a/examples/41_fused_multi_head_attention/gemm/find_default_mma.h
+++ b/examples/41_fused_multi_head_attention/gemm/find_default_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h b/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h
index 0a67c4e8..7692389c 100644
--- a/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h
+++ b/examples/41_fused_multi_head_attention/gemm/mma_accum_lambda_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h
index 3e412743..94541b8d 100644
--- a/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h
+++ b/examples/41_fused_multi_head_attention/gemm/mma_from_smem.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/gemm_kernel_utils.h b/examples/41_fused_multi_head_attention/gemm_kernel_utils.h
index a770e0b6..3703257a 100644
--- a/examples/41_fused_multi_head_attention/gemm_kernel_utils.h
+++ b/examples/41_fused_multi_head_attention/gemm_kernel_utils.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h b/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h
index 3dbb0cf2..dad26742 100644
--- a/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h
+++ b/examples/41_fused_multi_head_attention/iterators/default_warp_iterator_from_smem.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h
index 64a58278..7a52e96a 100644
--- a/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h
+++ b/examples/41_fused_multi_head_attention/iterators/epilogue_predicated_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/make_residual_last.h b/examples/41_fused_multi_head_attention/iterators/make_residual_last.h
index 845a3c6b..a667d675 100644
--- a/examples/41_fused_multi_head_attention/iterators/make_residual_last.h
+++ b/examples/41_fused_multi_head_attention/iterators/make_residual_last.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h
index 6bc9e52c..d007f044 100644
--- a/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h
+++ b/examples/41_fused_multi_head_attention/iterators/predicated_tile_access_iterator_residual_last.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h b/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h
index 4db56560..1a3a9c7e 100644
--- a/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h
+++ b/examples/41_fused_multi_head_attention/iterators/predicated_tile_iterator_residual_last.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h b/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h
index f0f8ea60..18858ab7 100644
--- a/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h
+++ b/examples/41_fused_multi_head_attention/iterators/transpose_warp_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h b/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h
index d19b1907..3f4ebec6 100644
--- a/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h
+++ b/examples/41_fused_multi_head_attention/iterators/warp_iterator_from_smem.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/kernel_backward.h b/examples/41_fused_multi_head_attention/kernel_backward.h
index 6fd94a6c..5cdb7c21 100644
--- a/examples/41_fused_multi_head_attention/kernel_backward.h
+++ b/examples/41_fused_multi_head_attention/kernel_backward.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/kernel_forward.h b/examples/41_fused_multi_head_attention/kernel_forward.h
index 71d79415..ed4e1677 100644
--- a/examples/41_fused_multi_head_attention/kernel_forward.h
+++ b/examples/41_fused_multi_head_attention/kernel_forward.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/piped_subprocess.py b/examples/41_fused_multi_head_attention/piped_subprocess.py
index 82351f49..536bdb43 100644
--- a/examples/41_fused_multi_head_attention/piped_subprocess.py
+++ b/examples/41_fused_multi_head_attention/piped_subprocess.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h b/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h
index 2db928a8..048c1e01 100644
--- a/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h
+++ b/examples/41_fused_multi_head_attention/transform/tile_smem_loader.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/42_ampere_tensorop_group_conv/CMakeLists.txt b/examples/42_ampere_tensorop_group_conv/CMakeLists.txt
index d470548c..60552785 100644
--- a/examples/42_ampere_tensorop_group_conv/CMakeLists.txt
+++ b/examples/42_ampere_tensorop_group_conv/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
index 120f04b6..0e773ee5 100644
--- a/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
+++ b/examples/42_ampere_tensorop_group_conv/ampere_tensorop_group_conv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/43_ell_block_sparse_gemm/CMakeLists.txt b/examples/43_ell_block_sparse_gemm/CMakeLists.txt
index 0676c7bd..a25046ff 100644
--- a/examples/43_ell_block_sparse_gemm/CMakeLists.txt
+++ b/examples/43_ell_block_sparse_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
index 52d2d0cb..18efab14 100644
--- a/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
+++ b/examples/43_ell_block_sparse_gemm/ell_block_sparse_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/README.md b/examples/44_multi_gemm_ir_and_codegen/README.md
index fd1839c5..369f4c49 100644
--- a/examples/44_multi_gemm_ir_and_codegen/README.md
+++ b/examples/44_multi_gemm_ir_and_codegen/README.md
@@ -32,7 +32,7 @@ This experimental example has the following restrictions:
 
 ## Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h
index 2535e28e..8f76fe1f 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_bias_act_epilogue_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h
index 22f8e282..35fe758a 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/default_thread_map_tensor_op_for_fused_bias.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h
index 1acb4a2d..fdb96db9 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/fused_bias_act_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h
index c39e8ce1..c92307f5 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/threadblock/output_tile_thread_map_for_fused_bias.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h
index cf12fef3..88259d10 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/epilogue/warp/fused_bias_act_fragment_iterator_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h
index 0e89d6f8..cd4417bb 100644
--- a/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h
+++ b/examples/44_multi_gemm_ir_and_codegen/fixed_impl/gemm/warp/mma_tensor_op_fragment_iterator_without_output_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py
index 6aef3bca..15b10296 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_all_code.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py
index 5db6dd6e..c99ac9ae 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_cmake.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py
index 84621f2e..5abd22c8 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_customized_epilogue.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
index 371a4be8..e10b89ad 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_device.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py
index 919c777e..28df5665 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_ir.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
index 2bbaf26b..af901ac8 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_kernel.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py
index 6474d95c..75c2b686 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_sample.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py
index 91f9ef3c..6dab42b2 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_threadblock.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py
index db1ec4c7..8697ca88 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_turing_and_volta.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py
index 44f38765..64b0205b 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/gen_verify.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh b/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh
index 19d19ea9..aba1952b 100755
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/generate.sh
@@ -2,7 +2,7 @@
 
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py
index d9891404..5b56f70f 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/helper.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py b/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py
index bbcd050f..b9e79efc 100644
--- a/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py
+++ b/examples/44_multi_gemm_ir_and_codegen/ir_gen/replace_fix_impl_header.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h b/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h
index 10b49049..cc34a412 100644
--- a/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h
+++ b/examples/44_multi_gemm_ir_and_codegen/leaky_bias.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/44_multi_gemm_ir_and_codegen/utils.h b/examples/44_multi_gemm_ir_and_codegen/utils.h
index 2b05ae93..a825033f 100644
--- a/examples/44_multi_gemm_ir_and_codegen/utils.h
+++ b/examples/44_multi_gemm_ir_and_codegen/utils.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/CMakeLists.txt b/examples/45_dual_gemm/CMakeLists.txt
index de704ed2..1c445bf3 100644
--- a/examples/45_dual_gemm/CMakeLists.txt
+++ b/examples/45_dual_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/device/dual_gemm.h b/examples/45_dual_gemm/device/dual_gemm.h
index f4807359..c6b6f30d 100644
--- a/examples/45_dual_gemm/device/dual_gemm.h
+++ b/examples/45_dual_gemm/device/dual_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/dual_gemm.cu b/examples/45_dual_gemm/dual_gemm.cu
index 8043adde..540618bc 100644
--- a/examples/45_dual_gemm/dual_gemm.cu
+++ b/examples/45_dual_gemm/dual_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/dual_gemm_common.h b/examples/45_dual_gemm/dual_gemm_common.h
index 41f5cfea..12ebe05e 100644
--- a/examples/45_dual_gemm/dual_gemm_common.h
+++ b/examples/45_dual_gemm/dual_gemm_common.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/dual_gemm_run.h b/examples/45_dual_gemm/dual_gemm_run.h
index b53ee806..d042f93c 100644
--- a/examples/45_dual_gemm/dual_gemm_run.h
+++ b/examples/45_dual_gemm/dual_gemm_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/kernel/dual_gemm.h b/examples/45_dual_gemm/kernel/dual_gemm.h
index 417f6ff2..2ff5b5e2 100644
--- a/examples/45_dual_gemm/kernel/dual_gemm.h
+++ b/examples/45_dual_gemm/kernel/dual_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/test_run.h b/examples/45_dual_gemm/test_run.h
index 4a58a3a1..c92af193 100644
--- a/examples/45_dual_gemm/test_run.h
+++ b/examples/45_dual_gemm/test_run.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/thread/left_silu_and_mul.h b/examples/45_dual_gemm/thread/left_silu_and_mul.h
index 47043267..003b6f6c 100644
--- a/examples/45_dual_gemm/thread/left_silu_and_mul.h
+++ b/examples/45_dual_gemm/thread/left_silu_and_mul.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/threadblock/dual_epilogue.h b/examples/45_dual_gemm/threadblock/dual_epilogue.h
index 3ef1c6d3..a234b200 100644
--- a/examples/45_dual_gemm/threadblock/dual_epilogue.h
+++ b/examples/45_dual_gemm/threadblock/dual_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/threadblock/dual_mma_base.h b/examples/45_dual_gemm/threadblock/dual_mma_base.h
index 3a25da9c..75471903 100644
--- a/examples/45_dual_gemm/threadblock/dual_mma_base.h
+++ b/examples/45_dual_gemm/threadblock/dual_mma_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/45_dual_gemm/threadblock/dual_mma_multistage.h b/examples/45_dual_gemm/threadblock/dual_mma_multistage.h
index 485922ef..5109b410 100644
--- a/examples/45_dual_gemm/threadblock/dual_mma_multistage.h
+++ b/examples/45_dual_gemm/threadblock/dual_mma_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt b/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt
index 9a9e74c1..6037be08 100644
--- a/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt
+++ b/examples/46_depthwise_simt_conv2dfprop/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
index cc7d2f10..23b30285 100644
--- a/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
+++ b/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt b/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt
index 00be87ed..188e38fe 100644
--- a/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt
+++ b/examples/47_ampere_gemm_universal_streamk/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
index 76bd0979..1f4f4312 100644
--- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
+++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
index ed65e58c..1707f082 100644
--- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
+++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
index 164c785e..7ed4593c 100644
--- a/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
+++ b/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -108,7 +108,7 @@ using ElementAccumulator  = float;                                          // E
 using ArchTag             = cutlass::arch::Sm90;                            // Tag indicating the minimum SM that supports the intended feature
 using OperatorClass       = cutlass::arch::OpClassTensorOp;                 // Operator class tag
 using TileShape           = Shape<_128,_128,_32>;                           // Threadblock-level tile size
-using ClusterShape        = Shape<_1,_2,_1>;                                // Shape of the threadblocks in a cluster
+using ClusterShape        = Shape<_4,_2,_1>;                                // Shape of the threadblocks in a cluster
 using StageCountType = cutlass::gemm::collective::StageCountAuto;           // Stage count maximized based on the tile size
 using KernelSchedule = cutlass::gemm::collective::KernelScheduleAuto;       // Kernel to launch based on the default setting in the Collective Builder
 
@@ -341,11 +341,17 @@ void initialize(const Options &options) {
 /// Populates a Gemm::Arguments structure from the given commandline options
 typename Gemm::Arguments args_from_options(const Options &options)
 {
+  // Change device_id to another value if you are running on a machine with multiple GPUs and wish
+  // to use a GPU other than that with device ID 0.
+  int device_id = 0;
+  cutlass::KernelHardwareInfo kernel_hw_info = cutlass::KernelHardwareInfo::make_kernel_hardware_info<Gemm::GemmKernel>(device_id);
+
   typename Gemm::Arguments arguments{
     cutlass::gemm::GemmUniversalMode::kGemm,
     {options.m, options.n, options.k},
     {block_A.get(), stride_A, block_B.get(), stride_B},
-    {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}
+    {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+    kernel_hw_info
   };
 
   arguments.scheduler.raster_order = options.raster;
diff --git a/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt
index 903da1ea..3aa37768 100644
--- a/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt
+++ b/examples/48_hopper_warp_specialized_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu
index 1e820ddb..5852cd8d 100644
--- a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu
+++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt b/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt
index 4925105d..7cf782de 100644
--- a/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt
+++ b/examples/49_hopper_gemm_with_collective_builder/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu
index a736e5ce..69a3c030 100644
--- a/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu
+++ b/examples/50_hopper_gemm_with_epilogue_swizzle/50_hopper_gemm_with_epilogue_swizzle.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt
index 5498d4ef..6bf1cb49 100644
--- a/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt
+++ b/examples/50_hopper_gemm_with_epilogue_swizzle/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/51_hopper_gett/51_hopper_gett.cu b/examples/51_hopper_gett/51_hopper_gett.cu
index 005eaec5..f2eb5c0b 100644
--- a/examples/51_hopper_gett/51_hopper_gett.cu
+++ b/examples/51_hopper_gett/51_hopper_gett.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/51_hopper_gett/CMakeLists.txt b/examples/51_hopper_gett/CMakeLists.txt
index f18dff38..4da575a9 100644
--- a/examples/51_hopper_gett/CMakeLists.txt
+++ b/examples/51_hopper_gett/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/51_hopper_gett/gett_kernel.cuh b/examples/51_hopper_gett/gett_kernel.cuh
index 6a775d13..609168c3 100644
--- a/examples/51_hopper_gett/gett_kernel.cuh
+++ b/examples/51_hopper_gett/gett_kernel.cuh
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -90,6 +90,7 @@ gett_kernel(
   // No changes are required to the default epilogue
   using CollectiveEpilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
     cutlass::epilogue::collective::DefaultEpilogue<
+      ElementC,
       StrideC,
       StrideD,
       EpilogueThreadOp,
diff --git a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
index 0a74e02a..8a198420 100644
--- a/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
+++ b/examples/52_hopper_gather_scatter_fusion/52_hopper_gather_scatter_fusion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -253,7 +253,7 @@ struct ExampleRunner
 
   using EpilogueRef = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
     cutlass::epilogue::collective::DefaultEpilogue<
-      StrideC, StrideD,
+      ElementC, StrideC, StrideD,
       typename Epilogue::ThreadEpilogueOp,
       typename Epilogue::EpilogueSchedule
     >
diff --git a/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt b/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt
index bf675370..a3ae8fd5 100644
--- a/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt
+++ b/examples/52_hopper_gather_scatter_fusion/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp
index c71109aa..959e1e63 100644
--- a/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp
+++ b/examples/52_hopper_gather_scatter_fusion/gather_gemm.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh
index 592bf57e..b4cafccb 100644
--- a/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh
+++ b/examples/52_hopper_gather_scatter_fusion/gather_kernel.cuh
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp b/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp
index dc9c0df8..4292e018 100644
--- a/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp
+++ b/examples/52_hopper_gather_scatter_fusion/scatter_epilogue.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
index d24c5f29..d8096d9e 100644
--- a/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
+++ b/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/53_hopper_gemm_permute/CMakeLists.txt b/examples/53_hopper_gemm_permute/CMakeLists.txt
index dc70d95f..38c6382f 100644
--- a/examples/53_hopper_gemm_permute/CMakeLists.txt
+++ b/examples/53_hopper_gemm_permute/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/53_hopper_gemm_permute/permute_kernel.cuh b/examples/53_hopper_gemm_permute/permute_kernel.cuh
index 8abe7010..0cb1aad9 100644
--- a/examples/53_hopper_gemm_permute/permute_kernel.cuh
+++ b/examples/53_hopper_gemm_permute/permute_kernel.cuh
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/53_hopper_gemm_permute/permute_traits.hpp b/examples/53_hopper_gemm_permute/permute_traits.hpp
index 4c5bacca..e7a45d72 100644
--- a/examples/53_hopper_gemm_permute/permute_traits.hpp
+++ b/examples/53_hopper_gemm_permute/permute_traits.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
index efe35d7c..a6f33d1c 100644
--- a/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
+++ b/examples/54_hopper_fp8_warp_specialized_gemm/54_hopper_fp8_warp_specialized_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt b/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt
index 209b2779..2f6af90b 100644
--- a/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt
+++ b/examples/54_hopper_fp8_warp_specialized_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp b/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp
index 96d8794d..e8ea5330 100644
--- a/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp
+++ b/examples/54_hopper_fp8_warp_specialized_gemm/hopper_fp8_commandline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu
index ab82b40c..8bca0a35 100644
--- a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu
+++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_bf16_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
index 40fa6894..7bc65f9b 100644
--- a/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
+++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu b/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu
index b482d0d1..cf64f37b 100644
--- a/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu
+++ b/examples/55_hopper_mixed_dtype_gemm/55_hopper_mixed_dtype_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt b/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt
index 23dca4f3..24e7d890 100644
--- a/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt
+++ b/examples/55_hopper_mixed_dtype_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/55_hopper_mixed_dtype_gemm/README.md b/examples/55_hopper_mixed_dtype_gemm/README.md
index ecb4f41c..48eca35c 100644
--- a/examples/55_hopper_mixed_dtype_gemm/README.md
+++ b/examples/55_hopper_mixed_dtype_gemm/README.md
@@ -9,17 +9,12 @@ This first version only supports mixed type GEMMs using TMA.
 
 ## Performance
 
-While the example offers a harness for straightforward benchmarking, this initial implementation isn't optimized for performance in the majority of scenarios. We expect this implementation to be performant for `{fp16, bf16} x {int8, int4, int2}` and `{fp8} x {int4}` for problems that are compute bound. Additionally, we expect good performance for `fp16`, `bf16` or `fp32` scales and zero-points. For best performance, it is ideal to have the scales and zero-points be the same type as mma's type.
+While the example offers a harness for straightforward benchmarking, this initial implementation isn't optimized for performance in the majority of scenarios. We expect this implementation to be performant for `{fp16, bf16} x {int8, int4}` and `{fp8} x {int4}` for problems that are compute bound. Additionally, we expect good performance for `fp16`, `bf16`, or `fp32` scales and zero-points. For best performance, it is ideal to have the scales and zero-points be the same type.
 
 The scale only mode for `fp8 x int4` is significantly slower than direct conversion mode. There is a lookup-table workaround targeting this mode, as shown in `55_hopper_int4_fp8_gemm.cu`. To use this feature, use `cutlass::Array<ElementScale, 8>` as the scale type in the collective builder. However, it requires modifications to the encoding of quantized weights and scale factors. Also, scale with zero point mode is not supported for now.
 
-
-Additionally, it's recommended to reorder the narrow data type tensor such that elements read into register file by the same thread are contiguous in global and shared memory. The user can use the helper function `compute_memory_reordering_atom` and `reorder_tensor` to achieve this. See `55_hopper_int4_fp8_gemm.cu` and `55_hopper_int4_bf16_gemm.cu` for more details.
-
-
 We are currently optimizing the following cases:
 1. Memory bound cases for all types
-2. `fp8 x {int2, uint2}` case
 
 ## Limitations
 
diff --git a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
index 55de3fab..56382d83 100644
--- a/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
+++ b/examples/55_hopper_mixed_dtype_gemm/mixed_dtype_utils.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -151,16 +151,16 @@ void mixed_dtype_profiling(
   runtimes.reserve(options.iterations);
 
   for (int iter = 0; iter < options.warmup + options.iterations; ++iter) {
-    cudaEventRecord(start);
-    CUTLASS_CHECK(gemm.run());
-    cudaEventRecord(stop);
-    cudaEventSynchronize(stop);
+      cudaEventRecord(start);
+      CUTLASS_CHECK(gemm.run());
+      cudaEventRecord(stop);
+      cudaEventSynchronize(stop);
 
-    if (iter >= options.warmup) {
-        float milliseconds = 0;
-        cudaEventElapsedTime(&milliseconds, start, stop);
-        runtimes.push_back(milliseconds);
-    }
+      if (iter >= options.warmup) {
+          float milliseconds = 0;
+          cudaEventElapsedTime(&milliseconds, start, stop);
+          runtimes.push_back(milliseconds);
+      }
   }
 
   cudaEventDestroy(start);
diff --git a/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp b/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp
index bd71e9cf..98d6df55 100644
--- a/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp
+++ b/examples/55_hopper_mixed_dtype_gemm/packed_scale.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -33,9 +33,6 @@
 
 #include <cstdint>
 
-
-#include "cutlass/util/device_memory.h"
-#include "cutlass/integer_subbyte.h"
 #include "cutlass/float8.h"
 #include "cutlass/util/reference/device/tensor_fill.h"
 
@@ -200,6 +197,7 @@ bool initialize_packed_scale(
   {
     cutlass::packed_scale_t<ElementScale> tmp(data_in[i]);
     data_out[i] = reinterpret_cast<cutlass::Array<ElementScale, 8> const&>(tmp);
+    // NOTE(review): the cast above assumes packed_scale_t<ElementScale> is layout-compatible with Array<ElementScale, 8> - confirm.
   }
   try {
     block_out.copy_from_host(data_out.data());
diff --git a/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp b/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp
index de5a3d3f..0f4e38d6 100644
--- a/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp
+++ b/examples/55_hopper_mixed_dtype_gemm/reorder_utils.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
index 51ce970d..886d39a2 100644
--- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
+++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt b/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
index 1f59ceb8..bd1f8897 100644
--- a/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
+++ b/examples/56_hopper_ptr_array_batched_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
index 7b20a335..3aeafb4d 100644
--- a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
+++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -550,11 +550,10 @@ void initialize(const Options &options) {
 template <typename GemmT>
 typename GemmT::Arguments args_from_options(const Options &options, bool host_problem_shapes_available = true)
 {
-  cutlass::KernelHardwareInfo hw_info;
   // Change device_id to another value if you are running on a machine with multiple GPUs and wish
   // to use a GPU other than that with device ID 0.
-  hw_info.device_id = 0;
-  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+  int device_id = 0;  // forwarded below together with GemmT's own kernel type (not the file-level Gemm)
+  cutlass::KernelHardwareInfo kernel_hw_info = cutlass::KernelHardwareInfo::make_kernel_hardware_info<typename GemmT::GemmKernel>(device_id);
 
   typename GemmT::Arguments arguments;
   decltype(arguments.epilogue.thread) fusion_args;
@@ -590,7 +589,7 @@ typename GemmT::Arguments args_from_options(const Options &options, bool host_pr
       {options.groups, problem_sizes.get(), options.problem_sizes_host.data()},
       {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()},
       {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()},
-      hw_info
+      kernel_hw_info
     };
   }
   else {
@@ -599,7 +598,7 @@ typename GemmT::Arguments args_from_options(const Options &options, bool host_pr
       {options.groups, problem_sizes.get(), nullptr},
       {ptr_A.get(), stride_A.get(), ptr_B.get(), stride_B.get()},
       {fusion_args, ptr_C.get(), stride_C.get(), ptr_D.get(), stride_D.get()},
-      hw_info
+      kernel_hw_info
     };
   }
 
diff --git a/examples/57_hopper_grouped_gemm/CMakeLists.txt b/examples/57_hopper_grouped_gemm/CMakeLists.txt
index 1dadbfa8..e2417734 100644
--- a/examples/57_hopper_grouped_gemm/CMakeLists.txt
+++ b/examples/57_hopper_grouped_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/58_ada_fp8_gemm/CMakeLists.txt b/examples/58_ada_fp8_gemm/CMakeLists.txt
index 2af32542..3eba010b 100644
--- a/examples/58_ada_fp8_gemm/CMakeLists.txt
+++ b/examples/58_ada_fp8_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
index 79bead36..cdf94c01 100644
--- a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
+++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/59_ampere_gather_scatter_conv/CMakeLists.txt b/examples/59_ampere_gather_scatter_conv/CMakeLists.txt
index ce22cd1f..d9d3c65a 100644
--- a/examples/59_ampere_gather_scatter_conv/CMakeLists.txt
+++ b/examples/59_ampere_gather_scatter_conv/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h b/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h
index cc00cced..d3d0982d 100644
--- a/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h
+++ b/examples/59_ampere_gather_scatter_conv/ampere_conv_kernel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu b/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu
index 341d1e9f..ee1d658a 100644
--- a/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu
+++ b/examples/59_ampere_gather_scatter_conv/ampere_gather_scatter_conv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/60_cutlass_import/CMakeLists.txt b/examples/60_cutlass_import/CMakeLists.txt
index 974bf410..27ade3cd 100644
--- a/examples/60_cutlass_import/CMakeLists.txt
+++ b/examples/60_cutlass_import/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/60_cutlass_import/main.cpp b/examples/60_cutlass_import/main.cpp
index f17f5458..fb2b7967 100644
--- a/examples/60_cutlass_import/main.cpp
+++ b/examples/60_cutlass_import/main.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
index 8bb14b45..a71a63eb 100644
--- a/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
+++ b/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt b/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt
index 7d9160a7..90eac71e 100644
--- a/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt
+++ b/examples/61_hopper_gemm_with_topk_and_softmax/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
index c3f1ce70..01a10046 100644
--- a/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
+++ b/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -92,15 +92,15 @@ using ElementAccumulator  = float;                                          // E
 using TileShape           = Shape<_128,_128,_128>;                          // Threadblock-level tile size for sparse kernel
 using TileShapeRef        = Shape<_128,_128, _64>;                          // Threadblock-level tile size for reference (dense) kernel
 using ClusterShape        = Shape<_1,_2,_1>;                                // Shape of the threadblocks in a cluster
-using KernelSchedule      = cutlass::gemm::KernelTmaWarpSpecialized;        // Kernel schedule policy
-using EpilogueSchedule    = cutlass::epilogue::TmaWarpSpecialized;          // Epilogue schedule policy
+using KernelSchedule      = cutlass::gemm::collective::KernelScheduleAuto;        // Kernel schedule policy
+using EpilogueSchedule    = cutlass::epilogue::collective::EpilogueScheduleAuto;  // Epilogue schedule policy
 
 using ProblemShape = Shape<int,int,int,int>;
 
 // Sparse kernel setup
 
 using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
-    cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
     TileShape, ClusterShape,
     cutlass::epilogue::collective::EpilogueTileAuto,
     ElementAccumulator, ElementAccumulator,
@@ -224,18 +224,12 @@ cutlass::DeviceAllocation<typename Gemm::EpilogueOutputOp::ElementOutput> block_
 // Command line options parsing
 struct Options {
 
-  bool help;
+  bool help = false;
 
-  float alpha, beta;
-  int iterations;
-  int m, n, k, l;
-
-  Options():
-    help(false),
-    m(5120), n(4096), k(16384), l(1),
-    alpha(1.f), beta(0.f),
-    iterations(10)
-  { }
+  float alpha = 1.f, beta = 0.f;
+  int iterations = 100;
+  int warmup = 100;
+  int m = 5120, n = 4096, k = 16384, l = 1;
 
   // Parses the command line
   void parse(int argc, char const **args) {
@@ -253,26 +247,32 @@ struct Options {
     cmd.get_cmd_line_argument("alpha", alpha);
     cmd.get_cmd_line_argument("beta", beta);
     cmd.get_cmd_line_argument("iterations", iterations);
+    cmd.get_cmd_line_argument("warmup", warmup);
   }
 
   /// Prints the usage statement.
   std::ostream & print_usage(std::ostream &out) const {
 
-    out << "62_hopper_sparse_gemm\n\n"
-      << "  Hopper Sparse GEMM example.\n\n"
-      << "Options:\n\n"
-      << "  --help                      If specified, displays this usage statement\n\n"
-      << "  --m=<int>                   Sets the M extent of the GEMM\n"
-      << "  --n=<int>                   Sets the N extent of the GEMM\n"
-      << "  --k=<int>                   Sets the K extent of the GEMM\n"
-      << "  --l=<int>                   Sets the L extent of the GEMM (batch size)\n"
-      << "  --alpha=<f32>               Epilogue scalar alpha\n"
-      << "  --beta=<f32>                Epilogue scalar beta\n\n"
-      << "  --iterations=<int>          Number of profiling iterations to perform.\n\n";
-
-    out
-      << "\n\nExamples:\n\n"
-      << "$ " << "62_hopper_sparse_gemm" << " --m=4096 --n=5120 --k=8192 --l=1 --alpha=2 --beta=0.707 \n\n";
+    out << "62_hopper_sparse_gemm\n"
+           "\n"
+           "  Hopper Sparse GEMM example.\n"
+           "\n"
+           "Options:\n"
+           "\n"
+           "  --help                      If specified, displays this usage statement\n\n"
+           "  --m=<int>                   Sets the M extent of the GEMM\n"
+           "  --n=<int>                   Sets the N extent of the GEMM\n"
+           "  --k=<int>                   Sets the K extent of the GEMM\n"
+           "  --l=<int>                   Sets the L extent of the GEMM (batch size)\n"
+           "  --alpha=<f32>               Epilogue scalar alpha\n"
+           "  --beta=<f32>                Epilogue scalar beta\n"
+           "  --iterations=<int>          Number of profiling iterations to perform.\n"
+           "  --warmup=<int>              Number of warmup iterations to perform.\n"
+           "\n"
+           "Examples:\n"
+           "\n"
+           "62_hopper_sparse_gemm --m=4096 --n=5120 --k=8192 --l=1 --alpha=2 --beta=0.707\n"
+           "\n";
 
     return out;
   }
@@ -442,7 +442,6 @@ void print_device_tensor(cute::Tensor<Engine, Layout> const& t)
 }
 
 bool verify(Options const& options) {
-  CUDA_CHECK(cudaDeviceSynchronize());
 
   bool passed = cutlass::reference::device::BlockCompareEqual(block_D_ref.get(), block_D.get(), block_D.size());
 
@@ -492,6 +491,10 @@ struct Runner
   void benchmark(Options const& options) {
     if (options.iterations > 0)
     {
+      for (int iter = 0; iter < options.warmup; ++iter) {
+        run();
+      }
+
       GpuTimer timer;
       timer.start();
       for (int iter = 0; iter < options.iterations; ++iter) {
@@ -525,12 +528,16 @@ void run(Options &options) {
   Runner<Gemm> gemm(make_args(options));
   Runner<GemmRef> gemm_ref(make_args_ref(options));
 
+  std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+
   gemm.run();
+  CUDA_CHECK(cudaDeviceSynchronize());
+
   gemm_ref.run();
+  CUDA_CHECK(cudaDeviceSynchronize());
 
   bool passed = verify(options);
 
-  std::cout << "  Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << std::endl;
   std::cout << "  Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
 
   if (!passed) {
diff --git a/examples/62_hopper_sparse_gemm/CMakeLists.txt b/examples/62_hopper_sparse_gemm/CMakeLists.txt
index cf55da45..0bc87f49 100644
--- a/examples/62_hopper_sparse_gemm/CMakeLists.txt
+++ b/examples/62_hopper_sparse_gemm/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
index 03c54a8e..d1db304b 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
+++ b/examples/63_hopper_gemm_with_weight_prefetch/63_hopper_gemm_with_weight_prefetch.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
index f4867324..c9f638e6 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
+++ b/examples/63_hopper_gemm_with_weight_prefetch/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp
index bfb64820..09bb9b65 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/builder.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp
index 37369176..8bf0bf37 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/dispatch_policy_extra.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp
index 9bcb1f5a..a8b220d1 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/collective/sm90_mma_tma_gmma_ss_warpspecialized_with_prefetch.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp b/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp
index 6be87768..8c85edb7 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/gemm_with_weight_prefetch_commandline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
index 6e33d8fc..0c54bc05 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/kernel/sm90_gemm_tma_warpspecialized_with_prefetch.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp b/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp
index 7abd39cc..157d201a 100644
--- a/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp
+++ b/examples/63_hopper_gemm_with_weight_prefetch/pipeline/prefetch_pipeline_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt b/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt
index 18320259..5fe68056 100644
--- a/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt
+++ b/examples/64_ada_fp8_gemm_grouped/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
index 8e3dbbb0..7763bc10 100644
--- a/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
+++ b/examples/64_ada_fp8_gemm_grouped/ada_fp8_gemm_grouped.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/65_distributed_gemm/65_distributed_gemm.cu b/examples/65_distributed_gemm/65_distributed_gemm.cu
new file mode 100644
index 00000000..f0b59ca3
--- /dev/null
+++ b/examples/65_distributed_gemm/65_distributed_gemm.cu
@@ -0,0 +1,864 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Distributed GEMM (DistGEMM) for Hopper
+
+    This example runs Tensor Parallel GEMMs using the (experimental) Distributed GEMM API in 
+    CUTLASS. For more information, please refer to README.md.
+
+    Note that Distributed GEMM assumes an any-to-any NVLink network topology.
+    To check whether your device is compatible, run:
+
+      $ nvidia-smi topo -m
+
+    and make sure there's an any-to-any NVLink topology. It would look like this:
+
+                GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
+        GPU0     X      NV18    NV18    NV18    NV18    NV18    NV18    NV18
+        GPU1    NV18     X      NV18    NV18    NV18    NV18    NV18    NV18
+        GPU2    NV18    NV18     X      NV18    NV18    NV18    NV18    NV18
+        GPU3    NV18    NV18    NV18     X      NV18    NV18    NV18    NV18
+        GPU4    NV18    NV18    NV18    NV18     X      NV18    NV18    NV18
+        GPU5    NV18    NV18    NV18    NV18    NV18     X      NV18    NV18
+        GPU6    NV18    NV18    NV18    NV18    NV18    NV18     X      NV18
+        GPU7    NV18    NV18    NV18    NV18    NV18    NV18    NV18     X
+
+    You should additionally check whether the driver enables peer-to-peer access:
+
+      $ nvidia-smi topo -p2p r
+
+    Output should be something like this:
+
+               GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
+        GPU0   X       OK      OK      OK      OK      OK      OK      OK
+        GPU1   OK      X       OK      OK      OK      OK      OK      OK
+        GPU2   OK      OK      X       OK      OK      OK      OK      OK
+        GPU3   OK      OK      OK      X       OK      OK      OK      OK
+        GPU4   OK      OK      OK      OK      X       OK      OK      OK
+        GPU5   OK      OK      OK      OK      OK      X       OK      OK
+        GPU6   OK      OK      OK      OK      OK      OK      X       OK
+        GPU7   OK      OK      OK      OK      OK      OK      OK      X
+
+    It is recommended to build this target with the following flag to enable 
+    Grid Dependency Control instructions (GDC) in CUTLASS:
+      - CUTLASS_ENABLE_GDC_FOR_SM90
+
+    Example:
+
+      $ mkdir build && cd build
+
+      $ cmake .. -DCUTLASS_NVCC_ARCHS="90a" -DCUTLASS_ENABLE_GDC_FOR_SM90=1
+
+      $ cd examples/65_distributed_gemm
+
+      $ make
+
+      $ ./65_distributed_gemm
+*/
+
+#include <iostream>
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/gemm/collective/collective_builder.hpp"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/kernel/gemm_universal.hpp"
+
+#include "cutlass/epilogue/dispatch_policy.hpp"
+#include "cutlass/epilogue/collective/collective_builder.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/distribution.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/tensor_view_io.h"
+#include "cutlass/util/reference/host/error_metrics.h"
+#include "cutlass/util/reference/device/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_norm.h"
+
+// Distributed GEMM headers
+#include "cutlass/experimental/distributed/device/dist_gemm_universal_wrapper.hpp"
+#include "cutlass/experimental/distributed/kernel/dist_gemm_kernel_wrapper.hpp"
+#include "cutlass/experimental/distributed/schedules/dist_gemm_1d_schedules.hpp"
+
+#include "helper.h"
+
+// Distributed GEMM helpers
+#include "util/benchmark.h"
+#include "util/device_copy.h"
+
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Distributed GEMM configuration
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Tensor-parallel (TP) size = number of processors/GPUs the GEMM is distributed over; fixed at compile time.
+using TP = _8;
+static constexpr int TP_ = TP{};  // Same value as a plain int, used for array extents and loop bounds.
+
+#if (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4))
+
+// Distributed GEMM tiling/sharding schedule
+// Choices:
+//
+// * All Gather + GEMM:
+//   * AllGather1D_TilingCD_RotatingA
+//   * AllGather1D_TilingCD_RotatingB
+//
+// * GEMM + Reduce Scatter:
+//   * ReduceScatter1D_TilingA_RotatingC
+//   * ReduceScatter1D_TilingB_RotatingC
+
+using DistSchedule = cutlass::distributed::schedules::AllGather1D_TilingCD_RotatingA<TP>;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM kernel configurations
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A matrix configuration
+using         ElementA    = cutlass::half_t;                                // Element type for A matrix operand
+using         LayoutA     = cutlass::layout::RowMajor;                      // Layout type for A matrix operand
+constexpr int AlignmentA  = 128 / cutlass::sizeof_bits<ElementA>::value;    // Memory access granularity/alignment of A matrix in units of elements (up to 16 bytes)
+
+// B matrix configuration
+using         ElementB    = cutlass::half_t;                                // Element type for B matrix operand
+using         LayoutB     = cutlass::layout::ColumnMajor;                   // Layout type for B matrix operand
+constexpr int AlignmentB  = 128 / cutlass::sizeof_bits<ElementB>::value;    // Memory access granularity/alignment of B matrix in units of elements (up to 16 bytes)
+
+// C matrix configuration
+using         ElementC    = cutlass::half_t;                                // Element type for C and D matrix operands
+using         LayoutC     = cutlass::layout::ColumnMajor;                   // Layout type for C and D matrix operands
+constexpr int AlignmentC  = 128 / cutlass::sizeof_bits<ElementC>::value;    // Memory access granularity/alignment of C matrix in units of elements (up to 16 bytes)
+
+// D matrix configuration (mirrors C; change these three aliases to give D its own type/layout/alignment)
+using         ElementD    = ElementC;                                       // Element type for D matrix operand (same as C)
+using         LayoutD     = LayoutC;                                        // Layout type for D matrix operand (same as C)
+constexpr int AlignmentD  = AlignmentC;                                     // Alignment of D matrix in units of elements (same as C)
+
+// Core kernel configurations (note: accumulation and epilogue math are both done in half precision here)
+using ElementAccumulator  = cutlass::half_t;                                // Element type for internal accumulation
+using ElementCompute      = cutlass::half_t;                                // Element type for epilogue computation
+using ArchTag             = cutlass::arch::Sm90;                            // Tag indicating the minimum SM that supports the intended feature
+using OperatorClass       = cutlass::arch::OpClassTensorOp;                 // Operator class tag
+using TileShape           = Shape<_128,_256,_64>;                           // Threadblock-level tile size
+using ClusterShape        = Shape<_1,_2,_1>;                                // Shape of the threadblocks in a cluster
+
+using KernelSchedule      = cutlass::gemm::KernelTmaWarpSpecializedPingpong;   // Schedule tag handed to the mainloop CollectiveBuilder
+using EpilogueSchedule    = cutlass::epilogue::TmaWarpSpecialized;             // Schedule tag handed to the epilogue CollectiveBuilder
+using EpilogueTileType    = cutlass::epilogue::collective::EpilogueTileAuto;   // Epilogue tile tag handed to the epilogue CollectiveBuilder
+
+using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<  // Declared first: the mainloop below needs its SharedStorage size
+    ArchTag, OperatorClass,
+    TileShape, ClusterShape,
+    EpilogueTileType,
+    ElementAccumulator, ElementCompute,
+    ElementC, LayoutC, AlignmentC,
+    ElementD, LayoutD, AlignmentD,
+    EpilogueSchedule
+  >::CollectiveOp;
+
+using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+    ArchTag, OperatorClass,
+    ElementA, LayoutA, AlignmentA,
+    ElementB, LayoutB, AlignmentB,
+    ElementAccumulator,
+    TileShape, ClusterShape,
+    cutlass::gemm::collective::StageCountAutoCarveout<
+      static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))  // Carve-out = bytes of epilogue shared storage; presumably the builder sizes the stages from what remains (confirm)
+    >,
+    KernelSchedule
+  >::CollectiveOp;
+
+using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+    Shape<int,int,int,int>, // Indicates ProblemShape: four runtime ints, filled as (m, n, k, l) by the argument builders
+    CollectiveMainloop,
+    CollectiveEpilogue
+>;
+
+// We're going to use the single-device GEMM as reference
+using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+// Instantiate Distributed GEMM kernel: the same GemmKernel combined with the chosen DistSchedule
+using DistGemmKernel = cutlass::distributed::kernel::DistributedGemmKernelWrapper<
+  GemmKernel,
+  DistSchedule
+>;
+using DistGemm = cutlass::distributed::device::DistributedGemmUniversalAdapter<DistGemmKernel>;  // run() creates one instance per device
+
+using StrideA = typename Gemm::GemmKernel::StrideA;  // Stride types are taken directly from the reference kernel
+using StrideB = typename Gemm::GemmKernel::StrideB;
+using StrideC = typename Gemm::GemmKernel::StrideC;
+using StrideD = typename Gemm::GemmKernel::StrideD;
+
+/// Global state shared by the helpers below; the strides of the full-size operands are assigned in initialize()
+StrideA stride_A;
+StrideB stride_B;
+StrideC stride_C;
+StrideD stride_D;
+uint64_t seed;  // Base RNG seed; initialize() adds a per-tensor offset. Not assigned in the code visible here, so it keeps its zero-initialized value unless set elsewhere.
+
+using HostTensorA = typename cutlass::HostTensor<ElementA, LayoutA>;
+using HostTensorB = typename cutlass::HostTensor<ElementB, LayoutB>;
+using HostTensorC = typename cutlass::HostTensor<ElementC, LayoutC>;
+using HostTensorD = typename cutlass::HostTensor<ElementD, LayoutD>;
+
+// Reference GEMM tensors (full-size operands; tensor_D receives the gathered DistGEMM output, tensor_ref_D the reference output)
+HostTensorA tensor_A;
+HostTensorB tensor_B;
+HostTensorC tensor_C;
+HostTensorD tensor_D;
+HostTensorD tensor_ref_D;
+
+// DistGEMM tensors (multi-device): one local buffer per device; C buffers use the C tensor type so they track ElementC/LayoutC
+HostTensorA tensor_A_arr[TP_];
+HostTensorB tensor_B_arr[TP_];
+HostTensorC tensor_C_arr[TP_];
+HostTensorD tensor_D_arr[TP_];
+
+#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4))
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Testbed utility types
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Command-line configuration of the example: problem extents, epilogue scalars, timing and tolerance.
+struct Options {
+
+  bool help = false;
+
+  float alpha = 1.f, beta = 0.f;
+  int iterations = 100;
+  int warmup_iterations = 10;
+  int m = 16384, n = 106496, k = 16384, l = 1;
+  float eps = 0.f;
+
+  /// Reads the supported flags from the command line; `--help` short-circuits everything else.
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd_line(argc, args);
+
+    if (cmd_line.check_cmd_line_flag("help")) {
+      help = true;
+      return;
+    }
+
+    // Problem extents
+    cmd_line.get_cmd_line_argument("m", m);
+    cmd_line.get_cmd_line_argument("n", n);
+    cmd_line.get_cmd_line_argument("k", k);
+    cmd_line.get_cmd_line_argument("l", l);
+    // Epilogue scalars
+    cmd_line.get_cmd_line_argument("alpha", alpha);
+    cmd_line.get_cmd_line_argument("beta", beta);
+    // Profiling and verification controls
+    cmd_line.get_cmd_line_argument("iterations", iterations);
+    cmd_line.get_cmd_line_argument("warmup-iterations", warmup_iterations);
+    cmd_line.get_cmd_line_argument("eps", eps);
+  }
+
+  /// Writes the help text to `out` and hands the same stream back.
+  std::ostream & print_usage(std::ostream &out) const {
+    out << "65_distributed_gemm\n\n"
+      << "  Hopper Distributed GEMM (DistGEMM). \n"
+      << "  For more details please refer to the source file.\n\n"
+      << "Options:\n\n"
+      << "  --help                      If specified, displays this usage statement\n\n"
+      << "  --m=<int>                   Sets the M extent of the GEMM\n"
+      << "  --n=<int>                   Sets the N extent of the GEMM\n"
+      << "  --k=<int>                   Sets the K extent of the GEMM\n"
+      << "  --l=<int>                   Sets the L extent (batch) of the GEMM (default: 1)\n"
+      << "  --alpha=<f32>               Epilogue scalar alpha (default: 1.0)\n"
+      << "  --beta=<f32>                Epilogue scalar beta (default: 0.0)\n"
+      << "  --iterations=<int>          Number of profiling iterations to perform (default: 100)\n"
+      << "  --warmup-iterations=<int>   Number of warmup iterations prior to profiling (default: 10)\n"
+      << "  --eps=<f32>                 Threshold for error compared to reference "
+      << "GEMM (default: 0.0)\n\n"
+      << "\n\nExamples:\n\n"
+      << "$ " << "65_distributed_gemm" << " --m=16384 --n=106496 --k=16384 \n\n";
+
+    return out;
+  }
+
+  /// Throughput in TFLOP/s of one device, given the runtime of one GEMM in seconds.
+  double tflops(double runtime_s) const {
+
+    // Two flops per multiply-add over M*N*K*L, divided (integer division) among the TP_ devices
+    uint64_t const flops_per_device = uint64_t(2) * m * n * k * l / TP_;
+    double const tflop_per_device = double(flops_per_device) / double(1.0e12);
+    return tflop_per_device / runtime_s;
+  }
+};
+
+/// Outcome of one profiled run: timing, throughput, library/runtime status and the verification verdict.
+struct Result {
+  double avg_runtime_ms;
+  double tflops;
+  cutlass::Status status;
+  cudaError_t error;
+  bool passed;             // The constructor always starts this out as false
+
+  /// Every argument is optional; the defaults describe a zero-time run with success statuses.
+  Result(double avg_runtime_ms = 0, double tflops = 0,
+         cutlass::Status status = cutlass::Status::kSuccess, cudaError_t error = cudaSuccess)
+    : avg_runtime_ms(avg_runtime_ms),
+      tflops(tflops),
+      status(status),
+      error(error),
+      passed(false) {}
+
+};
+
+#if (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4))
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// GEMM setup and evaluation
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Fills `view` with uniformly distributed random values drawn with `seed`, and always returns true.
+/// The value range depends on the element width: [0, 2] for 1-bit types, [-2, 2] for types of up
+/// to 16 bits, and [-8, 8] otherwise. With `is_device_tensor` set, the device-side fill is used and
+/// the function waits for the device, exiting through CUDA_CHECK if the device reports an error.
+template <typename Element, typename Layout>
+bool initialize_tensor(
+  cutlass::TensorView<Element, Layout> view,
+  uint64_t seed,
+  bool is_device_tensor = false) {
+
+  int bits = cutlass::sizeof_bits<Element>::value;
+  // Start from the widest range; narrower element types tighten it below
+  double scope_max = 8;
+  double scope_min = -8;
+  if (bits == 1) {
+    scope_max = 2;
+    scope_min = 0;
+  }
+  else if (bits <= 16) {
+    scope_max = 2;
+    scope_min = -2;
+  }
+
+  if (is_device_tensor) {
+    using Real = typename cutlass::RealType<Element>::Type;
+    cutlass::reference::device::TensorFillRandomUniform(
+      view, seed, static_cast<Real>(scope_max), static_cast<Real>(scope_min), 0);
+    CUDA_CHECK(cudaDeviceSynchronize());  // Previously unchecked: report a failed fill here instead of at some later, unrelated call
+  } else {
+    cutlass::reference::host::TensorFillRandomUniform(
+      view, seed, scope_max, scope_min, 0);
+  }
+
+  return true;
+}
+
+/// Sizes and randomly fills the reference operands, then enables peer access among devices 0..TP_-1 and sizes each device's local DistGEMM buffers.
+void initialize(const Options &options) {
+  auto problem_shape = cute::make_tuple(options.m, options.n, options.k, options.l);  // (M, N, K, L)
+
+  // Setup (reference) GEMM tensors: A is (M, K, L), B is (N, K, L), C and D are (M, N, L)
+  auto shape_A = cute::select<0,2,3>(problem_shape);
+  auto shape_B = cute::select<1,2,3>(problem_shape);
+  auto shape_C = cute::select<0,1,3>(problem_shape);
+  auto shape_D = cute::select<0,1,3>(problem_shape);
+
+  stride_A = cutlass::make_cute_packed_stride(StrideA{}, shape_A);  // The global strides are reused by the argument builders and gather_results()
+  stride_B = cutlass::make_cute_packed_stride(StrideB{}, shape_B);
+  stride_C = cutlass::make_cute_packed_stride(StrideC{}, shape_C);
+  stride_D = cutlass::make_cute_packed_stride(StrideD{}, shape_D);
+
+  auto a_coord = cutlass::make_Coord(size(shape_A), 1);  // Each tensor is sized as a flat (total element count, 1) extent
+  auto b_coord = cutlass::make_Coord(size(shape_B), 1);
+  auto c_coord = cutlass::make_Coord(size(shape_C), 1);
+
+  tensor_A.resize(a_coord);
+  tensor_B.resize(b_coord);
+  tensor_C.resize(c_coord);
+  tensor_D.resize(c_coord);      // D and the reference D share C's extent
+  tensor_ref_D.resize(c_coord);
+
+  initialize_tensor(tensor_A.device_view(), seed + 2022, /* is_device_tensor = */ true);  // Distinct seed offset per operand
+  initialize_tensor(tensor_B.device_view(), seed + 2023, /* is_device_tensor = */ true);
+  initialize_tensor(tensor_C.device_view(), seed + 2024, /* is_device_tensor = */ true);
+
+  tensor_A.sync_host();  // NOTE(review): presumably mirrors the device contents into the host copy; confirm
+  tensor_B.sync_host();
+  tensor_C.sync_host();
+  tensor_D.sync_host();      // tensor_D and tensor_ref_D have not been filled at this point
+  tensor_ref_D.sync_host();
+
+  // Set up DistGEMM tensors: per-device local shapes come from the distribution schedule
+  auto local_shape_A = DistSchedule::get_local_a_shape(problem_shape);
+  auto local_shape_B = DistSchedule::get_local_b_shape(problem_shape);
+  auto local_shape_C = DistSchedule::get_local_c_shape(problem_shape);
+  auto local_shape_D = DistSchedule::get_local_d_shape(problem_shape);  // NOTE(review): unused in this function; D buffers are sized with c_coord_device
+
+  auto a_coord_device = cutlass::make_Coord(size(local_shape_A), 1);
+  auto b_coord_device = cutlass::make_Coord(size(local_shape_B), 1);
+  auto c_coord_device = cutlass::make_Coord(size(local_shape_C), 1);
+
+  int primary_device_idx;  // Device that was current on entry; made current again before returning
+  CUDA_CHECK(cudaGetDevice(&primary_device_idx));
+
+  // Enable any-to-any access; exits with an error if any ordered pair of devices cannot use peer access
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    int can_access;
+    CUDA_CHECK(cudaSetDevice(device_idx));  // Make device_idx current for the peer-access and resize() calls below
+    for (int peer_idx = 0; peer_idx < TP_; ++peer_idx) {
+      if (peer_idx != device_idx) {
+        CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, device_idx, peer_idx));
+        if (not can_access) {
+          std::cerr << "FAILURE: Device " << device_idx << " can't access device " << peer_idx << "." <<
+            std::endl;
+          exit(EXIT_FAILURE);
+        }
+        CUDA_CHECK(cudaDeviceEnablePeerAccess(peer_idx, 0));  // Grants the current device (device_idx) access to peer_idx
+      }
+    }
+
+    tensor_A_arr[device_idx].resize(a_coord_device);  // Resized while device_idx is current, presumably so the buffer lands on that device (confirm)
+    tensor_B_arr[device_idx].resize(b_coord_device);
+    tensor_C_arr[device_idx].resize(c_coord_device);
+    tensor_D_arr[device_idx].resize(c_coord_device);
+  }
+  CUDA_CHECK(cudaSetDevice(primary_device_idx));
+}
+
+/// Builds the arguments of the single-device reference GEMM from the parsed options.
+/// The output pointer is tensor_ref_D, so this GEMM writes into the reference result tensor.
+using GemmArguments = typename Gemm::Arguments;
+GemmArguments gemm_args_from_options(const Options &options) {
+  // Epilogue scalars, converted to the epilogue compute type
+  auto const alpha = static_cast<ElementCompute>(options.alpha);
+  auto const beta  = static_cast<ElementCompute>(options.beta);
+
+  return GemmArguments{
+    cutlass::gemm::GemmUniversalMode::kGemm,                                // mode
+    {options.m, options.n, options.k, options.l},                           // problem shape
+    {tensor_A.device_data(), stride_A, tensor_B.device_data(), stride_B},   // mainloop
+    {{alpha, beta}, tensor_C.device_data(), stride_C,                       // epilogue: scalars, C pointer/stride,
+     tensor_ref_D.device_data(), stride_D}                                  // then D pointer/stride
+  };
+}
+
+using DistGemmArguments = typename DistGemm::Arguments;  // Argument type of the distributed GEMM adapter
+DistGemmArguments dist_gemm_args_from_options(            // Copies device_idx's operand slices into its local buffers and builds its DistGemm arguments
+    const Options &options,                               // Problem extents and epilogue scalars
+    int device_idx,                                       // Selects both the slice of the global operands and the local buffers
+    cudaStream_t stream) {                                // Stream on which the slice copies are issued
+
+  auto problem_shape = cute::make_tuple(options.m, options.n, options.k, options.l);
+
+  auto global_A = cute::make_tensor(tensor_A.device_data(),  // Views over the full-size operands, using the global strides
+      cute::make_layout(cute::make_shape(options.m, options.k, options.l), stride_A));
+  auto global_B = cute::make_tensor(tensor_B.device_data(),
+      cute::make_layout(cute::make_shape(options.n, options.k, options.l), stride_B));
+  auto global_C = cute::make_tensor(tensor_C.device_data(),
+      cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_C));
+
+  auto global_A_device_slice = DistSchedule::get_device_slice_A(global_A, device_idx);  // The schedule decides which part of each operand device_idx receives
+  auto global_B_device_slice = DistSchedule::get_device_slice_B(global_B, device_idx);
+  auto global_C_device_slice = DistSchedule::get_device_slice_C(global_C, device_idx);
+
+  auto local_shape_A = DistSchedule::get_local_a_shape(problem_shape);
+  auto local_shape_B = DistSchedule::get_local_b_shape(problem_shape);
+  auto local_shape_C = DistSchedule::get_local_c_shape(problem_shape);
+  auto local_shape_D = DistSchedule::get_local_d_shape(problem_shape);
+
+  auto local_stride_A = cutlass::make_cute_packed_stride(StrideA{}, local_shape_A);  // Local buffers use packed strides for their local shapes
+  auto local_stride_B = cutlass::make_cute_packed_stride(StrideB{}, local_shape_B);
+  auto local_stride_C = cutlass::make_cute_packed_stride(StrideC{}, local_shape_C);
+  auto local_stride_D = cutlass::make_cute_packed_stride(StrideD{}, local_shape_D);
+
+  auto local_A = cute::make_tensor(  // Views over device_idx's local buffers
+      tensor_A_arr[device_idx].device_data(),
+      make_layout(local_shape_A, local_stride_A));
+  auto local_B = cute::make_tensor(
+      tensor_B_arr[device_idx].device_data(),
+      make_layout(local_shape_B, local_stride_B));
+  auto local_C = cute::make_tensor(
+      tensor_C_arr[device_idx].device_data(),
+      make_layout(local_shape_C, local_stride_C));
+  auto local_D = cute::make_tensor(
+      tensor_D_arr[device_idx].device_data(),
+      make_layout(local_shape_D, local_stride_D));
+
+  // Copy over tensor tiles for the first iteration (global slice -> local buffer, issued on `stream`)
+  cutlass::device_copy(global_A_device_slice, local_A, stream);
+  cutlass::device_copy(global_B_device_slice, local_B, stream);
+  cutlass::device_copy(global_C_device_slice, local_C, stream);
+
+  DistGemmArguments arguments{
+    cutlass::gemm::GemmUniversalMode::kGemm,                                       // mode
+    problem_shape,                                                                 // problem shape
+    {
+      reinterpret_cast<const ElementA*>(local_A.data()),
+      local_A.stride(),
+      reinterpret_cast<const ElementB*>(local_B.data()),
+      local_B.stride()
+    },                                                                             // mainloop
+    {
+      {                                                                            // epilogue.thread
+        static_cast<ElementCompute>(options.alpha),
+        static_cast<ElementCompute>(options.beta)
+      },
+      reinterpret_cast<const ElementC*>(local_C.data()),
+      local_C.stride(),
+      reinterpret_cast<const ElementD*>(local_D.data()),                           // NOTE(review): D is the output buffer yet is cast to pointer-to-const; confirm the argument type expects that
+      local_D.stride(),
+    },                                                                             // epilogue
+    {},                                                                            // hw_info
+    {}                                                                             // scheduler
+  };
+
+  return arguments;
+}
+
+// Copies device_idx's local D buffer into that device's slice of the full-size tensor_D,
+// issuing the copy on `stream`.
+void gather_results(const Options &options, int device_idx, cudaStream_t stream = nullptr) {
+
+  auto problem_shape = cute::make_tuple(options.m, options.n, options.k, options.l);
+
+  // Source: the packed local D buffer that belongs to device_idx
+  auto src_shape = DistSchedule::get_local_d_shape(problem_shape);
+  auto src_stride = cutlass::make_cute_packed_stride(StrideD{}, src_shape);
+  auto src = cute::make_tensor(
+      tensor_D_arr[device_idx].device_data(),
+      make_layout(src_shape, src_stride));
+
+  // Destination: device_idx's slice of the full (M, N, L) output tensor
+  auto full_layout_D = cute::make_layout(cute::make_shape(options.m, options.n, options.l), stride_D);
+  auto full_D = cute::make_tensor(tensor_D.device_data(), full_layout_D);
+  auto dst = DistSchedule::get_device_slice_D(full_D, device_idx);
+
+  // Local result -> its place in the global result
+  cutlass::device_copy(src, dst, stream);
+}
+
+// Compares tensor_D against tensor_ref_D on the host. With eps == 0 the tensors must be exactly
+// equal; otherwise the relative error metric must be below options.eps. Returns true on success.
+bool verify(const Options &options) {
+  tensor_D.sync_host();
+  tensor_ref_D.sync_host();
+
+  bool passed = false;
+  if (options.eps == 0.f) {
+    passed = cutlass::reference::host::TensorEquals(tensor_ref_D.host_view(), tensor_D.host_view());
+  } else {
+    double err = cutlass::reference::host::TensorRelativeErrorMetric(
+      tensor_D.host_view(), tensor_ref_D.host_view());
+    passed = err < options.eps;  // honor the user-supplied threshold instead of a fixed 1e-5
+  }
+
+  if (options.m <= 64 && options.n <= 64) {
+    std::cout << "GEMM output:\n" << tensor_D.host_view() << "\n\n";
+    std::cout << "Reference output:\n" << tensor_ref_D.host_view() << "\n\n";
+  }
+  return passed;
+}
+
+/// Runs the single-GPU reference GEMM and the multi-device DistGEMM, verifies the result, then profiles.
+int run(Options &options) {
+
+  int primary_device_idx;
+  cudaError_t device_get_result = cudaGetDevice(&primary_device_idx);
+  if (device_get_result != cudaSuccess) {
+    throw std::runtime_error("cudaGetDevice() failed");
+  }
+
+  initialize(options);
+
+  // Reference single-GPU GEMM
+  Gemm reference_gemm;
+  cutlass::device_memory::allocation<uint8_t> reference_workspace;
+
+  auto reference_arguments = gemm_args_from_options(options);
+  size_t reference_workspace_size = Gemm::get_workspace_size(reference_arguments);
+  reference_workspace = cutlass::device_memory::allocation<uint8_t>(reference_workspace_size);
+
+  CUTLASS_CHECK(reference_gemm.can_implement(reference_arguments));
+  CUTLASS_CHECK(reference_gemm.initialize(reference_arguments, reference_workspace.get()));
+  CUTLASS_CHECK(reference_gemm.run());
+
+  // Set up per-device streams
+  cudaStream_t stream_arr[TP_];
+
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    CUDA_CHECK(cudaSetDevice(device_idx));
+
+    // Create stream
+    CUDA_CHECK(cudaStreamCreate(&stream_arr[device_idx]));
+  }
+
+  // Instantiate DistGEMM
+  DistGemm dist_gemm_arr[TP_];  // Distributed GEMM array for multiple devices
+
+  // Allocate workspace memory
+  cutlass::device_memory::allocation<uint8_t> workspace_arr[TP_];
+  cutlass::device_memory::allocation<uint8_t> exclusive_workspace_arr[TP_];
+
+  // Cross-device workspace pointer array for gemm.initialize()
+  void * workspace_ptr_arr[TP_];
+  void * exclusive_workspace_ptr_arr[TP_];
+
+  // Create a structure of gemm kernel arguments suitable for invoking an instance of Gemm
+  DistGemmArguments arguments_[TP_];
+
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    CUDA_CHECK(cudaSetDevice(device_idx));
+
+    arguments_[device_idx] = dist_gemm_args_from_options(options, device_idx, stream_arr[device_idx]);
+
+    // Using the arguments, query for extra workspace required for matrix multiplication computation
+    size_t workspace_size = DistGemm::get_workspace_size(arguments_[device_idx]);
+    size_t exclusive_workspace_size = DistGemm::get_exclusive_workspace_size();
+
+    workspace_arr[device_idx] = cutlass::device_memory::allocation<uint8_t>(workspace_size);
+    exclusive_workspace_arr[device_idx] = cutlass::device_memory::allocation<uint8_t>(exclusive_workspace_size);
+
+    // Throw workspace pointers into arrays for gemm.initialize()
+    workspace_ptr_arr[device_idx] = workspace_arr[device_idx].get();
+    exclusive_workspace_ptr_arr[device_idx] = exclusive_workspace_arr[device_idx].get();
+
+    // Zero out exclusive workspace
+    CUDA_CHECK(cudaMemsetAsync(exclusive_workspace_ptr_arr[device_idx], 0, exclusive_workspace_size, stream_arr[device_idx]));
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+  }
+
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    CUDA_CHECK(cudaSetDevice(device_idx));
+
+    // Check if the problem size is supported or not
+    CUTLASS_CHECK(dist_gemm_arr[device_idx].can_implement(arguments_[device_idx]));
+
+#if defined(CUTLASS_ENABLE_GDC_FOR_SM90)
+    bool launch_with_pdl = true;
+#else
+    bool launch_with_pdl = false;
+#endif
+
+    // Initialize CUTLASS kernel with arguments and workspace pointer
+    CUTLASS_CHECK(dist_gemm_arr[device_idx].initialize(
+          arguments_,
+          workspace_ptr_arr,
+          exclusive_workspace_ptr_arr,
+          device_idx,
+          stream_arr[device_idx],
+          launch_with_pdl
+          ));
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+  }
+
+  // Correctness / Warmup iteration
+  std::cout << std::endl << "  running DistGEMM..." << std::endl;
+
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    CUDA_CHECK(cudaSetDevice(device_idx));
+    CUTLASS_CHECK(dist_gemm_arr[device_idx].run(stream_arr[device_idx]));
+  }
+  for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+    CUDA_CHECK(cudaStreamSynchronize(stream_arr[device_idx]));
+    CUDA_CHECK(cudaGetLastError());
+    gather_results(options, device_idx);
+  }
+
+  std::cout << "  running DistGEMM finished without runtime errors" << std::endl;
+
+  //// Check if output from CUTLASS kernel and reference kernel are equal or not
+  Result result;
+
+  result.passed = verify(options);
+
+  std::cout << std::endl << "  Disposition (eps: " << options.eps << "): " << 
+    (result.passed ? "Passed" : "Failed") << std::endl;
+
+  if (!result.passed) {
+    exit(-1);
+  }
+
+  // Run profiling loop
+  if (options.iterations > 0) {
+    float elapsed_ms = 0.f;
+
+    // Warmup
+    std::cout << "  Warming up for " << options.warmup_iterations << " iterations." << std::endl;
+    for (int warmup_iter = 0; warmup_iter < options.warmup_iterations; ++warmup_iter) {
+      for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+        CUDA_CHECK(cudaSetDevice(device_idx));
+        CUTLASS_CHECK(dist_gemm_arr[device_idx].run(stream_arr[device_idx]));
+      }
+    }
+
+    for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+      CUDA_CHECK(cudaSetDevice(device_idx));
+      CUDA_CHECK(cudaStreamSynchronize(stream_arr[device_idx]));
+    }
+
+    CUDA_CHECK(cudaSetDevice(primary_device_idx));
+
+    // Benchmark
+    std::cout << "  Profiling for " << options.iterations << " iterations." << std::endl;
+    using AtomicBoolean = cuda::atomic<bool>;
+    AtomicBoolean* atomic_flag_ptr;
+    CUDA_CHECK(cudaHostAlloc(&atomic_flag_ptr, sizeof(AtomicBoolean), cudaHostAllocPortable));
+    atomic_flag_ptr->store(false);
+
+    cutlass::DistGpuTimer<TP_> timer;
+
+    for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+      CUDA_CHECK(cudaSetDevice(device_idx));
+      cutlass::delay_kernel<<<1, 1, 0, stream_arr[device_idx]>>>(atomic_flag_ptr);
+      CUDA_CHECK(cudaGetLastError());
+    }
+
+    for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+      timer.start(device_idx, stream_arr[device_idx]);
+    }
+
+    atomic_flag_ptr->store(true);
+
+    for (int profile_iter = 0; profile_iter < options.iterations; ++profile_iter) {
+      for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+        CUDA_CHECK(cudaSetDevice(device_idx));
+        CUTLASS_CHECK(dist_gemm_arr[device_idx].run(stream_arr[device_idx]));
+      }
+    }
+
+    for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+      CUDA_CHECK(cudaSetDevice(device_idx));
+      timer.stop(device_idx, stream_arr[device_idx]);
+    }
+
+    CUDA_CHECK(cudaSetDevice(primary_device_idx));
+
+    for (int device_idx = 0; device_idx < TP_; ++device_idx) {
+      elapsed_ms = max(elapsed_ms, timer.elapsed_millis(device_idx));
+    }
+
+    // Every stream has reached its stop event, so no delay kernel still reads the pinned flag.
+    CUDA_CHECK(cudaFreeHost(atomic_flag_ptr));
+
+    // Compute average runtime and TFLOPs.
+    result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations);
+    double avg_runtime_s = (double)(result.avg_runtime_ms / 1000.0);
+    result.tflops = options.tflops(avg_runtime_s);
+
+    auto [local_M, local_N, local_K, local_L] = DistSchedule::get_local_gemm_shape(
+        cute::make_tuple(options.m, options.n, options.k, options.l));
+
+    std::cout << std::endl;
+    std::cout << "  TP: " << TP::value << std::endl;
+    std::cout << "  Problem Size: " << 
+      options.m << " x " << 
+      options.n << " x " << 
+      options.k << " x " << 
+      options.l << std::endl;
+    std::cout << "  Local GEMM Problem Size: " << 
+      local_M << " x " << 
+      local_N << " x " << 
+      local_K << " x " << 
+      local_L<< std::endl;
+    std::cout << "  Avg runtime: " << result.avg_runtime_ms << " ms" << std::endl;
+    std::cout << "  TFLOPS: " << result.tflops << std::endl;
+  }
+
+  return 0;
+}
+
+#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4))
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char const **args) {
+
+  // CUTLASS must be compiled with CUDA Toolkit 12.4 or newer to run this example
+  // and must have compute capability at least 90.
+  // Some necessary cuda graph APIs were only introduced in CUDA 12.4.
+  if (__CUDACC_VER_MAJOR__ < 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ < 4)) {
+    std::cerr << "This example requires CUDA 12.4 or newer." << std::endl;
+    // Returning zero so this test passes on older Toolkits. Its actions are no-op.
+    return 0;
+  }
+
+  int num_devices;
+  CUDA_CHECK(cudaGetDeviceCount(&num_devices));
+  if (num_devices < TP_) {
+    std::cerr << "Distributed GEMM is compiled with TP = " << TP::value << ", but " << 
+      "found only " << num_devices << " devices." <<
+      std::endl;
+    // Returning zero so this test passes on systems with too few GPUs. Its actions are no-op.
+    return 0;
+  }
+
+  // Check the compute capability of the current device (queried once, with error checking).
+  cudaDeviceProp props;
+  int current_device_id;
+  CUDA_CHECK(cudaGetDevice(&current_device_id));
+  CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
+  if (props.major < 9) {
+    std::cerr
+      << "This example requires a GPU of NVIDIA's Hopper Architecture or "
+      << "later (compute capability 90 or greater)." << std::endl;
+    return 0;
+  }
+
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, args);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  //
+  // Evaluate CUTLASS kernels
+  //
+
+#if (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4))
+  run(options);
+#endif
+
+  return 0;
+}
diff --git a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt b/examples/65_distributed_gemm/CMakeLists.txt
similarity index 89%
rename from examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt
rename to examples/65_distributed_gemm/CMakeLists.txt
index 891ca085..247b3407 100644
--- a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt
+++ b/examples/65_distributed_gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -27,6 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 cutlass_example_add_executable(
-  65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling
-  65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+  65_distributed_gemm
+  65_distributed_gemm.cu
   )
diff --git a/examples/65_distributed_gemm/README.md b/examples/65_distributed_gemm/README.md
new file mode 100644
index 00000000..fc53e6bf
--- /dev/null
+++ b/examples/65_distributed_gemm/README.md
@@ -0,0 +1,64 @@
+# Distributed GEMM
+
+This example implements Tensor Parallel GEMMs for the Hopper architecture with the experimental
+[Distributed GEMM](../../include/cutlass/experimental/distributed) API in CUTLASS.
+
+This example requires Hopper GPUs with an any-to-any NVLink network.
+Please refer to [REQUIREMENTS.md](REQUIREMENTS.md) for more information.
+
+By default, the example assumes 8 GPUs (TP=8) and runs an All Gather + GEMM operation, which rotates
+operand A. To run with a different number of GPUs or schedule, please refer to
+[65_distributed_gemm.cu](65_distributed_gemm.cu).
+
+
+## Getting started
+
+Command line arguments are mostly similar to other examples:
+
+```
+--m=<int>                   Sets the M extent of the GEMM
+--n=<int>                   Sets the N extent of the GEMM
+--k=<int>                   Sets the K extent of the GEMM
+--l=<int>                   Sets the L extent (batch) of the GEMM (default: 1)
+--alpha=<f32>               Epilogue scalar alpha (default: 1.0)
+--beta=<f32>                Epilogue scalar beta (default: 0.0)
+--iterations=<int>          Number of profiling iterations to perform (default: 100)
+--warmup-iterations=<int>   Number of warmup iterations prior to profiling (default: 10)
+--eps=<f32>                 Threshold for error compared to reference GEMM (default: 0.0)
+```
+
+Sample run command:
+
+```bash
+./65_distributed_gemm --m=16384 --n=106496 --k=16384 --warmup-iterations=10 --iterations=100
+```
+
+This executes a GEMM with shape `<16384, 106496, 16384>`, and reports average runtime
+over 100 iterations, with 10 warmup iterations.
+A reference check with respect to a single-device GEMM is also performed by default.
+
+## Trying out other schedules
+
+Schedules that are currently supported are:
+
+* All Gather + GEMM:
+  * `AllGather1D_TilingCD_RotatingA`
+  * `AllGather1D_TilingCD_RotatingB`
+
+* GEMM + Reduce Scatter:
+  * `ReduceScatter1D_TilingA_RotatingC`
+  * `ReduceScatter1D_TilingB_RotatingC`
+
+To try out different schedules, simply change this line in the example, and set your desired
+schedule:
+
+```cpp
+using DistSchedule = cutlass::distributed::schedules::AllGather1D_TilingCD_RotatingA<TP>;
+```
+
+If you're interested in trying out other TP values (running on a different number of GPUs), the
+procedure is the same: simply modify the following line in the example:
+
+```cpp
+using TP = _8;
+```
diff --git a/examples/65_distributed_gemm/REQUIREMENTS.md b/examples/65_distributed_gemm/REQUIREMENTS.md
new file mode 100644
index 00000000..cc0d5632
--- /dev/null
+++ b/examples/65_distributed_gemm/REQUIREMENTS.md
@@ -0,0 +1,86 @@
+# Distributed GEMM
+
+## Requirements
+
+### Build
+Make sure to set up CUTLASS with
+support for [Programmatic Dependent Launch (PDL)](../../media/docs/dependent_kernel_launch.md),
+that is with the `CUTLASS_ENABLE_GDC_FOR_SM90` flag.
+
+```bash
+cmake /path/to/cutlass -DCUTLASS_NVCC_ARCHS="90a" -DCUTLASS_ENABLE_GDC_FOR_SM90=1
+```
+
+### Minimum software
+
+Like all other CUTLASS examples, the NVIDIA driver, runtime, and CUDA Toolkit are required.
+This example specifically requires CUDA Toolkit 12.6 or newer, due to some of the necessary
+CUDA graph APIs.
+
+### Hardware / driver settings
+
+This example requires Hopper GPUs with NVLink network.
+
+If you're not sure, first run the following command and make sure your GPU
+compute capability is 9.0:
+
+```bash
+nvidia-smi --query-gpu=name,compute_cap --format=csv
+```
+
+Sample output:
+
+```
+name, compute_cap
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+NVIDIA H100 80GB HBM3, 9.0
+```
+
+
+Then you should make sure there is an NVLink network by checking the GPU network topology,
+and making sure there's `NV*` links between every pair of GPUs:
+
+```bash
+nvidia-smi topo -m
+```
+
+Sample output:
+
+```
+        GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
+GPU0     X      NV18    NV18    NV18    NV18    NV18    NV18    NV18
+GPU1    NV18     X      NV18    NV18    NV18    NV18    NV18    NV18
+GPU2    NV18    NV18     X      NV18    NV18    NV18    NV18    NV18
+GPU3    NV18    NV18    NV18     X      NV18    NV18    NV18    NV18
+GPU4    NV18    NV18    NV18    NV18     X      NV18    NV18    NV18
+GPU5    NV18    NV18    NV18    NV18    NV18     X      NV18    NV18
+GPU6    NV18    NV18    NV18    NV18    NV18    NV18     X      NV18
+GPU7    NV18    NV18    NV18    NV18    NV18    NV18    NV18     X
+```
+
+Finally, check if the driver enables peer to peer access, which should usually be the case,
+but it's good to check anyway:
+
+```bash
+nvidia-smi topo -p2p r
+```
+
+Sample output:
+
+```
+       GPU0    GPU1    GPU2    GPU3    GPU4    GPU5    GPU6    GPU7
+GPU0   X       OK      OK      OK      OK      OK      OK      OK
+GPU1   OK      X       OK      OK      OK      OK      OK      OK
+GPU2   OK      OK      X       OK      OK      OK      OK      OK
+GPU3   OK      OK      OK      X       OK      OK      OK      OK
+GPU4   OK      OK      OK      OK      X       OK      OK      OK
+GPU5   OK      OK      OK      OK      OK      X       OK      OK
+GPU6   OK      OK      OK      OK      OK      OK      X       OK
+GPU7   OK      OK      OK      OK      OK      OK      OK      X
+```
diff --git a/examples/65_distributed_gemm/util/benchmark.h b/examples/65_distributed_gemm/util/benchmark.h
new file mode 100644
index 00000000..66a0dbb5
--- /dev/null
+++ b/examples/65_distributed_gemm/util/benchmark.h
@@ -0,0 +1,118 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+/*! \file
+    \brief Benchmark helpers for Distributed GEMM
+
+    A delay kernel to gate all GEMMs across devices, controlled by a flag that
+    the host will set off once it launches DistGEMM across all devices.
+
+    DistGpuTimer extends cutlass's existing cudaEvent-based timer to multiple devices.
+*/
+
+#pragma once
+
+#include <iostream>
+#include <cuda/atomic>
+#include <cuda/std/atomic>
+
+
+namespace cutlass {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Delay kernel
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+using AtomicBoolean = cuda::atomic<bool>;
+
+__global__ void delay_kernel(const AtomicBoolean* atomic_flag_ptr) {
+  // Poll the flag, sleeping via __nanosleep(40) between loads, until it reads true.
+  for (bool released = atomic_flag_ptr->load(); !released; released = atomic_flag_ptr->load()) {
+    __nanosleep(40);
+  }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Distributed GPU Timer
+/// Sets up cuda events for multiple processors.
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <int NP>
+struct DistGpuTimer {
+  int _primary_device;      // device that was current at construction; restored after setup/teardown
+  cudaEvent_t _start[NP];   // one start event per device
+  cudaEvent_t _stop[NP];    // one stop event per device
+  DistGpuTimer(DistGpuTimer const&) = delete;             // owns its events: a copy would destroy them twice
+  DistGpuTimer& operator=(DistGpuTimer const&) = delete;
+
+  /// Creates a start/stop event pair on each of devices [0, NP), then restores the current device.
+  DistGpuTimer() {
+    CUDA_CHECK(cudaGetDevice(&_primary_device));
+    for (int device = 0; device < NP; ++device) {
+      CUDA_CHECK(cudaSetDevice(device));
+      CUDA_CHECK(cudaEventCreate(&_start[device]));
+      CUDA_CHECK(cudaEventCreate(&_stop[device]));
+    }
+    CUDA_CHECK(cudaSetDevice(_primary_device));
+  }
+
+  /// Destroys every event, then makes the device recorded at construction current again.
+  ~DistGpuTimer() {
+    for (int device = 0; device < NP; ++device) {
+      CUDA_CHECK(cudaSetDevice(device));
+      CUDA_CHECK(cudaEventDestroy(_start[device]));
+      CUDA_CHECK(cudaEventDestroy(_stop[device]));
+    }
+    CUDA_CHECK(cudaSetDevice(_primary_device));
+  }
+
+  /// Records the start event of `device` on `stream` (the stream must be passed explicitly).
+  void start(int device, cudaStream_t stream) {
+    assert(device >= 0 && device < NP);
+    CUDA_CHECK(cudaEventRecord(_start[device], stream));
+  }
+
+  /// Records the stop event of `device` on `stream`.
+  void stop(int device, cudaStream_t stream) {
+    assert(device >= 0 && device < NP);
+    CUDA_CHECK(cudaEventRecord(_stop[device], stream));
+  }
+
+  /// Waits for the stop event of `device`, then returns the start-to-stop time in milliseconds.
+  float elapsed_millis(int device) {
+    assert(device >= 0 && device < NP);
+    float elapsed = 0.0f;
+    CUDA_CHECK(cudaEventSynchronize(_stop[device]));
+    CUDA_CHECK(cudaEventElapsedTime(&elapsed, _start[device], _stop[device]));
+    return elapsed;
+  }
+};
+
+} //namespace cutlass
diff --git a/examples/65_distributed_gemm/util/device_copy.h b/examples/65_distributed_gemm/util/device_copy.h
new file mode 100644
index 00000000..257800a0
--- /dev/null
+++ b/examples/65_distributed_gemm/util/device_copy.h
@@ -0,0 +1,84 @@
+/******************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/*! \file
+    \brief Generic device-to-device data movement kernel for CuTe tensors.
+
+    NOTE: this kernel assigns one element copy to every thread, and is by no means
+    an efficient way of copying tensors. It should only be used for convenience in
+    reference checks.
+
+*/
+
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+#include "cutlass/cuda_host_adapter.hpp"
+
+namespace cutlass {
+
+template <typename TensorSource, typename TensorDestination>
+void device_copy(TensorSource      tensor_source,
+                 TensorDestination tensor_destination,
+                 cudaStream_t stream);
+
+
+template <typename TensorSource, typename TensorDestination>
+__global__ void device_copy_kernel(TensorSource const tensor_source, 
+                                   TensorDestination tensor_destination) {
+  size_t linear_idx = size_t(blockIdx.x) * blockDim.x + threadIdx.x;  // widened first: no 32-bit wrap
+  using ElementSrc = typename TensorSource::value_type;
+  using ElementDst = typename TensorDestination::value_type;
+  NumericConverter<ElementDst, ElementSrc> converter;
+  if (linear_idx < size(tensor_source)) {  // one element per thread; the grid tail does nothing
+    tensor_destination(linear_idx) = converter(tensor_source(linear_idx));
+  }
+}
+
+template <typename TensorSource, typename TensorDestination>
+void device_copy(TensorSource      tensor_source,
+                 TensorDestination tensor_destination,
+                 cudaStream_t stream) {
+  // Enqueues an element-wise converting copy on `stream`; launch errors are left for the caller to query.
+  assert(tensor_source.size() == tensor_destination.size());
+
+  auto numel = tensor_source.size();
+  if (numel == 0) { return; }  // nothing to copy; a zero-block grid is an invalid launch configuration
+  static constexpr int NumThreads = 128;
+  auto grid_size = cute::ceil_div(numel, NumThreads);
+  dim3 grid(grid_size);
+  dim3 block(NumThreads);
+  device_copy_kernel<<<grid, block, 0, stream>>>(tensor_source, tensor_destination);
+}
+
+} //namespace cutlass
diff --git a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
similarity index 99%
rename from examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
rename to examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
index ca991968..0228f8b1 100644
--- a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,29 +31,21 @@
 
 /*! \file
     \brief Blocked scale Hopper FP8 GEMM example using CUTLASS 3.0 APIs for NVIDIA Hopper architecture
-
     This example demonstrate a blocked scaled FP8 GEMM using the new CUTLASS 3.0.
     APIs on NVIDIA Hopper architecture. New features that will be showcased in this example are as follows:
-
     1. NVIDIA Hopper architecture introduces a new series of tensor core instructions (GMMA)
     which are more efficient than the Ampere tensor core instructions.
-
     2. NVIDIA Hopper architecture includes new Tensor Memory Accelerator (TMA) unit to transfer large
     blocks of data efficiently between global memory and shared memory. TMA also supports asynchronous
     copies between thread blocks in a cluster.
-
     3. This example uses the Warp Specialized kernel design (see /media/docs/efficient_gemm.md for details).
-
     4. This example shows all important fusions used by FP8 gemm kernels, i.e., blocked scale factor for
     A, B tensor, the abs_max value of D tensor.
-
     5. A simple way to tune the CTA rasterization direction and swizzle pattern of Hopper kernels. Both the
     CTA rasterization direction and swizzle pattern impact cross-CTA locality of accesses. By tuning we can
     improve performance.
-
     Examples:
-
-      $ ./examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/64_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling  \
+      $ ./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling  \
         --m=2816 --n=3072 --k=16384 \
         --save_aux=false --save_amax=false \
         --device_scale=false --raster=h --swizzle=2
@@ -730,7 +722,7 @@ int main(int argc, char const **args) {
   CUDA_CHECK(cudaGetDevice(&current_device_id));
   CUDA_CHECK(cudaGetDeviceProperties(&props, current_device_id));
   cudaError_t error = cudaGetDeviceProperties(&props, 0);
-  if (props.major < 9) {
+  if (props.major != 9) {
     std::cerr
       << "This example requires a GPU of NVIDIA's Hopper Architecture or "
       << "later (compute capability 90 or greater).\n";
diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt
new file mode 100644
index 00000000..cdfd522c
--- /dev/null
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+cutlass_example_add_executable(
+  67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling
+  67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu
+  )
diff --git a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
similarity index 98%
rename from examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
rename to examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
index 96d8794d..e8ea5330 100644
--- a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/hopper_fp8_commandline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h
similarity index 99%
rename from examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h
rename to examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h
index 52a8809f..8904060c 100644
--- a/examples/65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h
+++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/reference/host/gemm_with_blockwise_scaling.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -191,7 +191,7 @@ void gett_mainloop(
 
   static_assert(cute::rank(typename MainloopParams::LayoutA{}) == 3, "M, K, B");
   static_assert(cute::rank(typename MainloopParams::LayoutB{}) == 3, "N, K, B");
-  
+
   using cute::raw_pointer_cast;
 
   using ElementA = typename ElementTraits<typename MainloopParams::EngineA::value_type>::type;
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 89dae79e..2524378a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -144,9 +144,11 @@ foreach(EXAMPLE
   62_hopper_sparse_gemm
   63_hopper_gemm_with_weight_prefetch
   64_ada_fp8_gemm_grouped
-  65_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling
+  65_distributed_gemm
+  67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling
   )
 
   add_subdirectory(${EXAMPLE})
 
 endforeach()
+
diff --git a/examples/common/gather_tensor.hpp b/examples/common/gather_tensor.hpp
index 62616e00..67ae811b 100644
--- a/examples/common/gather_tensor.hpp
+++ b/examples/common/gather_tensor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/common/helper.h b/examples/common/helper.h
index a7a81e74..aec1c719 100644
--- a/examples/common/helper.h
+++ b/examples/common/helper.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/CMakeLists.txt b/examples/cute/CMakeLists.txt
index 69aefd7c..77487813 100644
--- a/examples/cute/CMakeLists.txt
+++ b/examples/cute/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/CMakeLists.txt b/examples/cute/tutorial/CMakeLists.txt
index b427d936..f263e5ce 100644
--- a/examples/cute/tutorial/CMakeLists.txt
+++ b/examples/cute/tutorial/CMakeLists.txt
@@ -1,5 +1,5 @@
 
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/sgemm_1.cu b/examples/cute/tutorial/sgemm_1.cu
index cdd23503..797ccd0f 100644
--- a/examples/cute/tutorial/sgemm_1.cu
+++ b/examples/cute/tutorial/sgemm_1.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/sgemm_2.cu b/examples/cute/tutorial/sgemm_2.cu
index ee2b6b2e..f63d553d 100644
--- a/examples/cute/tutorial/sgemm_2.cu
+++ b/examples/cute/tutorial/sgemm_2.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/sgemm_sm70.cu b/examples/cute/tutorial/sgemm_sm70.cu
index ef6284cf..74b8ee05 100644
--- a/examples/cute/tutorial/sgemm_sm70.cu
+++ b/examples/cute/tutorial/sgemm_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/sgemm_sm80.cu b/examples/cute/tutorial/sgemm_sm80.cu
index 5ae0bf0f..bcc31a0a 100644
--- a/examples/cute/tutorial/sgemm_sm80.cu
+++ b/examples/cute/tutorial/sgemm_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/tiled_copy.cu b/examples/cute/tutorial/tiled_copy.cu
index a8ae3b10..3cdc2784 100644
--- a/examples/cute/tutorial/tiled_copy.cu
+++ b/examples/cute/tutorial/tiled_copy.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/examples/cute/tutorial/wgmma_sm90.cu b/examples/cute/tutorial/wgmma_sm90.cu
index 0baa494a..eb634e23 100644
--- a/examples/cute/tutorial/wgmma_sm90.cu
+++ b/examples/cute/tutorial/wgmma_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/axpby.hpp b/include/cute/algorithm/axpby.hpp
index 339743f4..8c54f46d 100644
--- a/include/cute/algorithm/axpby.hpp
+++ b/include/cute/algorithm/axpby.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/clear.hpp b/include/cute/algorithm/clear.hpp
index 0b3a8eaa..225c46ec 100644
--- a/include/cute/algorithm/clear.hpp
+++ b/include/cute/algorithm/clear.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/cooperative_copy.hpp b/include/cute/algorithm/cooperative_copy.hpp
index c9e02245..8a23bf60 100644
--- a/include/cute/algorithm/cooperative_copy.hpp
+++ b/include/cute/algorithm/cooperative_copy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+* Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/cooperative_gemm.hpp b/include/cute/algorithm/cooperative_gemm.hpp
index e4bd5ea6..ecc9d33d 100644
--- a/include/cute/algorithm/cooperative_gemm.hpp
+++ b/include/cute/algorithm/cooperative_gemm.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/copy.hpp b/include/cute/algorithm/copy.hpp
index 84ef4916..740565b5 100644
--- a/include/cute/algorithm/copy.hpp
+++ b/include/cute/algorithm/copy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/fill.hpp b/include/cute/algorithm/fill.hpp
index 3f33a42a..37b97f18 100644
--- a/include/cute/algorithm/fill.hpp
+++ b/include/cute/algorithm/fill.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/functional.hpp b/include/cute/algorithm/functional.hpp
index ef80d018..5c56eb5c 100644
--- a/include/cute/algorithm/functional.hpp
+++ b/include/cute/algorithm/functional.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/gemm.hpp b/include/cute/algorithm/gemm.hpp
index c4713838..97839c04 100644
--- a/include/cute/algorithm/gemm.hpp
+++ b/include/cute/algorithm/gemm.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/prefer.hpp b/include/cute/algorithm/prefer.hpp
index a69e5042..0a1c53e7 100644
--- a/include/cute/algorithm/prefer.hpp
+++ b/include/cute/algorithm/prefer.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/prefetch.hpp b/include/cute/algorithm/prefetch.hpp
index c39f63ac..16bbec51 100644
--- a/include/cute/algorithm/prefetch.hpp
+++ b/include/cute/algorithm/prefetch.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/tensor_algorithms.hpp b/include/cute/algorithm/tensor_algorithms.hpp
index dbffc613..6359a55e 100644
--- a/include/cute/algorithm/tensor_algorithms.hpp
+++ b/include/cute/algorithm/tensor_algorithms.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/algorithm/tuple_algorithms.hpp b/include/cute/algorithm/tuple_algorithms.hpp
index 5a70f590..0ad8af52 100644
--- a/include/cute/algorithm/tuple_algorithms.hpp
+++ b/include/cute/algorithm/tuple_algorithms.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp
index 8fff51be..ba22ef1c 100644
--- a/include/cute/arch/cluster_sm90.hpp
+++ b/include/cute/arch/cluster_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp
index 84d7779a..4af01e33 100644
--- a/include/cute/arch/config.hpp
+++ b/include/cute/arch/config.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy.hpp b/include/cute/arch/copy.hpp
index 47dbef2f..8b62fa91 100644
--- a/include/cute/arch/copy.hpp
+++ b/include/cute/arch/copy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm50.hpp b/include/cute/arch/copy_sm50.hpp
index 925d9ebe..12518fcc 100644
--- a/include/cute/arch/copy_sm50.hpp
+++ b/include/cute/arch/copy_sm50.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm75.hpp b/include/cute/arch/copy_sm75.hpp
index 3d3d37ac..0e4821b2 100644
--- a/include/cute/arch/copy_sm75.hpp
+++ b/include/cute/arch/copy_sm75.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm80.hpp b/include/cute/arch/copy_sm80.hpp
index e04181bf..71a7b3a4 100644
--- a/include/cute/arch/copy_sm80.hpp
+++ b/include/cute/arch/copy_sm80.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm90.hpp b/include/cute/arch/copy_sm90.hpp
index bcb3b7d1..5c0745da 100644
--- a/include/cute/arch/copy_sm90.hpp
+++ b/include/cute/arch/copy_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm90_desc.hpp b/include/cute/arch/copy_sm90_desc.hpp
index cc0bf4a3..040c9f4d 100644
--- a/include/cute/arch/copy_sm90_desc.hpp
+++ b/include/cute/arch/copy_sm90_desc.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/copy_sm90_tma.hpp b/include/cute/arch/copy_sm90_tma.hpp
index fb33d63c..60f320e3 100644
--- a/include/cute/arch/copy_sm90_tma.hpp
+++ b/include/cute/arch/copy_sm90_tma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma.hpp b/include/cute/arch/mma.hpp
index 6e06114a..8b97f50a 100644
--- a/include/cute/arch/mma.hpp
+++ b/include/cute/arch/mma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm61.hpp b/include/cute/arch/mma_sm61.hpp
index f7bcb7d1..fce9a47f 100644
--- a/include/cute/arch/mma_sm61.hpp
+++ b/include/cute/arch/mma_sm61.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm70.hpp b/include/cute/arch/mma_sm70.hpp
index 63d96cf5..2acd421d 100644
--- a/include/cute/arch/mma_sm70.hpp
+++ b/include/cute/arch/mma_sm70.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm75.hpp b/include/cute/arch/mma_sm75.hpp
index c33f7b39..7d34e2b4 100644
--- a/include/cute/arch/mma_sm75.hpp
+++ b/include/cute/arch/mma_sm75.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm80.hpp b/include/cute/arch/mma_sm80.hpp
index 17860dd4..e0dbc630 100644
--- a/include/cute/arch/mma_sm80.hpp
+++ b/include/cute/arch/mma_sm80.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90.hpp b/include/cute/arch/mma_sm90.hpp
index 51d34563..6aa5fba2 100644
--- a/include/cute/arch/mma_sm90.hpp
+++ b/include/cute/arch/mma_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90_desc.hpp b/include/cute/arch/mma_sm90_desc.hpp
index a53a9748..e5eff988 100644
--- a/include/cute/arch/mma_sm90_desc.hpp
+++ b/include/cute/arch/mma_sm90_desc.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90_gmma.hpp b/include/cute/arch/mma_sm90_gmma.hpp
index d809aa4a..c0ce4a2e 100644
--- a/include/cute/arch/mma_sm90_gmma.hpp
+++ b/include/cute/arch/mma_sm90_gmma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90_gmma_ext.hpp b/include/cute/arch/mma_sm90_gmma_ext.hpp
index 10a36aff..f697b949 100644
--- a/include/cute/arch/mma_sm90_gmma_ext.hpp
+++ b/include/cute/arch/mma_sm90_gmma_ext.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90_gmma_sparse.hpp b/include/cute/arch/mma_sm90_gmma_sparse.hpp
index ecca91b9..453dbb7b 100644
--- a/include/cute/arch/mma_sm90_gmma_sparse.hpp
+++ b/include/cute/arch/mma_sm90_gmma_sparse.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp b/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
index c224e403..8945551a 100644
--- a/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
+++ b/include/cute/arch/mma_sm90_gmma_sparse_ext.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/arch/util.hpp b/include/cute/arch/util.hpp
index 3749a9c2..ac196028 100644
--- a/include/cute/arch/util.hpp
+++ b/include/cute/arch/util.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_atom.hpp b/include/cute/atom/copy_atom.hpp
index 75b7aa4d..612ef0b6 100644
--- a/include/cute/atom/copy_atom.hpp
+++ b/include/cute/atom/copy_atom.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits.hpp b/include/cute/atom/copy_traits.hpp
index ac746a64..9117a1fb 100644
--- a/include/cute/atom/copy_traits.hpp
+++ b/include/cute/atom/copy_traits.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm50.hpp b/include/cute/atom/copy_traits_sm50.hpp
index 7a693805..5299894b 100644
--- a/include/cute/atom/copy_traits_sm50.hpp
+++ b/include/cute/atom/copy_traits_sm50.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm75.hpp b/include/cute/atom/copy_traits_sm75.hpp
index 9ad82c61..416938b1 100644
--- a/include/cute/atom/copy_traits_sm75.hpp
+++ b/include/cute/atom/copy_traits_sm75.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm80.hpp b/include/cute/atom/copy_traits_sm80.hpp
index 3795f52a..ab8d128c 100644
--- a/include/cute/atom/copy_traits_sm80.hpp
+++ b/include/cute/atom/copy_traits_sm80.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm90.hpp b/include/cute/atom/copy_traits_sm90.hpp
index f9590848..ad479df0 100644
--- a/include/cute/atom/copy_traits_sm90.hpp
+++ b/include/cute/atom/copy_traits_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm90_im2col.hpp b/include/cute/atom/copy_traits_sm90_im2col.hpp
index 54f76073..e342dbb2 100644
--- a/include/cute/atom/copy_traits_sm90_im2col.hpp
+++ b/include/cute/atom/copy_traits_sm90_im2col.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp
index 4ad7f808..b4fdec0d 100644
--- a/include/cute/atom/copy_traits_sm90_tma.hpp
+++ b/include/cute/atom/copy_traits_sm90_tma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp b/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
index 3286e72b..eaf3c020 100644
--- a/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
+++ b/include/cute/atom/copy_traits_sm90_tma_swizzle.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_atom.hpp b/include/cute/atom/mma_atom.hpp
index ee77df39..957f0707 100644
--- a/include/cute/atom/mma_atom.hpp
+++ b/include/cute/atom/mma_atom.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits.hpp b/include/cute/atom/mma_traits.hpp
index 0994698a..de24b643 100644
--- a/include/cute/atom/mma_traits.hpp
+++ b/include/cute/atom/mma_traits.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm61.hpp b/include/cute/atom/mma_traits_sm61.hpp
index f72a6394..6b12903b 100644
--- a/include/cute/atom/mma_traits_sm61.hpp
+++ b/include/cute/atom/mma_traits_sm61.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm70.hpp b/include/cute/atom/mma_traits_sm70.hpp
index f0702a96..0b5b5300 100644
--- a/include/cute/atom/mma_traits_sm70.hpp
+++ b/include/cute/atom/mma_traits_sm70.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm75.hpp b/include/cute/atom/mma_traits_sm75.hpp
index 1d3f5196..d60c65fe 100644
--- a/include/cute/atom/mma_traits_sm75.hpp
+++ b/include/cute/atom/mma_traits_sm75.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm80.hpp b/include/cute/atom/mma_traits_sm80.hpp
index 5f7e73e4..f7d5d2fc 100644
--- a/include/cute/atom/mma_traits_sm80.hpp
+++ b/include/cute/atom/mma_traits_sm80.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm90.hpp b/include/cute/atom/mma_traits_sm90.hpp
index b2ced3f8..0467dec3 100644
--- a/include/cute/atom/mma_traits_sm90.hpp
+++ b/include/cute/atom/mma_traits_sm90.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm90_gmma.hpp b/include/cute/atom/mma_traits_sm90_gmma.hpp
index 8f59ff55..b9e78751 100644
--- a/include/cute/atom/mma_traits_sm90_gmma.hpp
+++ b/include/cute/atom/mma_traits_sm90_gmma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm90_gmma_ext.hpp b/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
index 15e2412c..3cab34d2 100644
--- a/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
+++ b/include/cute/atom/mma_traits_sm90_gmma_ext.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
index 161dc7ec..13ff07c8 100644
--- a/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
+++ b/include/cute/atom/mma_traits_sm90_gmma_sparse.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp b/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
index 3680b7e1..fc28b8a9 100644
--- a/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
+++ b/include/cute/atom/mma_traits_sm90_gmma_sparse_ext.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/config.hpp b/include/cute/config.hpp
index 792eee90..9e0b33a6 100644
--- a/include/cute/config.hpp
+++ b/include/cute/config.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/alignment.hpp b/include/cute/container/alignment.hpp
index 52e4cbad..f285004c 100644
--- a/include/cute/container/alignment.hpp
+++ b/include/cute/container/alignment.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/array.hpp b/include/cute/container/array.hpp
index 9cdcf5f4..ea3eaf72 100644
--- a/include/cute/container/array.hpp
+++ b/include/cute/container/array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/array_aligned.hpp b/include/cute/container/array_aligned.hpp
index a9d14a1a..6491f72d 100644
--- a/include/cute/container/array_aligned.hpp
+++ b/include/cute/container/array_aligned.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/array_subbyte.hpp b/include/cute/container/array_subbyte.hpp
index 48d416f4..d6b1fafb 100644
--- a/include/cute/container/array_subbyte.hpp
+++ b/include/cute/container/array_subbyte.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/bit_field.hpp b/include/cute/container/bit_field.hpp
index d7fac42a..cecdaeed 100644
--- a/include/cute/container/bit_field.hpp
+++ b/include/cute/container/bit_field.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/cuda_types.hpp b/include/cute/container/cuda_types.hpp
index fbc314e5..5615fdef 100644
--- a/include/cute/container/cuda_types.hpp
+++ b/include/cute/container/cuda_types.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/packed_tuple.hpp b/include/cute/container/packed_tuple.hpp
index c20df2c2..a7a1c3b2 100644
--- a/include/cute/container/packed_tuple.hpp
+++ b/include/cute/container/packed_tuple.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/tuple.hpp b/include/cute/container/tuple.hpp
index 3123a68d..f2505b35 100644
--- a/include/cute/container/tuple.hpp
+++ b/include/cute/container/tuple.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/container/type_list.hpp b/include/cute/container/type_list.hpp
index a15f2c1c..44001b6d 100644
--- a/include/cute/container/type_list.hpp
+++ b/include/cute/container/type_list.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/int_tuple.hpp b/include/cute/int_tuple.hpp
index 95d06bbd..a5bef3ec 100644
--- a/include/cute/int_tuple.hpp
+++ b/include/cute/int_tuple.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/layout.hpp b/include/cute/layout.hpp
index 26195a47..d1b18694 100644
--- a/include/cute/layout.hpp
+++ b/include/cute/layout.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/layout_composed.hpp b/include/cute/layout_composed.hpp
index 26ae8dc7..fc26fbb3 100644
--- a/include/cute/layout_composed.hpp
+++ b/include/cute/layout_composed.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/arithmetic_tuple.hpp b/include/cute/numeric/arithmetic_tuple.hpp
index 2e469057..32163072 100644
--- a/include/cute/numeric/arithmetic_tuple.hpp
+++ b/include/cute/numeric/arithmetic_tuple.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/complex.hpp b/include/cute/numeric/complex.hpp
index 7dd9ea5b..3115d615 100644
--- a/include/cute/numeric/complex.hpp
+++ b/include/cute/numeric/complex.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/int.hpp b/include/cute/numeric/int.hpp
index 571b3e3e..7031e7ab 100644
--- a/include/cute/numeric/int.hpp
+++ b/include/cute/numeric/int.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/integer_sequence.hpp b/include/cute/numeric/integer_sequence.hpp
index 60801795..4118d9cb 100644
--- a/include/cute/numeric/integer_sequence.hpp
+++ b/include/cute/numeric/integer_sequence.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/integral_constant.hpp b/include/cute/numeric/integral_constant.hpp
index 3a8d036e..349ad3ed 100644
--- a/include/cute/numeric/integral_constant.hpp
+++ b/include/cute/numeric/integral_constant.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -341,14 +341,14 @@ operator||(U, C<t>) {
 #define CUTE_NAMED_UNARY_FN(OP)                                      \
   template <auto t>                                                  \
   CUTE_HOST_DEVICE constexpr                                         \
-  C<OP(t)> OP (C<t>) {                                               \
-    return {};                                                       \
+  auto OP (C<t>) {                                                   \
+    return C<OP(t)>{};                                               \
   }
 #define CUTE_NAMED_BINARY_FN(OP)                                     \
   template <auto t, auto u>                                          \
   CUTE_HOST_DEVICE constexpr                                         \
-  C<OP(t,u)> OP (C<t>, C<u>) {                                       \
-    return {};                                                       \
+  auto OP (C<t>, C<u>) {                                             \
+    return C<OP(t,u)>{};                                             \
   }                                                                  \
   template <auto t, class U,                                         \
             __CUTE_REQUIRES(is_std_integral<U>::value)>              \
diff --git a/include/cute/numeric/integral_ratio.hpp b/include/cute/numeric/integral_ratio.hpp
index a614bdb2..0104c31f 100644
--- a/include/cute/numeric/integral_ratio.hpp
+++ b/include/cute/numeric/integral_ratio.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/math.hpp b/include/cute/numeric/math.hpp
index e493a3a9..83dcd4e6 100644
--- a/include/cute/numeric/math.hpp
+++ b/include/cute/numeric/math.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/numeric_types.hpp b/include/cute/numeric/numeric_types.hpp
index b9943b8c..c5669165 100644
--- a/include/cute/numeric/numeric_types.hpp
+++ b/include/cute/numeric/numeric_types.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/numeric/real.hpp b/include/cute/numeric/real.hpp
index 4ce58dfa..0bc9555f 100644
--- a/include/cute/numeric/real.hpp
+++ b/include/cute/numeric/real.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/pointer.hpp b/include/cute/pointer.hpp
index cc49b6a3..4df82c72 100644
--- a/include/cute/pointer.hpp
+++ b/include/cute/pointer.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/pointer_base.hpp b/include/cute/pointer_base.hpp
index 57ad0b3c..701fe135 100644
--- a/include/cute/pointer_base.hpp
+++ b/include/cute/pointer_base.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/pointer_flagged.hpp b/include/cute/pointer_flagged.hpp
index eb8d7e45..7f205347 100644
--- a/include/cute/pointer_flagged.hpp
+++ b/include/cute/pointer_flagged.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/pointer_sparse.hpp b/include/cute/pointer_sparse.hpp
index ccae4586..56c4c295 100644
--- a/include/cute/pointer_sparse.hpp
+++ b/include/cute/pointer_sparse.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/pointer_swizzle.hpp b/include/cute/pointer_swizzle.hpp
index 1a802cfd..5706cd76 100644
--- a/include/cute/pointer_swizzle.hpp
+++ b/include/cute/pointer_swizzle.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/stride.hpp b/include/cute/stride.hpp
index f2d31f4e..629cdfda 100644
--- a/include/cute/stride.hpp
+++ b/include/cute/stride.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/swizzle.hpp b/include/cute/swizzle.hpp
index 52abf856..2ae7d09a 100644
--- a/include/cute/swizzle.hpp
+++ b/include/cute/swizzle.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -68,7 +68,7 @@ struct Swizzle
   using zzz_msk = cute::constant<int, bit_msk{} << (num_base - min(0,num_shft))>;
   using msk_sft = cute::constant<int, num_shft>;
 
-  static constexpr uint32_t swizzle_code = uint32_t(yyy_msk{} | zzz_msk{});
+  static constexpr uint32_t swizzle_code = uint32_t(yyy_msk::value | zzz_msk::value);
 
   template <class Offset>
   CUTE_HOST_DEVICE constexpr static
diff --git a/include/cute/swizzle_layout.hpp b/include/cute/swizzle_layout.hpp
index 7f7161bc..61a91d92 100644
--- a/include/cute/swizzle_layout.hpp
+++ b/include/cute/swizzle_layout.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/tensor.hpp b/include/cute/tensor.hpp
index 3f3335b6..1ab62fd5 100644
--- a/include/cute/tensor.hpp
+++ b/include/cute/tensor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/tensor_impl.hpp b/include/cute/tensor_impl.hpp
index 2be19c15..5218ba37 100644
--- a/include/cute/tensor_impl.hpp
+++ b/include/cute/tensor_impl.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/tensor_predicate.hpp b/include/cute/tensor_predicate.hpp
index 9c8a2ba6..d39f6ada 100644
--- a/include/cute/tensor_predicate.hpp
+++ b/include/cute/tensor_predicate.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/tensor_zip.hpp b/include/cute/tensor_zip.hpp
index 6d70ffc8..3b9b2ae3 100644
--- a/include/cute/tensor_zip.hpp
+++ b/include/cute/tensor_zip.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/underscore.hpp b/include/cute/underscore.hpp
index e9d80fe5..8a83b867 100644
--- a/include/cute/underscore.hpp
+++ b/include/cute/underscore.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/util/debug.hpp b/include/cute/util/debug.hpp
index 26454443..5e704b25 100644
--- a/include/cute/util/debug.hpp
+++ b/include/cute/util/debug.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/util/print.hpp b/include/cute/util/print.hpp
index dbd65816..72c852e2 100644
--- a/include/cute/util/print.hpp
+++ b/include/cute/util/print.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cute/util/type_traits.hpp b/include/cute/util/type_traits.hpp
index a3074ef9..6ab1d471 100644
--- a/include/cute/util/type_traits.hpp
+++ b/include/cute/util/type_traits.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h
index 0d2bb290..8468f54b 100644
--- a/include/cutlass/aligned_buffer.h
+++ b/include/cutlass/aligned_buffer.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h
index 36d4676b..e8859700 100644
--- a/include/cutlass/arch/arch.h
+++ b/include/cutlass/arch/arch.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/barrier.h b/include/cutlass/arch/barrier.h
index d35debb9..ad4564ca 100644
--- a/include/cutlass/arch/barrier.h
+++ b/include/cutlass/arch/barrier.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h
index 9d2344bf..5128ee02 100644
--- a/include/cutlass/arch/cache_operation.h
+++ b/include/cutlass/arch/cache_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h
index 0fc60f41..5f842a4b 100644
--- a/include/cutlass/arch/config.h
+++ b/include/cutlass/arch/config.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/grid_dependency_control.h b/include/cutlass/arch/grid_dependency_control.h
index 14ef1974..10421406 100644
--- a/include/cutlass/arch/grid_dependency_control.h
+++ b/include/cutlass/arch/grid_dependency_control.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h
index db9ad739..0fb47b17 100644
--- a/include/cutlass/arch/memory.h
+++ b/include/cutlass/arch/memory.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h
index 0e957c72..91926877 100644
--- a/include/cutlass/arch/memory_sm75.h
+++ b/include/cutlass/arch/memory_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h
index cb0ba4b5..4e812935 100644
--- a/include/cutlass/arch/memory_sm80.h
+++ b/include/cutlass/arch/memory_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h
index 007ba19b..2fcf2ee1 100644
--- a/include/cutlass/arch/mma.h
+++ b/include/cutlass/arch/mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -73,15 +73,15 @@ struct OpMultiplyAddFastF16 {};
 struct OpMultiplyAddMixedInputUpcast {};
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to 2 (big and small) TF32 components
-//  Perform 3xTF32 or 4xTF32 for every F32 output element
+/// Tag indicating the input is converted to 2 (big and small) TF32 or FP16 components
+//  Perform 3xTF32 or 4xTF32 for every F32 output element on Ampere
+//  Perform 3xFP16 or 4xFP16 for every F32 output element on Hopper with axiswise quantization factor support
 struct OpMultiplyAddFastF32 {};
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
-/// Tag indicating the input is converted to 2 (big and small) TF32 components
-//  Perform 3xTF32 or 4xTF32 for every complex<F32> output element
+/// Tag indicating the input is converted to 2 (big and small) TF32 or FP16 components
+//  Perform 3xTF32 or 4xTF32 for every complex<F32> output element on Ampere
+//  Perform 3xFP16 or 4xFP16 for every complex<F32> output element on Hopper with axiswise quantization factor support
 struct OpMultiplyAddComplexFastF32 {};
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h
index 98ff18be..1701158b 100644
--- a/include/cutlass/arch/mma_sm50.h
+++ b/include/cutlass/arch/mma_sm50.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h
index 32322bc4..31ef2b65 100644
--- a/include/cutlass/arch/mma_sm60.h
+++ b/include/cutlass/arch/mma_sm60.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h
index 82a5aa72..b780335e 100644
--- a/include/cutlass/arch/mma_sm61.h
+++ b/include/cutlass/arch/mma_sm61.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h
index 28bb4638..e4889a21 100644
--- a/include/cutlass/arch/mma_sm70.h
+++ b/include/cutlass/arch/mma_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h
index a39ededb..120b116b 100644
--- a/include/cutlass/arch/mma_sm75.h
+++ b/include/cutlass/arch/mma_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h
index 19d78bf2..d89974fc 100644
--- a/include/cutlass/arch/mma_sm80.h
+++ b/include/cutlass/arch/mma_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm89.h b/include/cutlass/arch/mma_sm89.h
index d8a75b66..80a62b13 100644
--- a/include/cutlass/arch/mma_sm89.h
+++ b/include/cutlass/arch/mma_sm89.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sm90.h b/include/cutlass/arch/mma_sm90.h
index 16108f0a..b1314a56 100644
--- a/include/cutlass/arch/mma_sm90.h
+++ b/include/cutlass/arch/mma_sm90.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sparse_sm80.h b/include/cutlass/arch/mma_sparse_sm80.h
index ed2a5ad0..187ccc17 100644
--- a/include/cutlass/arch/mma_sparse_sm80.h
+++ b/include/cutlass/arch/mma_sparse_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/mma_sparse_sm89.h b/include/cutlass/arch/mma_sparse_sm89.h
index 2fae35be..b6c1bfe3 100644
--- a/include/cutlass/arch/mma_sparse_sm89.h
+++ b/include/cutlass/arch/mma_sparse_sm89.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/reg_reconfig.h b/include/cutlass/arch/reg_reconfig.h
index d2b43445..766c2223 100644
--- a/include/cutlass/arch/reg_reconfig.h
+++ b/include/cutlass/arch/reg_reconfig.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h
index f670fc29..a1dc7dff 100644
--- a/include/cutlass/arch/simd.h
+++ b/include/cutlass/arch/simd.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h
index 6e1ef204..59f38d62 100644
--- a/include/cutlass/arch/simd_sm60.h
+++ b/include/cutlass/arch/simd_sm60.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h
index b783c943..46c22665 100644
--- a/include/cutlass/arch/simd_sm61.h
+++ b/include/cutlass/arch/simd_sm61.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/synclog.hpp b/include/cutlass/arch/synclog.hpp
index 8cf65ad7..b9819838 100644
--- a/include/cutlass/arch/synclog.hpp
+++ b/include/cutlass/arch/synclog.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h
index 720895f3..2cafa510 100644
--- a/include/cutlass/arch/wmma.h
+++ b/include/cutlass/arch/wmma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h
index d75ee2b0..99d81487 100644
--- a/include/cutlass/arch/wmma_sm70.h
+++ b/include/cutlass/arch/wmma_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h
index b644181b..3c488c76 100644
--- a/include/cutlass/arch/wmma_sm72.h
+++ b/include/cutlass/arch/wmma_sm72.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h
index f6036051..d49e8ca8 100644
--- a/include/cutlass/arch/wmma_sm75.h
+++ b/include/cutlass/arch/wmma_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/array.h b/include/cutlass/array.h
index e85d19fa..0258d0d5 100644
--- a/include/cutlass/array.h
+++ b/include/cutlass/array.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -985,6 +985,20 @@ struct multiply_add<Array<T, N>, Array<T, N>, Array<T, N>> {
 
     return result;
   }
+
+  /// Broadcast overload: applies multiply_add<T> to (a[i], scalar_b, scalar_c) for each i in [0, N).
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &a, T const &scalar_b, T const &scalar_c) const {
+
+    multiply_add<T> fma_op;
+    Array<T, N> out;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      out[idx] = fma_op(a[idx], scalar_b, scalar_c);
+    }
+    return out;
+  }
 };
 
 /// Fused square-and-plus
@@ -1760,6 +1774,50 @@ struct multiply_add<Array<half_t, N>, Array<half_t, N>, Array<half_t, N>> {
 
     return result;
   }
+
+  /// Broadcast overload: computes the multiply-add of (a[i], b, c) for every element of a.
+  /// Device code for SM53+ uses packed __hfma2; every other build uses the scalar functor.
+  CUTLASS_HOST_DEVICE
+  Array<half_t, N> operator()(
+    Array<half_t, N> const &a,
+    half_t const &b,
+    half_t const &c) const {
+
+    Array<half_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    // Duplicate each scalar into both lanes of a __half2.
+    __half const &b_scalar = reinterpret_cast<__half const &>(b);
+    __half const &c_scalar = reinterpret_cast<__half const &>(c);
+    __half2 const b_lanes = __half2half2(b_scalar);
+    __half2 const c_lanes = __half2half2(c_scalar);
+
+    __half2 const *a_pairs = reinterpret_cast<__half2 const *>(&a);
+    __half2 *out_pairs = reinterpret_cast<__half2 *>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < N / 2; ++pair) {
+      out_pairs[pair] = __hfma2(a_pairs[pair], b_lanes, c_lanes);
+    }
+
+    // Odd N: the trailing element has no partner, so it is handled on its own.
+    if constexpr (N % 2) {
+      __half const *a_halves = reinterpret_cast<__half const *>(&a);
+      __half tail = __hfma(a_halves[N - 1], b_scalar, c_scalar);
+      result[N - 1] = reinterpret_cast<half_t const &>(tail);
+    }
+
+    #else
+
+    multiply_add<half_t> scalar_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(a[idx], b, c);
+    }
+    #endif
+
+    return result;
+  }
 };
 
 /// Fused multiply-add-relu0
@@ -2385,6 +2443,60 @@ struct multiply_add<Array<bfloat16_t, N>, Array<bfloat16_t, N>, Array<bfloat16_t
 
     return result;
   }
+
+  /// Broadcast overload: computes the multiply-add of (a[i], b, c) for every element of a.
+  /// Device code for SM80+ issues fma.rn.bf16x2 / fma.rn.bf16 PTX; every other build uses the scalar functor.
+  CUTLASS_HOST_DEVICE
+  Array<bfloat16_t, N> operator()(
+    Array<bfloat16_t, N> const &a,
+    bfloat16_t const &b,
+    bfloat16_t const &c) const {
+
+    Array<bfloat16_t, N> result;
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
+
+    // Replicate the raw bits of each scalar into the low and high 16 bits of one 32-bit word.
+    unsigned const b_bits = static_cast<unsigned>(b.raw());
+    unsigned const c_bits = static_cast<unsigned>(c.raw());
+    unsigned const b_packed = b_bits | (b_bits << 16);
+    unsigned const c_packed = c_bits | (c_bits << 16);
+
+    unsigned const *a_pairs = reinterpret_cast<unsigned const *>(&a);
+    unsigned *out_pairs = reinterpret_cast<unsigned *>(&result);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int pair = 0; pair < N / 2; ++pair) {
+      asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n"
+        : "=r"(out_pairs[pair])
+        : "r"(a_pairs[pair]), "r"(b_packed), "r"(c_packed)
+      );
+    }
+
+    // Odd N: the unpaired last element goes through the scalar bf16 instruction.
+    if constexpr (N % 2) {
+      uint16_t *out_elems = reinterpret_cast<uint16_t *>(&result);
+      uint16_t const *a_elems = reinterpret_cast<uint16_t const *>(&a);
+      uint16_t const *b_elem = reinterpret_cast<uint16_t const *>(&b);
+      uint16_t const *c_elem = reinterpret_cast<uint16_t const *>(&c);
+
+      asm ("fma.rn.bf16 %0, %1, %2, %3;\n"
+        : "=h"(out_elems[N - 1])
+        : "h"(a_elems[N - 1]), "h"(b_elem[0]), "h"(c_elem[0])
+      );
+    }
+
+    #else
+
+    multiply_add<bfloat16_t> scalar_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int idx = 0; idx < N; ++idx) {
+      result[idx] = scalar_op(a[idx], b, c);
+    }
+    #endif
+
+    return result;
+  }
 };
 
 
diff --git a/include/cutlass/array_planar_complex.h b/include/cutlass/array_planar_complex.h
index 2dd8aa84..0bd9d0d7 100644
--- a/include/cutlass/array_planar_complex.h
+++ b/include/cutlass/array_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h
index d2e0e5ef..6a61379c 100644
--- a/include/cutlass/array_subbyte.h
+++ b/include/cutlass/array_subbyte.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h
index 6f2373b6..8919e992 100644
--- a/include/cutlass/barrier.h
+++ b/include/cutlass/barrier.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h
index 5af6d3ab..5e2f40b1 100644
--- a/include/cutlass/bfloat16.h
+++ b/include/cutlass/bfloat16.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/blas3.h b/include/cutlass/blas3.h
index d41f1ee6..8788f18b 100644
--- a/include/cutlass/blas3.h
+++ b/include/cutlass/blas3.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/blas3_types.h b/include/cutlass/blas3_types.h
index 653b93b7..e47002b1 100644
--- a/include/cutlass/blas3_types.h
+++ b/include/cutlass/blas3_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/block_striped.h b/include/cutlass/block_striped.h
index 09f3fb04..93665c64 100644
--- a/include/cutlass/block_striped.h
+++ b/include/cutlass/block_striped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/cluster_launch.hpp b/include/cutlass/cluster_launch.hpp
index a0fa22b6..3b089bf6 100644
--- a/include/cutlass/cluster_launch.hpp
+++ b/include/cutlass/cluster_launch.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,7 @@
  **************************************************************************************************/
 
 /*! \file
-    \brief PTX for TMA Tensor Memory Access operators on memory added for SM90
+    \brief CUDA interfaces to launch CUTLASS device-level operators (for >= SM90) that use thread-block clusters.
 */
 
 #pragma once
@@ -75,6 +75,17 @@ namespace cutlass {
 struct ClusterLauncher {
   constexpr static int MaxClusterSize = 32;
 
+  struct LaunchConfig {
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+    cudaLaunchConfig_t launch_config;    // configuration passed to cudaLaunchKernelExC by launch()
+    constexpr static int numAttrs = 2;   // capacity of launch_attribute
+    // NOTE(review): make_cluster_launch_config points launch_config.attrs at this array, so a copy keeps the source's pointer.
+    cudaLaunchAttribute launch_attribute[numAttrs];
+    dim3 gridDim()  const { return launch_config.gridDim;  }  // read-only accessors, callable on a const LaunchConfig
+    dim3 blockDim() const { return launch_config.blockDim; }
+#endif
+  };
+
   // Check for hardware compatibility
   static inline CUTLASS_HOST
   Status check_cluster_dims(dim3 grid, dim3 cluster) {
@@ -124,6 +135,42 @@ struct ClusterLauncher {
 #endif
   }
 
+  /// Fills a LaunchConfig for a launch with thread-block clusters; PDL is opt-in via launch_with_pdl.
+  static inline CUTLASS_HOST
+  LaunchConfig make_cluster_launch_config(
+      dim3 const grid_dims,
+      dim3 const cluster_dims,
+      dim3 const block_dims,
+      size_t const smem_size = 0,
+      cudaStream_t cuda_stream = 0,
+      bool launch_with_pdl = false
+    ) {
+    LaunchConfig cluster_launch_config;
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+    auto &config = cluster_launch_config.launch_config;
+    auto &attributes = cluster_launch_config.launch_attribute;
+    auto const attribute_count = cluster_launch_config.numAttrs;
+
+    attributes[0].id = cudaLaunchAttributeClusterDimension;
+    attributes[0].val.clusterDim = {cluster_dims.x, cluster_dims.y, cluster_dims.z};
+    CUTLASS_TRACE_HOST("ClusterLauncher: Setting ClusterDims = "
+        "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
+    // Last slot holds the PDL attribute: always filled in, but only counted when launch_with_pdl is set.
+    attributes[attribute_count - 1].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attributes[attribute_count - 1].val.programmaticStreamSerializationAllowed = 1;
+
+    config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z};
+    config.blockDim = {block_dims.x, block_dims.y, block_dims.z};
+    config.dynamicSmemBytes = smem_size;
+    config.stream = cuda_stream;
+    config.attrs = attributes;
+    config.numAttrs = launch_with_pdl ? attribute_count : attribute_count - 1;
+#else
+    CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
+#endif
+    return cluster_launch_config;
+  }
+
   // This is the method we expect to use going forward
   static inline CUTLASS_HOST
   Status launch(
@@ -136,7 +183,11 @@ struct ClusterLauncher {
       void** kernel_params,
       bool launch_with_pdl = false) {
 #if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
-    if (check_cluster_dims(grid_dims, cluster_dims) != Status::kSuccess) {
+    LaunchConfig cluster_launch_config = make_cluster_launch_config(grid_dims, cluster_dims,
+                                            block_dims, smem_size, cuda_stream, launch_with_pdl);
+
+    auto launch_grid_dims = cluster_launch_config.gridDim();
+    if (check_cluster_dims(launch_grid_dims, cluster_dims) != Status::kSuccess) {
       CUTLASS_TRACE_HOST("ClusterLauncher: check_cluster_dims() failed. Aborting.");
       return Status::kInvalid;
     }
@@ -147,33 +198,13 @@ struct ClusterLauncher {
       return Status::kInvalid;
     }
 
-    cudaLaunchConfig_t launch_config;
-    launch_config.gridDim = {grid_dims.x, grid_dims.y, grid_dims.z};
-    launch_config.blockDim = {block_dims.x, block_dims.y, block_dims.z};
-    launch_config.dynamicSmemBytes = smem_size;
-    launch_config.stream = cuda_stream;
-
-    cudaLaunchAttribute launch_attribute[2];
-
-    launch_attribute[0].id = cudaLaunchAttributeClusterDimension;
-    launch_attribute[0].val.clusterDim.x = cluster_dims.x;
-    launch_attribute[0].val.clusterDim.y = cluster_dims.y;
-    launch_attribute[0].val.clusterDim.z = cluster_dims.z;
-
-    launch_attribute[1].id = cudaLaunchAttributeProgrammaticStreamSerialization;
-    launch_attribute[1].val.programmaticStreamSerializationAllowed = 1;
-
-    launch_config.numAttrs = launch_with_pdl ? 2 : 1;
-
-    launch_config.attrs = launch_attribute;
-
     CUTLASS_TRACE_HOST("ClusterLauncher: Launching GPC_CLUSTER_GRID GridDims = "
-        "(" << grid_dims.x << ", " << grid_dims.y << ", " << grid_dims.z << "), "
+        "(" << launch_grid_dims.x << ", " << launch_grid_dims.y << ", " << launch_grid_dims.z << "), "
         "And ClusterDims = "
         "(" << cluster_dims.x << ", " << cluster_dims.y << ", " << cluster_dims.z << ")\n");
 
     cutlass::arch::synclog_setup();
-    cudaError_t status = cudaLaunchKernelExC(&launch_config, kernel, kernel_params);
+    cudaError_t status = cudaLaunchKernelExC(&cluster_launch_config.launch_config, kernel, kernel_params);
     Return_Status(status);
 #else
     CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster launch.");
diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h
index 6d0bf31d..723f1e3f 100644
--- a/include/cutlass/complex.h
+++ b/include/cutlass/complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/constants.h b/include/cutlass/constants.h
index 49d96045..f5df0172 100644
--- a/include/cutlass/constants.h
+++ b/include/cutlass/constants.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/builders/sm90_common.inl b/include/cutlass/conv/collective/builders/sm90_common.inl
index 526db83e..c0a48ebc 100644
--- a/include/cutlass/conv/collective/builders/sm90_common.inl
+++ b/include/cutlass/conv/collective/builders/sm90_common.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl b/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
index a08209ef..c298ffb6 100644
--- a/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
+++ b/include/cutlass/conv/collective/builders/sm90_gmma_builder.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/collective_builder.hpp b/include/cutlass/conv/collective/collective_builder.hpp
index 9d6a16c0..278271d7 100644
--- a/include/cutlass/conv/collective/collective_builder.hpp
+++ b/include/cutlass/conv/collective/collective_builder.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/collective_conv.hpp b/include/cutlass/conv/collective/collective_conv.hpp
index d187b5ec..8ecd6c95 100644
--- a/include/cutlass/conv/collective/collective_conv.hpp
+++ b/include/cutlass/conv/collective/collective_conv.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/detail.hpp b/include/cutlass/conv/collective/detail.hpp
index ac272c8e..a9867546 100644
--- a/include/cutlass/conv/collective/detail.hpp
+++ b/include/cutlass/conv/collective/detail.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
index 0e5d898d..f29d5780 100644
--- a/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
+++ b/include/cutlass/conv/collective/sm90_implicit_gemm_gmma_ss_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -390,6 +390,22 @@ public:
       }
     }
 
+    if (is_im2col_A || is_im2col_B) {
+      // Check valid filter offsets for TMA_LOAD_IM2COL, unsigned int ranging from [0, offset_limit - 1]
+      constexpr int32_t offset_limit = 1 << (16 / NumSpatialDimensions);
+      auto flt_data = (ConvOp == conv::Operator::kWgrad) ? problem_shape.shape_C : problem_shape.shape_B;
+      for (int i = 0; i < problem_shape.RankS; ++i) {
+        // flt_data array contains [K, T, R, S, C], so pure filter [T, R, S] starts from the second position in the array
+        implementable = implementable && (flt_data[i+1] * problem_shape.dilation[i] >= 0)
+                                      && (flt_data[i+1] * problem_shape.dilation[i] < offset_limit);
+      }
+
+      if (!implementable) {
+        CUTLASS_TRACE_HOST("  CAN IMPLEMENT: tensor coordinate offset values don't meet requirements for TMA LOAD IM2COL.\n");
+        return false;
+      }
+    }
+
     // Wgrad kernels don't support non-packed output strides, non-packed tensor A stride (linearized)
     if constexpr (ConvOp == conv::Operator::kWgrad) {
 #if defined(CUTLASS_DEBUG_TRACE_LEVEL) && (CUTLASS_DEBUG_TRACE_LEVEL > 1)
diff --git a/include/cutlass/conv/conv2d_problem_size.h b/include/cutlass/conv/conv2d_problem_size.h
index d2e89529..b5d78ce5 100644
--- a/include/cutlass/conv/conv2d_problem_size.h
+++ b/include/cutlass/conv/conv2d_problem_size.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/conv3d_problem_size.h b/include/cutlass/conv/conv3d_problem_size.h
index 9a9514f2..a7e08361 100644
--- a/include/cutlass/conv/conv3d_problem_size.h
+++ b/include/cutlass/conv/conv3d_problem_size.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/convnd_problem_shape.hpp b/include/cutlass/conv/convnd_problem_shape.hpp
index cd2f674f..3c31c21b 100644
--- a/include/cutlass/conv/convnd_problem_shape.hpp
+++ b/include/cutlass/conv/convnd_problem_shape.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/convolution.h b/include/cutlass/conv/convolution.h
index 243ee269..a3cc98b4 100644
--- a/include/cutlass/conv/convolution.h
+++ b/include/cutlass/conv/convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/detail.hpp b/include/cutlass/conv/detail.hpp
index 3e417356..0802921d 100644
--- a/include/cutlass/conv/detail.hpp
+++ b/include/cutlass/conv/detail.hpp
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/device/conv_universal_adapter.hpp b/include/cutlass/conv/device/conv_universal_adapter.hpp
index 193f8d88..4437ae15 100644
--- a/include/cutlass/conv/device/conv_universal_adapter.hpp
+++ b/include/cutlass/conv/device/conv_universal_adapter.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/device/direct_convolution.h b/include/cutlass/conv/device/direct_convolution.h
index 43ab94b5..387574b9 100644
--- a/include/cutlass/conv/device/direct_convolution.h
+++ b/include/cutlass/conv/device/direct_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution.h b/include/cutlass/conv/device/implicit_gemm_convolution.h
index a1cb06e9..f166afc8 100644
--- a/include/cutlass/conv/device/implicit_gemm_convolution.h
+++ b/include/cutlass/conv/device/implicit_gemm_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
index 265156cc..efd3dcba 100644
--- a/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
+++ b/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/dispatch_policy.hpp b/include/cutlass/conv/dispatch_policy.hpp
index b8b5eb2b..d9e20f46 100644
--- a/include/cutlass/conv/dispatch_policy.hpp
+++ b/include/cutlass/conv/dispatch_policy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/conv_universal.hpp b/include/cutlass/conv/kernel/conv_universal.hpp
index 23ccea2f..c9bd4b9f 100644
--- a/include/cutlass/conv/kernel/conv_universal.hpp
+++ b/include/cutlass/conv/kernel/conv_universal.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -60,6 +60,5 @@ class ConvUniversal {
 } // namespace cutlass::conv::kernel
 
 ////////////////////////////////////////////////////////////////////////////////
-
 #include "cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp"
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/conv/kernel/default_conv2d.h b/include/cutlass/conv/kernel/default_conv2d.h
index 79bedb2c..f9647a59 100644
--- a/include/cutlass/conv/kernel/default_conv2d.h
+++ b/include/cutlass/conv/kernel/default_conv2d.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_dgrad.h b/include/cutlass/conv/kernel/default_conv2d_dgrad.h
index c5a8b131..27a96a56 100644
--- a/include/cutlass/conv/kernel/default_conv2d_dgrad.h
+++ b/include/cutlass/conv/kernel/default_conv2d_dgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop.h b/include/cutlass/conv/kernel/default_conv2d_fprop.h
index 9fbd97e5..77e4c5dc 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
index 8589ace0..107a1be6 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
index 76bc1288..ccc75153 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
index 0825789c..b7fca981 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
index e6e8a822..5c2c7ffc 100644
--- a/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
+++ b/include/cutlass/conv/kernel/default_conv2d_fprop_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_group_fprop.h b/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
index e2deaf6f..99e353d8 100644
--- a/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
+++ b/include/cutlass/conv/kernel/default_conv2d_group_fprop.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad.h b/include/cutlass/conv/kernel/default_conv2d_wgrad.h
index d0e52dfe..d55d453e 100644
--- a/include/cutlass/conv/kernel/default_conv2d_wgrad.h
+++ b/include/cutlass/conv/kernel/default_conv2d_wgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
index 110e07db..83b680ec 100644
--- a/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
+++ b/include/cutlass/conv/kernel/default_conv2d_wgrad_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv3d_dgrad.h b/include/cutlass/conv/kernel/default_conv3d_dgrad.h
index cb50ba49..309924ce 100644
--- a/include/cutlass/conv/kernel/default_conv3d_dgrad.h
+++ b/include/cutlass/conv/kernel/default_conv3d_dgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop.h b/include/cutlass/conv/kernel/default_conv3d_fprop.h
index 41fdd64a..4b6709f0 100644
--- a/include/cutlass/conv/kernel/default_conv3d_fprop.h
+++ b/include/cutlass/conv/kernel/default_conv3d_fprop.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h b/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
index d0457d57..024fb820 100644
--- a/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
+++ b/include/cutlass/conv/kernel/default_conv3d_fprop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h b/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
index 0fc291e6..2fb12c2a 100644
--- a/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
+++ b/include/cutlass/conv/kernel/default_conv3d_fprop_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_conv3d_wgrad.h b/include/cutlass/conv/kernel/default_conv3d_wgrad.h
index 4ed5e0c1..6b50d208 100644
--- a/include/cutlass/conv/kernel/default_conv3d_wgrad.h
+++ b/include/cutlass/conv/kernel/default_conv3d_wgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_deconv2d.h b/include/cutlass/conv/kernel/default_deconv2d.h
index 4db152cd..a58046ff 100644
--- a/include/cutlass/conv/kernel/default_deconv2d.h
+++ b/include/cutlass/conv/kernel/default_deconv2d.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h b/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
index d11432ed..e62187e3 100644
--- a/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
+++ b/include/cutlass/conv/kernel/default_deconv2d_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_deconv3d.h b/include/cutlass/conv/kernel/default_deconv3d.h
index 70800c7a..cb7ca07e 100644
--- a/include/cutlass/conv/kernel/default_deconv3d.h
+++ b/include/cutlass/conv/kernel/default_deconv3d.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h b/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
index affe7a06..e25c8b2e 100644
--- a/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
+++ b/include/cutlass/conv/kernel/default_deconv3d_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/default_depthwise_fprop.h b/include/cutlass/conv/kernel/default_depthwise_fprop.h
index aa4f2c35..ba70813e 100644
--- a/include/cutlass/conv/kernel/default_depthwise_fprop.h
+++ b/include/cutlass/conv/kernel/default_depthwise_fprop.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/direct_convolution.h b/include/cutlass/conv/kernel/direct_convolution.h
index d4e98fa4..8c049887 100644
--- a/include/cutlass/conv/kernel/direct_convolution.h
+++ b/include/cutlass/conv/kernel/direct_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution.h b/include/cutlass/conv/kernel/implicit_gemm_convolution.h
index b1e0b477..d3fa0e90 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
index 74ecae40..5451c176 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
index bf00f90b..071854cd 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_strided_dgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
index b05fd2d3..0113473f 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
index 1f27e068..1e810e3d 100644
--- a/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
+++ b/include/cutlass/conv/kernel/implicit_gemm_convolution_with_fused_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp b/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
index 657ac6b3..2c02a453 100644
--- a/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
+++ b/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/thread/depthwise_mma.h b/include/cutlass/conv/thread/depthwise_mma.h
index 37ece792..41eaba2f 100644
--- a/include/cutlass/conv/thread/depthwise_mma.h
+++ b/include/cutlass/conv/thread/depthwise_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
index 978c14fe..2da2b73b 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
index 6fb1cb18..8a5e60b9 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_filter_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
index 1de41f3f..b33645c1 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
index ffa13c93..638c6607 100644
--- a/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
index 9317ea0c..e4eb011e 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
index 5a4489c0..c608ce53 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
index 3f1f2bc1..ed0e38c2 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
index 243d724b..1a5c33e8 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
index 1725db5a..ed200ed3 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
index a1291aa0..f208c9a5 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_few_channels.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
index e90d5017..2dc2151d 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_fixed_channels.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
index 4c2343c3..9b12fbe3 100644
--- a/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_fprop_filter_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_params.h b/include/cutlass/conv/threadblock/conv2d_params.h
index d34bc9fa..8a3828fc 100644
--- a/include/cutlass/conv/threadblock/conv2d_params.h
+++ b/include/cutlass/conv/threadblock/conv2d_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
index 17f4594b..13bd29b7 100644
--- a/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
+++ b/include/cutlass/conv/threadblock/conv2d_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
index 3e3a4f15..b5a24077 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
index 8cbcc3d9..56197279 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_activation_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
index 793649db..ea48bc6d 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
index 07233d89..8e5048fd 100644
--- a/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv2d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
index 943ab88c..d996003f 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
index 2d5837dd..a269b18b 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_filter_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
index 30b7f2fc..700c3d12 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
index 5a53c8cb..69915bab 100644
--- a/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_dgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
index f0f9a86a..5a888e0f 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
index 78b270eb..057023c0 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
index 9f04adc4..4a40d37e 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
index efe34497..b4e7db3a 100644
--- a/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_fprop_filter_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_params.h b/include/cutlass/conv/threadblock/conv3d_params.h
index ac422b8f..941f4e1d 100644
--- a/include/cutlass/conv/threadblock/conv3d_params.h
+++ b/include/cutlass/conv/threadblock/conv3d_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
index cc8faea7..97cad0a1 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
index 2b10d207..7e5475f8 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_activation_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
index be9d4fb7..cbe49985 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_analytic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
index 0ef145f1..6c2f2e51 100644
--- a/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
+++ b/include/cutlass/conv/threadblock/conv3d_wgrad_output_gradient_tile_access_iterator_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h b/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
index 80231834..f5cd2a74 100644
--- a/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
+++ b/include/cutlass/conv/threadblock/depthwise_direct_conv_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
index 192d9610..012e306d 100644
--- a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
+++ b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_fixed_stride_dilation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
index a858a23f..b8ae9b93 100644
--- a/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
+++ b/include/cutlass/conv/threadblock/depthwise_fprop_activation_tile_access_iterator_direct_conv_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h b/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
index 50aeee00..846f1f3a 100644
--- a/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
+++ b/include/cutlass/conv/threadblock/depthwise_fprop_direct_conv_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h b/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
index 52d604e4..1035fda3 100644
--- a/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
+++ b/include/cutlass/conv/threadblock/depthwise_fprop_filter_tile_access_iterator_direct_conv_optimized.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
index c2825fa6..30d13e90 100644
--- a/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
+++ b/include/cutlass/conv/threadblock/depthwise_fprop_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_mma_base.h b/include/cutlass/conv/threadblock/depthwise_mma_base.h
index 967587be..44dafcb5 100644
--- a/include/cutlass/conv/threadblock/depthwise_mma_base.h
+++ b/include/cutlass/conv/threadblock/depthwise_mma_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h b/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
index de84180f..9e3cc417 100644
--- a/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
+++ b/include/cutlass/conv/threadblock/depthwise_mma_core_with_lane_access_size.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
index 3bee07d0..482a52fe 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_fprop_fusion_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
index eea7743a..6c9c4792 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
index 79bcb78a..45e27949 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h b/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
index 1ec0c61d..3be08c1a 100644
--- a/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
+++ b/include/cutlass/conv/threadblock/implicit_gemm_wgrad_fusion_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
index bfe9a398..dac64238 100644
--- a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
+++ b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
index 24f0de4c..e9844be9 100644
--- a/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
+++ b/include/cutlass/conv/threadblock/predicated_scale_bias_vector_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/threadblock/threadblock_swizzle.h b/include/cutlass/conv/threadblock/threadblock_swizzle.h
index 67418e68..0c5aed6d 100644
--- a/include/cutlass/conv/threadblock/threadblock_swizzle.h
+++ b/include/cutlass/conv/threadblock/threadblock_swizzle.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/warp/mma_depthwise_simt.h b/include/cutlass/conv/warp/mma_depthwise_simt.h
index ed385df0..b7af2e37 100644
--- a/include/cutlass/conv/warp/mma_depthwise_simt.h
+++ b/include/cutlass/conv/warp/mma_depthwise_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h b/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
index 26d9638b..47fd1e08 100644
--- a/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
+++ b/include/cutlass/conv/warp/mma_depthwise_simt_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/conv/warp/scale_bias_relu_transform.h b/include/cutlass/conv/warp/scale_bias_relu_transform.h
index 4da31ab8..6cb3935a 100644
--- a/include/cutlass/conv/warp/scale_bias_relu_transform.h
+++ b/include/cutlass/conv/warp/scale_bias_relu_transform.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h
index fe884d70..c0199e15 100644
--- a/include/cutlass/coord.h
+++ b/include/cutlass/coord.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h
index 40ae2224..577638ef 100644
--- a/include/cutlass/core_io.h
+++ b/include/cutlass/core_io.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/cuda_host_adapter.hpp b/include/cutlass/cuda_host_adapter.hpp
index 2adfd266..b2240c51 100644
--- a/include/cutlass/cuda_host_adapter.hpp
+++ b/include/cutlass/cuda_host_adapter.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h
index e12616a2..ed81aec9 100644
--- a/include/cutlass/cutlass.h
+++ b/include/cutlass/cutlass.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/detail/collective.hpp b/include/cutlass/detail/collective.hpp
index 9d8f9e2f..d7a83d04 100644
--- a/include/cutlass/detail/collective.hpp
+++ b/include/cutlass/detail/collective.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/detail/collective/mixed_input_utils.hpp b/include/cutlass/detail/collective/mixed_input_utils.hpp
index c740eb98..6f7d35b0 100644
--- a/include/cutlass/detail/collective/mixed_input_utils.hpp
+++ b/include/cutlass/detail/collective/mixed_input_utils.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -237,7 +237,7 @@ struct LayoutAwareConvertImpl<
   }
 };
 
-// Specialization for UINT4 -> FP16 with [02461357] value order
+// Specialization for UINT4 -> FP16 with [02461357] value order
 template <>
 struct LayoutAwareConvertImpl<
   cutlass::uint4b_t,
@@ -804,15 +804,14 @@ public:
         {
           auto&& scale_neg_ = reinterpret_cast<cutlass::Array<uint32_t, 2> const&>(scales_neg_vm_(i));
           auto&& scale_pos_ = reinterpret_cast<cutlass::Array<uint32_t, 2>      &>(scales_pos_vm_(i));
-          constexpr uint32_t immLut = (0xf0 & 0xcc) ^ 0xaa;
           asm volatile(
               "{\n"
-              "  lop3 .b32 %0, %2, %4, %5, %6;\n" \
-              "  xor  .b32 %1, %3, %5;        \n" \
+              "  and  .b32 %0, %2, %4             ;\n" \
+              "  and  .b32 %1, %3, %5             ;\n" \
               "}\n"
               : "=r"(scale_pos_[0]), "=r"(scale_pos_[1])
-              : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0xFFFFFF00), "n"(0x80808080), "n"(immLut)
-            );
+              : "r"(scale_neg_[0]), "r"(scale_neg_[1]), "n"(0x7F7F7F00), "n"(0x7F7F7F7F)
+              );
         }
       }
       CUTLASS_PRAGMA_UNROLL
diff --git a/include/cutlass/detail/dependent_false.hpp b/include/cutlass/detail/dependent_false.hpp
index 76e52d2b..d2dd6a16 100644
--- a/include/cutlass/detail/dependent_false.hpp
+++ b/include/cutlass/detail/dependent_false.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/detail/helper_macros.hpp b/include/cutlass/detail/helper_macros.hpp
index 039f5e84..1d58d30f 100644
--- a/include/cutlass/detail/helper_macros.hpp
+++ b/include/cutlass/detail/helper_macros.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -60,7 +60,7 @@
 #if ! defined(_MSC_VER)
 #define CUTLASS_LAMBDA_FUNC_INLINE __attribute__((always_inline))
 #else
-#define CUTLASS_LAMBDA_FUNC_INLINE [[msvc::forceinline]]
+#define CUTLASS_LAMBDA_FUNC_INLINE
 #endif
 
 #define CUTLASS_HOST __host__
diff --git a/include/cutlass/detail/layout.hpp b/include/cutlass/detail/layout.hpp
index cbed61f6..79a1f97b 100644
--- a/include/cutlass/detail/layout.hpp
+++ b/include/cutlass/detail/layout.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp b/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp
index 914443dd..84de1c7d 100644
--- a/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp
+++ b/include/cutlass/detail/mainloop_fusion_helper_scale_factor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/detail/mma.hpp b/include/cutlass/detail/mma.hpp
index 0e491b9c..0f2d0e1b 100644
--- a/include/cutlass/detail/mma.hpp
+++ b/include/cutlass/detail/mma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h
index cc7caede..40e19a37 100644
--- a/include/cutlass/device_kernel.h
+++ b/include/cutlass/device_kernel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl
index 720dcc00..13924280 100644
--- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl
+++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -442,6 +442,7 @@ struct CollectiveBuilder<
     cute::is_same_v<Schedule, NoSmemWarpSpecialized>,
     cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
       cutlass::epilogue::collective::DefaultEpilogue<
+        ElementC_,
         cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
         cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
         ThreadOp,
@@ -449,6 +450,7 @@ struct CollectiveBuilder<
     // Epilogue for Ptr-Array and Grouped Gemm
     cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
       cutlass::epilogue::collective::DefaultEpilogueArray<
+        ElementC_,
         cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
         cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
         ThreadOp,
@@ -801,6 +803,7 @@ struct CollectiveBuilder<
 
   using CollectiveOp = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<
     cutlass::epilogue::collective::DefaultEpilogue<
+      ElementC_,
       cutlass::detail::TagToStrideC_t<GmemLayoutTagC>,
       cutlass::detail::TagToStrideC_t<GmemLayoutTagD>,
       ThreadOp,
diff --git a/include/cutlass/epilogue/collective/builders/sm90_common.inl b/include/cutlass/epilogue/collective/builders/sm90_common.inl
index cd2639c5..a6affcfc 100644
--- a/include/cutlass/epilogue/collective/builders/sm90_common.inl
+++ b/include/cutlass/epilogue/collective/builders/sm90_common.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/collective_builder.hpp b/include/cutlass/epilogue/collective/collective_builder.hpp
index d54cd0a8..f9539302 100644
--- a/include/cutlass/epilogue/collective/collective_builder.hpp
+++ b/include/cutlass/epilogue/collective/collective_builder.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/collective_epilogue.hpp b/include/cutlass/epilogue/collective/collective_epilogue.hpp
index 8fb1a958..4a6e558b 100644
--- a/include/cutlass/epilogue/collective/collective_epilogue.hpp
+++ b/include/cutlass/epilogue/collective/collective_epilogue.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp
index cd4a6ccd..562f7724 100644
--- a/include/cutlass/epilogue/collective/default_epilogue.hpp
+++ b/include/cutlass/epilogue/collective/default_epilogue.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -53,6 +53,7 @@ namespace collective {
 /// Applies an element wise operation to all elements within the fragment
 /// and writes them out to destination storage.
 template <
+  class ElementC_,
   class StrideC_,
   class StrideD_,
   class ThreadEpilogueOp_,
@@ -72,11 +73,13 @@ public:
   using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
   using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
   using ElementScalar = ElementCompute;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using ElementC = ElementC_;
   using StrideC = StrideC_;
   using ElementD = typename ThreadEpilogueOp::ElementD;
   using StrideD = StrideD_;
 
+  using GmemElementC = cute::conditional_t<cute::is_void_v<ElementC>, ElementD, ElementC>; // prevents void ref breakages
+
   using GmemTiledCopyC = void;
   using GmemTiledCopyD = void;
 
@@ -183,7 +186,7 @@ public:
     auto stride_d = detail::get_epilogue_stride<EpilogueSchedule>(params.dD);
 
     // Represent the full output tensor
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), stride_c);                 // (m,n,l)
+    Tensor mC_mnl = make_tensor(make_gmem_ptr<GmemElementC>(params.ptr_C), make_shape(M,N,L), stride_c);   // (m,n,l)
     Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M,N,L), stride_d);                 // (m,n,l)
     Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
     Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
diff --git a/include/cutlass/epilogue/collective/default_epilogue_array.hpp b/include/cutlass/epilogue/collective/default_epilogue_array.hpp
index da7562b4..e4d0fc89 100644
--- a/include/cutlass/epilogue/collective/default_epilogue_array.hpp
+++ b/include/cutlass/epilogue/collective/default_epilogue_array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -54,6 +54,7 @@ namespace collective {
 // Applies an element wise operation to all elements within the fragment
 // and writes them out to destination storage.
 template <
+  class ElementC_,
   class StrideC_,
   class StrideD_,
   class ThreadEpilogueOp_,
@@ -73,13 +74,15 @@ public:
   using ElementAccumulator = typename ThreadEpilogueOp::ElementAccumulator;
   using ElementCompute = typename ThreadEpilogueOp::ElementCompute;
   using ElementScalar = ElementCompute;
-  using ElementC = typename ThreadEpilogueOp::ElementC;
+  using ElementC = ElementC_;
   using StrideC = StrideC_;
   using InternalStrideC = cute::remove_pointer_t<StrideC>;
   using ElementD = typename ThreadEpilogueOp::ElementD;
   using StrideD = StrideD_;
   using InternalStrideD = cute::remove_pointer_t<StrideD>;
 
+  using GmemElementC = cute::conditional_t<cute::is_void_v<ElementC>, ElementD, ElementC>; // prevents void ref breakages
+
   using GmemTiledCopyC = void;
   using GmemTiledCopyD = void;
 
@@ -227,7 +230,7 @@ public:
     if (epilogue_op.is_source_needed()) {
       ptr_C_l = params.ptr_C[l_coord];
     }
-    Tensor mC_mnl = make_tensor(make_gmem_ptr(ptr_C_l), make_shape(M,N,mock_L), stride_c);      // (m,n,l)
+    Tensor mC_mnl = make_tensor(make_gmem_ptr<GmemElementC>(ptr_C_l), make_shape(M,N,mock_L), stride_c);      // (m,n,l)
     Tensor mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D[l_coord]), make_shape(M,N,mock_L), stride_d);      // (m,n,l)
     Tensor gC_mnl = local_tile(mC_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
     Tensor gD_mnl = local_tile(mD_mnl, blk_shape_MNK, make_coord(_,_,_), Step<_1,_1, X>{});    // (BLK_M,BLK_N,m,n,l)
diff --git a/include/cutlass/epilogue/collective/detail.hpp b/include/cutlass/epilogue/collective/detail.hpp
index 23e57d99..d5194064 100644
--- a/include/cutlass/epilogue/collective/detail.hpp
+++ b/include/cutlass/epilogue/collective/detail.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp b/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
index 48833ecf..d32dd6ae 100644
--- a/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
+++ b/include/cutlass/epilogue/collective/epilogue_tensor_broadcast.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
index a8083dab..81b08fcb 100644
--- a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
+++ b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
index 8a70370b..5030efde 100644
--- a/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
+++ b/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
index 54fe9b1d..f8c5b287 100644
--- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -851,6 +851,9 @@ public:
 
     // Pre-loop fusion callback entry point
     cst_callbacks.begin();
+    if (cst_callbacks.begin_sync_needed()) {
+      synchronize();
+    }
 
     // For each output tile
     CUTLASS_PRAGMA_UNROLL
diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
index b3c7bf38..f13a6b6f 100644
--- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -43,6 +43,7 @@
 #include "cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp"
 #include "cutlass/detail/collective.hpp"
 #include "cutlass/detail/layout.hpp"
+#include "cutlass/detail/helper_macros.hpp"
 #include "cutlass/trace.h"
 
 #include "cute/tensor.hpp"
@@ -584,7 +585,7 @@ public:
     TiledCopy tiled_copy_C_atom = make_tiled_copy_C_atom(CopyAtomC{}, tiled_mma);
 
     // (t)hread-partition for (r)egister to (r)egister copy (tRR_)
-    TiledCopy tiled_r2r = [&]() {
+    TiledCopy tiled_r2r = [&]() CUTLASS_LAMBDA_FUNC_INLINE {
       if constexpr (IsUseR2R) {
         return make_tiled_copy_S(Copy_Atom<CopyOpR2R, ElementCompute>{}, tiled_copy_C_atom);
       }
@@ -596,7 +597,7 @@ public:
     ThrCopy thread_r2r = tiled_r2r.get_slice(thread_idx);
 
     // (t)hread-partition for (r)egister to (s)mem copy (tRS_)
-    TiledCopy tiled_r2s = [&]() {
+    TiledCopy tiled_r2s = [&]() CUTLASS_LAMBDA_FUNC_INLINE {
       if constexpr (IsUseR2R) {
         return make_tiled_copy_D(Copy_Atom<CopyOpR2S,SmemElementD>{}, tiled_r2r);
       }
@@ -647,7 +648,7 @@ public:
     // Absolute coordinate tensors (dynamic)
     Tensor mD_crd = make_identity_tensor(make_shape(M,N));                                                     // (M,N)
     Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord));          // (CTA_M,CTA_N)
-    Tensor tRS_cD_mn = [&]() {
+    Tensor tRS_cD_mn = [&]() CUTLASS_LAMBDA_FUNC_INLINE {
       if constexpr (IsUseR2R) {
         // (t)hread-partition for ConsumerStoreCallbacks. 
         TiledCopy tiled_cst = make_tiled_copy_S(Copy_Atom<CopyOpR2S,SmemElementC>{}, tiled_copy_C_atom);
@@ -699,7 +700,7 @@ public:
 
     // Thread synchronizer for previously issued waits or fences
     // to ensure visibility of smem reads/writes to threads or TMA unit
-    auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
+    auto synchronize = [&] () CUTLASS_LAMBDA_FUNC_INLINE { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
 
     // Predication for TMA store (one warp issues TMA store)
     bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0;
@@ -722,7 +723,7 @@ public:
     static_assert(not (DelayTmaStore and ReuseSmemC and StagesC <= StagesD), "This TMA epilogue configuration will deadlock");
 
     // The TMA store sequence for one subtile iteration
-    auto tma_store_fn = [&] (int epi_m, int epi_n) {
+    auto tma_store_fn = [&] (int epi_m, int epi_n) CUTLASS_LAMBDA_FUNC_INLINE {
       // Write the tile from smem to gmem with TMA
       cutlass::arch::fence_view_async_shared(); // ensure smem writes are visible to TMA
       synchronize(); // ensure all threads have issued their async fence
@@ -767,6 +768,9 @@ public:
 
     // Pre-loop fusion callback entry point
     cst_callbacks.begin();
+    if (cst_callbacks.begin_sync_needed()) {
+      synchronize();
+    }
 
     // For each output tile
     CUTLASS_PRAGMA_UNROLL
diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
index 97490400..2d5fd858 100644
--- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
+++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized_bias_elementwise.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/dispatch_policy.hpp b/include/cutlass/epilogue/dispatch_policy.hpp
index a5f47f08..4b18040a 100644
--- a/include/cutlass/epilogue/dispatch_policy.hpp
+++ b/include/cutlass/epilogue/dispatch_policy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/fusion/callbacks.hpp b/include/cutlass/epilogue/fusion/callbacks.hpp
index 9ee37234..c89db7f8 100644
--- a/include/cutlass/epilogue/fusion/callbacks.hpp
+++ b/include/cutlass/epilogue/fusion/callbacks.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/fusion/operations.hpp b/include/cutlass/epilogue/fusion/operations.hpp
index 1ef06a53..e1c53dac 100644
--- a/include/cutlass/epilogue/fusion/operations.hpp
+++ b/include/cutlass/epilogue/fusion/operations.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -194,6 +194,20 @@ struct LinCombPerRowBiasEltAct
   static constexpr bool IsEltActSupported = true;
 };
 
+// Grouped Wgrad: D = alpha * acc + beta * C with a specialized AccFetch; derives from LinearCombination and adds GroupsPerTile.
+template<
+  class GroupsPerTile_,
+  class ElementOutput_,
+  class ElementCompute_,
+  class ElementSource_ = ElementOutput_,
+  class ElementScalar_ = ElementCompute_,
+  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
+>
+struct LinearCombinationGroupedWgrad
+    : LinearCombination<ElementOutput_, ElementCompute_, ElementSource_, ElementScalar_, RoundStyle_> {
+  using GroupsPerTile = GroupsPerTile_; // NOTE(review): presumably the number of conv groups covered by one tile - confirm
+};
+
 // D = activation(alpha * acc + beta * C + per-column bias)
 template<
   template <class> class ActivationFn_,
@@ -279,27 +293,6 @@ struct PerRowLinCombPerRowBiasEltAct
   static constexpr bool IsPerRowScaleSupported = true;
 };
 
-// D = per-column alpha * per-row alpha * acc + beta * C
-template<
-  class ElementOutput_,
-  class ElementCompute_,
-  class ElementSource_ = ElementCompute_,
-  class ElementScalar_ = ElementCompute_,
-  int AlignmentScalar_ = 128 / cute::sizeof_bits_v<ElementScalar_>,
-  FloatRoundStyle RoundStyle_ = FloatRoundStyle::round_to_nearest
->
-struct OuterProdLinComb : FusionOperation {
-  using ElementOutput = ElementOutput_;
-  using ElementCompute = ElementCompute_;
-  using ElementSource = ElementSource_;
-  using ElementScalar = ElementScalar_;
-  static constexpr int AlignmentScalar = AlignmentScalar_;
-  static constexpr auto RoundStyle = RoundStyle_;
-  static constexpr bool IsSourceSupported = true;
-  static constexpr bool IsPerRowScaleSupported = true;
-  static constexpr bool IsPerColScaleSupported = true;
-};
-
 // D = activation(per-col alpha * acc + per-col beta * C + per-column bias)
 template<
   template <class> class ActivationFn_,
diff --git a/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
index 3e57fa0b..87258c69 100644
--- a/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -2499,87 +2499,6 @@ struct FusionCallbacks<
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-// D = per-column alpha * per-row alpha * acc + beta * c
-template<
-  class CtaTileShapeMNK,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource = ElementOutput,
-  class ElementScalar = ElementCompute,
-  int AlignmentScalar = 128 / sizeof_bits_v<ElementScalar>, // Alignment of per-column and per-row scaling vectors
-  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
->
-using Sm90OuterProdLinComb =
-  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // c(beta) * c(C) + c(alpha * acc)
-    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int>>, // beta
-    Sm90SrcFetch<ElementSource>, // C
-    Sm90EVT<Sm90Compute<multiplies, ElementCompute, ElementCompute, RoundStyle>, // c(alpha) * c(acc)
-      Sm90OuterProduct<0, CtaTileShapeMNK, ElementScalar, Stride<_1,_0,int>, Stride<_0,_1,int>, AlignmentScalar>, // alpha_col * alpha_row
-      Sm90AccFetch // acc
-    >
-  >;
-
-template <
-  int StagesC,
-  int StagesD,
-  int FragmentSize,
-  bool ReuseSmemC,
-  bool DelayTmaStore,
-  class ElementOutput,
-  class ElementCompute,
-  class ElementSource,
-  class ElementScalar,
-  int AlignmentScalar,
-  FloatRoundStyle RoundStyle,
-  class CtaTileShapeMNK,
-  class EpilogueTile
->
-struct FusionCallbacks<
-    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
-    OuterProdLinComb<ElementOutput, ElementCompute, ElementSource, ElementScalar, AlignmentScalar, RoundStyle>,
-    CtaTileShapeMNK,
-    EpilogueTile
-> : Sm90OuterProdLinComb<CtaTileShapeMNK, ElementOutput, ElementCompute, ElementSource, ElementScalar, AlignmentScalar, RoundStyle> {
-  using Impl = Sm90OuterProdLinComb<CtaTileShapeMNK, ElementOutput, ElementCompute, ElementSource, ElementScalar, AlignmentScalar, RoundStyle>;
-  using Operation = OuterProdLinComb<ElementOutput, ElementCompute, ElementSource, ElementScalar, AlignmentScalar, RoundStyle>;
-
-  struct Arguments {
-
-    // Give a name and flat ordering to the fusion callback args
-    using StrideCol  = Stride<_1,_0,int>;
-    using StrideRow  = Stride<_0,_1,int>;
-    using StrideBeta = Stride<_0,_0,int>;
-    ElementScalar const* alpha_ptr_col = nullptr;
-    ElementScalar const* alpha_ptr_row = nullptr;
-    ElementScalar        beta = static_cast<ElementScalar>(0);
-    ElementScalar const* beta_ptr = nullptr;
-    StrideCol  dAlphaCol = {};
-    StrideRow  dAlphaRow = {};
-    StrideBeta dBeta     = {};
-
-    // Conversion to the args expected by the visitor implementation
-    // to_underlying_arguments will implicitly call this
-    operator typename Impl::Arguments() const {
-      return
-        {
-          {beta, beta_ptr, dBeta}, // leaf args : beta
-          {},                      // leaf args : C
-          {
-            { alpha_ptr_col, alpha_ptr_row, dAlphaCol, dAlphaRow }, // leaf args : alpha cols / rows
-            {},                                                     // leaf args : acc
-            {}
-          },
-          {}
-        };
-    }
-  };
-
-  // Ctor inheritance
-  using Impl::Impl;
-};
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
 // D = softmax(top_k(alpha * acc + beta * C))
 template<
   int TopK,
@@ -2652,6 +2571,83 @@ struct FusionCallbacks<
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+// Grouped Wgrad Conv: D = beta * C + (alpha * acc), with acc supplied by Sm90AccFetchGroupedWgrad<GroupsPerTile>
+template<
+  class GroupsPerTile,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource = ElementOutput,
+  class ElementScalar = ElementCompute,
+  FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest
+>
+using Sm90LinearCombinationGroupedWgrad =
+  Sm90EVT<Sm90Compute<homogeneous_multiply_add, ElementOutput, ElementCompute, RoundStyle>, // beta * C + (alpha * acc)
+    Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // beta
+    Sm90SrcFetch<ElementSource>, // C
+    Sm90EVT<Sm90Compute<multiplies, ElementCompute, ElementCompute, RoundStyle>, // alpha * acc
+      Sm90ScalarBroadcast<ElementScalar, Stride<_0,_0,int64_t>>, // alpha
+      Sm90AccFetchGroupedWgrad<GroupsPerTile> // acc, fetched through the grouped-Wgrad accumulator visitor
+    >
+  >;
+
+template <
+  int StagesC,
+  int StagesD,
+  int FragmentSize,
+  bool ReuseSmemC,
+  bool DelayTmaStore,
+  class ElementOutput,
+  class ElementCompute,
+  class ElementSource,
+  class ElementScalar,
+  FloatRoundStyle RoundStyle,
+  class CtaTileShapeMNK,
+  class EpilogueTile,
+  class GroupsPerTile
+>
+struct FusionCallbacks<
+    epilogue::Sm90TmaWarpSpecialized<StagesC, StagesD, FragmentSize, ReuseSmemC, DelayTmaStore>,
+    fusion::LinearCombinationGroupedWgrad<GroupsPerTile, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>,
+    CtaTileShapeMNK,
+    EpilogueTile
+> : Sm90LinearCombinationGroupedWgrad<GroupsPerTile, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle> {
+  // Callbacks for the grouped Wgrad linear combination. Impl uses the unpacked element type of ElementOutput; Operation keeps ElementOutput as given.
+  using Impl = Sm90LinearCombinationGroupedWgrad<GroupsPerTile, typename cutlass::detail::get_unpacked_element_type<ElementOutput>::type, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+  using Operation = fusion::LinearCombinationGroupedWgrad<GroupsPerTile, ElementOutput, ElementCompute, ElementSource, ElementScalar, RoundStyle>;
+
+  struct Arguments {
+    ElementScalar alpha = ElementScalar(1); // scalar alpha, defaults to 1
+    ElementScalar beta = ElementScalar(0);  // scalar beta, defaults to 0
+    //ElementScalar groups = ElementScalar(1);
+    ElementScalar const* alpha_ptr = nullptr; // NOTE(review): presumably read instead of alpha when non-null - confirm in Sm90ScalarBroadcast
+    ElementScalar const* beta_ptr = nullptr;  // NOTE(review): presumably read instead of beta when non-null - confirm in Sm90ScalarBroadcast
+
+    using StrideAlpha = Stride<_0,_0,int64_t>; // first two modes are static zero, third mode is a runtime int64_t
+    using StrideBeta  = Stride<_0,_0,int64_t>; // same layout as StrideAlpha
+    StrideAlpha dAlpha = {_0{}, _0{}, 0};
+    StrideBeta  dBeta  = {_0{}, _0{}, 0};
+    // Converts to the nested Impl::Arguments; the initializer order is beta, C, (alpha, acc, multiplies), multiply_add.
+    operator typename Impl::Arguments() const {
+      return
+        {    // ternary op : beta * C + (alpha * acc)
+          {{beta}, {beta_ptr}, {dBeta}}, // leaf args : beta
+          {},                   // leaf args : C
+          {                     // binary op : alpha * acc
+            {{alpha}, {alpha_ptr}, {dAlpha}}, // leaf args : alpha
+            {},                     // leaf args : acc
+            {}                  // binary args : multiplies
+          },                    // end binary op
+          {} // ternary args : multiply_add
+        };   // end ternary op
+    }
+  };
+
+  // Ctor inheritance
+  using Impl::Impl;
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 namespace detail {
 template <class FusionOpOrCallbacks, class = cute::void_t<>>
 struct get_element_aux {
diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
index 321daa6b..0e715a78 100644
--- a/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -39,6 +39,7 @@
 #include "cutlass/array.h"
 #include "cutlass/numeric_conversion.h"
 #include "cutlass/epilogue/thread/activation.h"
+#include "cutlass/detail/helper_macros.hpp"
 
 #include "cute/tensor.hpp"
 
@@ -172,14 +173,14 @@ public:
     visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n,
           Array<ElementInputs, FragmentSize> const&... frg_inputs) {
       return transform_apply(cute::make_tuple(frg_inputs...),
-        [&] (auto&& frg_input) {
+        [&] (auto&& frg_input) CUTLASS_LAMBDA_FUNC_INLINE {
           using ElementInput = typename cute::remove_cvref_t<decltype(frg_input)>::Element;
           using ConvertInput = NumericArrayConverter<ElementCompute, ElementInput, FragmentSize, RoundStyle>;
           ConvertInput convert_input{};
 
           return convert_input(frg_input);
         },
-        [&] (auto&&... cvt_frg_inputs) {
+        [&] (auto&&... cvt_frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE {
           using ComputeOutput = ComputeFn<Array<ElementCompute, FragmentSize>>;
           ComputeOutput compute_output{};
 
@@ -559,14 +560,14 @@ struct Sm90TreeVisitor<
         Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
         Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
         Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
+        auto predicate_fn = [&] (auto&&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
         copy_if(predicate_fn, tC_rAux_vec, tC_gAux_vec);
       }
       // sub-byte vectorization, must serialize threads
       else {
         // Assumes no inter-warp sharing of bytes (most copy layouts should satisfy this)
         int lane_idx = canonical_lane_idx();
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
+        auto predicate_fn = [&] (auto&&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
         CUTLASS_PRAGMA_NO_UNROLL
         for (int i = 0; i < NumThreadsPerWarp; ++i) {
           if (lane_idx == i) {
@@ -718,11 +719,11 @@ struct Sm90AuxLoad<
           Tensor tC_gAux_vec = recast<VecType>(tC_gAux);
           Tensor tC_rAux_vec = recast<VecType>(tC_rAux);
           Tensor tC_cAux_vec = tensor<1>(zipped_divide(tC_cAux, MCL.compose(Int<V>{})));
-          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
+          auto predicate_fn = [&] (auto&&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tC_cAux_vec(coords...), residue_tC_cAux); };
           copy_if(predicate_fn, tC_gAux_vec, tC_rAux_vec);
         }
         else {
-          auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
+          auto predicate_fn = [&] (auto&&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tC_cAux(coords...), residue_tC_cAux); };
           copy_if(predicate_fn, tC_gAux, tC_rAux);
         }
       }
@@ -737,7 +738,7 @@ struct Sm90AuxLoad<
           }
         }
 
-        auto predicate_fn = [&] (auto&&... coords) { return elem_less(tC_cAux(_,_,_,epi_m,epi_n)(coords...), residue_tC_cAux); };
+        auto predicate_fn = [&] (auto&&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tC_cAux(_,_,_,epi_m,epi_n)(coords...), residue_tC_cAux); };
         copy_if(predicate_fn, tC_gAux(_,_,_,epi_m,epi_n), tC_rAux);
       }
     }
@@ -773,7 +774,7 @@ struct Sm90AuxLoad<
 
     // If byte-unaligned vectorization, store in registers as uint32_t to reduce redundant pack+unpack instruction sequences
     constexpr int V = decltype(max_common_vector(tC_gAux.layout(), make_layout(tC_gAux.shape())))::value;
-    Tensor tC_rAux = [&] () {
+    Tensor tC_rAux = [&] () CUTLASS_LAMBDA_FUNC_INLINE {
       if constexpr (V % 8 != 0) {
         return make_tensor<uint32_t>(take<0,3>(shape(tC_gAux)));                       // (CPY,CPY_M,CPY_N)
       } else {
diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
index 66b1086e..62f4482c 100644
--- a/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@
 #include "cutlass/cutlass.h"
 #include "cutlass/arch/barrier.h"
 #include "cutlass/epilogue/collective/detail.hpp"
+#include "cutlass/detail/helper_macros.hpp"
 
 #include "cute/tensor.hpp"
 #include "sm90_visitor_tma_warpspecialized.hpp"
@@ -133,6 +134,66 @@ struct Sm90SrcFetch : Sm90VisitorImpl<> {
   }
 };
 
+// Returns the accumulator fragment for Grouped Conv Wgrad, moving the selected (diagonal-block) elements to the front.
+template <class GroupsPerTile_>
+struct Sm90AccFetchGroupedWgrad : Sm90VisitorImpl<> {
+
+  using Sm90VisitorImpl<>::Sm90VisitorImpl;
+  using GroupsPerTile = GroupsPerTile_; // selects the copy pattern in visit(); C/G = Tile_N / GroupsPerTile
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(int32_t thread_idx)
+      : thread_idx(thread_idx) { }
+
+    int32_t thread_idx; // used in visit() to derive the warp index
+    // Returns a fragment whose leading elements are copied from frg_acc; epi_v, epi_m and epi_n are not used.
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<ElementAccumulator, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+
+      Array<ElementAccumulator, FragmentSize> frg_acc_rst;
+      int warp_id = thread_idx / 32; // warp size is hard-coded as 32 threads
+
+      // In Grouped Wgrad only the diagonal blocks of the accumulator tile hold valid data; the other blocks are discarded.
+      // Each block is C/G x C/G, where C/G = Tile_N / GroupsPerTile.
+      // Copy the diagonal-block accumulators into the first block column, which has the output tile size Tile_M x C/G,
+      // so that the valid output tile can then be stored directly.
+      if constexpr ( cute::is_same_v<GroupsPerTile, _1> ) {
+        frg_acc_rst = frg_acc;
+      }
+      else if constexpr ( cute::is_same_v<GroupsPerTile, _2> ) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < 16; i++) {
+          frg_acc_rst[i] = frg_acc[i + warp_id / 2 * 16]; // source offset advances by 16 for every two warps
+        }
+      }
+      else if constexpr ( cute::is_same_v<GroupsPerTile, _4> ) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < 8; i++) {
+          frg_acc_rst[i] = frg_acc[i + warp_id * 8]; // source offset advances by 8 per warp
+        }
+      }
+      else if constexpr ( cute::is_same_v<GroupsPerTile, _8> ) {
+        CUTLASS_PRAGMA_UNROLL
+        for (int i = 0; i < 4; i++) {
+          frg_acc_rst[i] = frg_acc[i + warp_id * 8 + i / 2 * 4]; // reads elements {0,1,6,7} of the warp's 8-element window
+        }
+      }
+      // NOTE(review): any other GroupsPerTile matches no branch and frg_acc_rst is returned unassigned; _2/_4/_8 assign only the first 16/8/4 elements.
+      return frg_acc_rst;
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+    return ConsumerStoreCallbacks(args.thread_idx);
+  }
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 //
 // Elementwise Load Operations
@@ -496,7 +557,7 @@ struct Sm90AuxLoad<
       Tensor tC_gAux_vec = recast<Array<Element, V>>(coalesce(tC_gAux(_,_,_,epi_m,epi_n)));
       Tensor tC_rAux_vec = recast<Array<Element, V>>(coalesce(tC_rAux));
 
-      auto pred_fn = [&] (auto const&... coords) {
+      auto pred_fn = [&] (auto const&... coords) CUTLASS_LAMBDA_FUNC_INLINE {
         return elem_less(tC_cAux_vec(coords...), problem_shape_mnl);
       };
 
@@ -935,16 +996,12 @@ struct Sm90RowBroadcast {
     StrideMNL dRow = {};
   };
 
-  struct Params {
-    ElementInput const* ptr_row = nullptr;
-    ElementCompute null_default = ElementCompute(0);
-    StrideMNL dRow = {};
-  };
+  using Params = Arguments;
 
   template <class ProblemShape>
   static constexpr Params
   to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return {args.ptr_row, ElementCompute(args.null_default), args.dRow};
+    return args;
   }
 
   template <class ProblemShape>
@@ -1009,14 +1066,14 @@ struct Sm90RowBroadcast {
     return EmptyProducerLoadCallbacks{};
   }
 
-  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class Residue, class ThrNum>
+  template <class GS_GTensor, class GS_STensor, class GS_CTensor, class Tiled_G2S, class SR_STensor, class SR_RTensor, class Residue>
   struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
     CUTLASS_DEVICE
     ConsumerStoreCallbacks(
         GS_GTensor tGS_gRow_, GS_STensor tGS_sRow_, 
         GS_CTensor tGS_cRow_, Tiled_G2S tiled_g2s_, 
         SR_STensor tSR_sRow_, SR_RTensor tSR_rRow_,
-        Residue residue_cRow_, ThrNum thr_num_, Params const& params_)
+        Residue residue_cRow_, Params const& params_)
       : tGS_gRow(tGS_gRow_)
       , tGS_sRow(tGS_sRow_)
       , tGS_cRow(tGS_cRow_)
@@ -1024,11 +1081,7 @@ struct Sm90RowBroadcast {
       , tSR_sRow(tSR_sRow_)
       , tSR_rRow(tSR_rRow_)
       , residue_cRow(residue_cRow_)
-      , params(params_)
-      , is_nullptr(EnableNullptr && params_.ptr_row == nullptr) {
-      if (is_nullptr) {
-        fill(tSR_rRow, params.null_default);
-      }
+      , params(params_) {
     }
 
     GS_GTensor tGS_gRow;                                                         // (CPY,CPY_M,CPY_N)
@@ -1040,17 +1093,12 @@ struct Sm90RowBroadcast {
     SR_RTensor tSR_rRow;                                                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N) 
   
     Residue residue_cRow;                                                        // (m, n)
-    ThrNum thr_num;
     Params const& params;
-    bool is_nullptr;
 
     CUTLASS_DEVICE void
     begin() {
-      if (is_nullptr) {
-        return;
-      }
+      bool is_nullptr = EnableNullptr && params.ptr_row == nullptr;
 
-      auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(thr_num, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); };
       Tensor tGS_gRow_flt = filter_zeros(tGS_gRow);
       Tensor tGS_sRow_flt = filter_zeros(tGS_sRow);
       Tensor tGS_cRow_flt = filter_zeros(tGS_cRow, tGS_gRow.stride());
@@ -1059,19 +1107,23 @@ struct Sm90RowBroadcast {
         if (get<1>(tGS_cRow_flt(i)) >= size<1>(CtaTileShapeMNK{})) {
           continue; // OOB of SMEM, 
         }
-        if (elem_less(tGS_cRow_flt(i), residue_cRow)) {
-          tGS_sRow_flt(i) = tGS_gRow_flt(i);
+        if (not is_nullptr && elem_less(tGS_cRow_flt(i), residue_cRow)) {
+          tGS_sRow_flt(i) = tGS_gRow_flt(i); // issue async gmem to smem load
         }
         else {
-          tGS_sRow_flt(i) = ElementInput(0); // Set to Zero when OOB so LDS can be issued without any preds.
+          tGS_sRow_flt(i) = params.null_default; // fill OOB values so smem to RF load can issue without predication
         }
       }
-      synchronize();
+    }
+
+    CUTLASS_DEVICE bool
+    begin_sync_needed() const {
+      return true; // Ensure visibility of async gmem to smem loads
     }
 
     CUTLASS_DEVICE void
     begin_loop(int epi_m, int epi_n) {
-      if (epi_m == 0 and not is_nullptr) { // Assumes M-major subtile loop
+      if (epi_m == 0) { // Assumes M-major subtile loop
         Tensor tSR_sRow_flt = filter_zeros(tSR_sRow(_,_,_,epi_m,epi_n));
         Tensor tSR_rRow_flt = make_tensor_like<ElementInput>(tSR_sRow_flt);
         copy_aligned(tSR_sRow_flt, tSR_rRow_flt);
@@ -1113,7 +1165,7 @@ struct Sm90RowBroadcast {
     auto [m, n, k, l] = args.tile_coord_mnkl;
     using ThreadCount = decltype(size(args.tiled_copy));
 
-    auto layout_N = [&] () {
+    auto layout_N = [&] () CUTLASS_LAMBDA_FUNC_INLINE {
       auto shape_N = get<1>(args.problem_shape_mnkl);
       if constexpr (IsDynamicBroadcast) {
         auto stride_N = repeat_like(shape_N, int(0));
@@ -1158,7 +1210,6 @@ struct Sm90RowBroadcast {
       tSR_sRow, 
       tSR_rRow, 
       args.residue_cD,
-      ThreadCount{}, 
       params);
   }
 };
@@ -1303,11 +1354,11 @@ struct Sm90ColBroadcast {
         Tensor tCgCol_vec = recast<VecType>(coalesce(tCgCol_flt));
         Tensor tCrCol_vec = recast<VecType>(coalesce(tCrCol_flt));
         Tensor tCcCol_vec = tensor<1>(zipped_divide(tCcCol_flt, MCL.compose(Int<V>{})));
-        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_vec(coords...), residue_tCcCol); };
+        auto pred_fn = [&] (auto const&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tCcCol_vec(coords...), residue_tCcCol); };
         copy_if(pred_fn, tCgCol_vec, tCrCol_vec);
       }
       else {
-        auto pred_fn = [&] (auto const&... coords) { return elem_less(tCcCol_flt(coords...), residue_tCcCol); };
+        auto pred_fn = [&] (auto const&... coords) CUTLASS_LAMBDA_FUNC_INLINE { return elem_less(tCcCol_flt(coords...), residue_tCcCol); };
         copy_if(pred_fn, tCgCol_flt, tCrCol_flt);
       }
 
@@ -1347,7 +1398,7 @@ struct Sm90ColBroadcast {
   get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
 
     auto [M, N, K, L] = args.problem_shape_mnkl;
-    auto layout_M = [&] () {
+    auto layout_M = [&] () CUTLASS_LAMBDA_FUNC_INLINE {
       auto shape_M = get<0>(args.problem_shape_mnkl);
       if constexpr (IsDynamicBroadcast) {
         auto stride_M = repeat_like(shape_M, int(0));
@@ -1378,171 +1429,6 @@ struct Sm90ColBroadcast {
   }
 };
 
-/////////////////////////////////////////////////////////////////////////////////////////////////
-//
-// Do outer product from the column and row loaded
-//
-template<
-  int Stages,
-  class CtaTileShapeMNK,
-  class ElementScalar,
-  class StrideColMNL_ = Stride<_1,_0,int64_t>, /// NOTE: Batched scaling untested for now
-  class StrideRowMNL_ = Stride<_0,_1,int64_t>,
-  int Alignment = 128 / sizeof_bits_v<ElementScalar>,
-  bool EnableNullptr = false // Fallback scalar broadcast for nullptr params
->
-struct Sm90OuterProduct {
-  using StrideColMNL = StrideColMNL_;
-  using StrideRowMNL = StrideRowMNL_;
-  static_assert(Stages == 0, "OuterProduct doesn't support smem usage");
-  static_assert(Alignment * sizeof_bits_v<ElementScalar> % 128 == 0, "sub-16B alignment not supported yet");
-  static_assert(!EnableNullptr, "Nullptr fallback not implemented");
-  static_assert(is_static_v<decltype(take<0,2>(StrideColMNL{}))> &&
-                is_static_v<decltype(take<0,2>(StrideRowMNL{}))>, "Only batch stride can be dynamic");
-  static_assert(take<0,2>(StrideColMNL{}) == Stride<_1,_0>{} &&
-                take<0,2>(StrideRowMNL{}) == Stride<_0,_1>{}, "Row and column incorrectly formatted");
-
-  // Accumulator distributes col/row elements evenly amongst threads so we can just directly load from gmem
-  struct SharedStorage { };
-
-  struct Arguments {
-    ElementScalar const* ptr_col = nullptr;
-    ElementScalar const* ptr_row = nullptr;
-    StrideColMNL dCol = {};
-    StrideRowMNL dRow = {};
-  };
-
-  using Params = Arguments;
-
-  template <class ProblemShape>
-  static constexpr Params
-  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
-    return args;
-  }
-
-  template <class ProblemShape>
-  static bool
-  can_implement(ProblemShape const& problem_shape, Arguments const& args) {
-    return true;
-  }
-
-  template <class ProblemShape>
-  static size_t
-  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
-    return 0;
-  }
-
-  template <class ProblemShape>
-  static cutlass::Status
-  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
-    CudaHostAdapter* cuda_adapter = nullptr) {
-    return cutlass::Status::kSuccess;
-  }
-
-  CUTLASS_DEVICE bool
-  is_producer_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_C_load_needed() const {
-    return false;
-  }
-
-  CUTLASS_DEVICE bool
-  is_zero() const {
-    return false;
-  }
-
-  CUTLASS_HOST_DEVICE
-  Sm90OuterProduct() { }
-
-  CUTLASS_HOST_DEVICE
-  Sm90OuterProduct(Params const& params, SharedStorage const& shared_storage)
-  : params(params) { }
-
-  Params params;
-
-  template <class... Args>
-  CUTLASS_DEVICE auto
-  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
-    return EmptyProducerLoadCallbacks{};
-  }
-
-  template<
-    class GTensorCol, class RTensorCol,
-    class GTensorRow, class RTensorRow
-  >
-  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
-    CUTLASS_DEVICE
-    ConsumerStoreCallbacks(GTensorCol&& tCgCol, RTensorCol&& tCrCol,
-                           GTensorRow&& tCgRow, RTensorRow&& tCrRow,
-                           Params const& params)
-      : tCgCol(cute::forward<GTensorCol>(tCgCol))
-      , tCrCol(cute::forward<RTensorCol>(tCrCol))
-      , tCgRow(cute::forward<GTensorRow>(tCgRow))
-      , tCrRow(cute::forward<RTensorRow>(tCrRow))
-      , params(params) {}
-
-    GTensorCol tCgCol;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    RTensorCol tCrCol;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    GTensorRow tCgRow;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    RTensorRow tCrRow;                                                                 // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    Params const& params;
-
-    CUTLASS_DEVICE void
-    begin() {
-
-      // Filter so we don't issue redundant copies over stride-0 modes
-      copy(filter(tCgCol), filter(tCrCol));
-      copy(filter(tCgRow), filter(tCrRow));
-    }
-
-    template <typename ElementAccumulator, int FragmentSize>
-    CUTLASS_DEVICE Array<ElementScalar, FragmentSize>
-    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
-      Array<ElementScalar, FragmentSize> frg_colrow;
-      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
-      Tensor tCrRow_mn = tCrRow(_,_,_,epi_m,epi_n);
-
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 0; i < FragmentSize; ++i) {
-        frg_colrow[i] = static_cast<ElementScalar>(tCrCol_mn(epi_v * FragmentSize + i) * tCrRow_mn(epi_v * FragmentSize + i));
-      }
-      return frg_colrow;
-    }
-
-  };
-
-  template <
-    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
-    class... Args
-  >
-  CUTLASS_DEVICE auto
-  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
-
-    auto [M, N, K, L] = args.problem_shape_mnkl;
-    Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol);
-    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
-    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tCgRow = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-      mRow, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
-    Tensor tCrCol = make_tensor_like(tCgCol);                                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-    Tensor tCrRow = make_tensor_like(tCgRow);                                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
-
-    return ConsumerStoreCallbacks<
-      decltype(tCgCol), decltype(tCrCol),
-      decltype(tCgRow), decltype(tCrRow)
-    >(
-      cute::move(tCgCol), cute::move(tCrCol),
-      cute::move(tCgRow), cute::move(tCrRow),
-      params
-    );
-  }
-
-};
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 // Batch matrix broadcast
diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
index 83cfc030..de4bdbf2 100644
--- a/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_visitor_store_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
index 48f4756d..ce841bf2 100644
--- a/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,6 +38,7 @@
 
 #include "cutlass/cutlass.h"
 #include "cutlass/workspace.h"
+#include "cutlass/detail/helper_macros.hpp"
 
 #include "cute/tensor.hpp"
 
@@ -215,7 +216,7 @@ struct Sm90VisitorImplBase {
   to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
     uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
     return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
+      [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE {
         using Op = cute::remove_cvref_t<decltype(op)>;
         auto ret = Op::to_underlying_arguments(problem_shape, op_args, op_workspace);
         if (op_workspace != nullptr) {
@@ -224,7 +225,7 @@ struct Sm90VisitorImplBase {
         }
         return ret;
       },
-      [] (auto&&... op_params) { return cute::make_tuple(op_params...); }
+      [] (auto&&... op_params) CUTLASS_LAMBDA_FUNC_INLINE { return cute::make_tuple(op_params...); }
     );
   }
 
@@ -232,11 +233,11 @@ struct Sm90VisitorImplBase {
   static bool
   can_implement(ProblemShape const& problem_shape, Arguments const& args) {
     return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
+      [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE {
         using Op = cute::remove_cvref_t<decltype(op)>;
         return Op::can_implement(problem_shape, op_args);
       },
-      [&] (auto&&... implementable) {
+      [&] (auto&&... implementable) CUTLASS_LAMBDA_FUNC_INLINE {
         return (true && ... && implementable);
       }
     );
@@ -246,12 +247,12 @@ struct Sm90VisitorImplBase {
   static size_t
   get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
     return transform_apply(tuple<Ops...>{}, args,
-      [&] (auto&& op, auto const& op_args) {
+      [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE {
         using Op = cute::remove_cvref_t<decltype(op)>;
         size_t op_workspace_size = Op::get_workspace_size(problem_shape, op_args);
         return round_nearest(op_workspace_size, MinWorkspaceAlignment);
       },
-      [&] (auto&&... op_workspace_size) {
+      [&] (auto&&... op_workspace_size) CUTLASS_LAMBDA_FUNC_INLINE {
         return (0 + ... + op_workspace_size);
       }
     );
@@ -265,7 +266,7 @@ struct Sm90VisitorImplBase {
     uint8_t* op_workspace = reinterpret_cast<uint8_t*>(workspace);
     return transform_apply(tuple<Ops...>{}, args,
       // Initialize each operation's workspace, stopping at the first error
-      [&] (auto&& op, auto const& op_args) {
+      [&] (auto&& op, auto const& op_args) CUTLASS_LAMBDA_FUNC_INLINE {
         if (status != Status::kSuccess) {
           return status;
         }
@@ -279,7 +280,7 @@ struct Sm90VisitorImplBase {
         return status;
       },
       // Return the final status
-      [&] (auto const&...ops) { return status; }
+      [&] (auto const&...ops) CUTLASS_LAMBDA_FUNC_INLINE { return status; }
     );
   }
 
@@ -289,11 +290,11 @@ struct Sm90VisitorImplBase {
   CUTLASS_HOST_DEVICE
   Sm90VisitorImplBase(Params const& params, SharedStorage const& shared_storage)
     : ops(transform_apply(tuple<Ops...>{}, params, shared_storage,
-        [] (auto&& op, auto const& op_params, auto&& op_storage) {
+        [] (auto&& op, auto const& op_params, auto&& op_storage) CUTLASS_LAMBDA_FUNC_INLINE {
           using Op = cute::remove_cvref_t<decltype(op)>;
           return Op(op_params, op_storage);
         },
-        [] (auto&&... ops) { return cute::make_tuple(ops...); }
+        [] (auto&&... ops) CUTLASS_LAMBDA_FUNC_INLINE { return cute::make_tuple(ops...); }
       )) {}
 
   // Ops can store kernel persistent variables (e.g. descriptors, scalars, wave counters)
@@ -328,7 +329,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
   CUTLASS_DEVICE bool
   is_producer_load_needed() const {
     return cute::apply(ops,
-      [] (auto const&... op) {
+      [] (auto const&... op) CUTLASS_LAMBDA_FUNC_INLINE {
         return (false || ... || op.is_producer_load_needed());
       }
     );
@@ -342,7 +343,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
   CUTLASS_DEVICE bool
   is_C_load_needed() const {
     return cute::apply(ops,
-      [] (auto const&... op) {
+      [] (auto const&... op) CUTLASS_LAMBDA_FUNC_INLINE {
         return (false || ... || op.is_C_load_needed());
       }
     );
@@ -364,7 +365,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     begin() {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.begin();
         }
       );
@@ -376,7 +377,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     step(uint64_t* full_mbarrier_ptr, int epi_m, int epi_n, int load_iteration, bool issue_tma_load) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.step(full_mbarrier_ptr, epi_m, epi_n, load_iteration, issue_tma_load);
         }
       );
@@ -386,7 +387,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     end() {
       for_each(callbacks_tuple,
-        [] (auto& callbacks) {
+        [] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.end();
         }
       );
@@ -399,10 +400,10 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
   CUTLASS_DEVICE auto
   get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
     return transform_apply(ops,
-      [&] (auto& op) {
+      [&] (auto& op) CUTLASS_LAMBDA_FUNC_INLINE {
         return op.get_producer_load_callbacks(args);
       },
-      [] (auto&&... callbacks) {
+      [] (auto&&... callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
         auto callbacks_tuple = cute::make_tuple(callbacks...);
         return ProducerLoadCallbacks<decltype(callbacks_tuple)>{callbacks_tuple};
       }
@@ -422,17 +423,27 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     begin() {
       for_each(callbacks_tuple,
-        [] (auto& callbacks) {
+        [] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.begin();
         }
       );
     }
 
+    // Is a thread sync needed after begin()? True if any node asks for one. Allows chaining async copies across multiple nodes
+    CUTLASS_DEVICE bool
+    begin_sync_needed() const {
+      return cute::apply(callbacks_tuple,
+        [] (auto const&... callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
+          return (false || ... || callbacks.begin_sync_needed());
+        }
+      );
+    }
+
     // Start of subtile store iteration
     CUTLASS_DEVICE void
     begin_loop(int epi_m, int epi_n) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.begin_loop(epi_m, epi_n);
         }
       );
@@ -443,7 +454,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.previsit(epi_m, epi_n, load_iteration, is_producer_load_needed);
         }
       );
@@ -468,7 +479,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     reduce(STensor&& reduction_buffer, SyncFn const& sync_fn, int epi_m, int epi_n, bool is_last_iteration, VTensor visit_results) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.reduce(reduction_buffer, sync_fn, epi_m, epi_n, is_last_iteration, visit_results);
         }
       );
@@ -479,7 +490,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     postreduce(int epi_m, int epi_n, int store_iteration, bool issue_smem_store) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.postreduce(epi_m, epi_n, store_iteration, issue_smem_store);
         }
       );
@@ -492,7 +503,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     tma_store(int epi_m, int epi_n, int store_iteration, bool issue_tma_store) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.tma_store(epi_m, epi_n, store_iteration, issue_tma_store);
         }
       );
@@ -502,7 +513,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     end_loop(int epi_m, int epi_n) {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.end_loop(epi_m, epi_n);
         }
       );
@@ -512,7 +523,7 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
     CUTLASS_DEVICE void
     end() {
       for_each(callbacks_tuple,
-        [&] (auto& callbacks) {
+        [&] (auto& callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           callbacks.end();
         }
       );
@@ -528,10 +539,10 @@ struct Sm90VisitorImpl : Sm90VisitorImplBase<Ops...> {
   CUTLASS_DEVICE auto
   get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
     return transform_apply(ops,
-      [&] (auto& op) {
+      [&] (auto& op) CUTLASS_LAMBDA_FUNC_INLINE {
         return op.template get_consumer_store_callbacks<ReferenceSrc>(args);
       },
-      [] (auto&&... callbacks) {
+      [] (auto&&... callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
         auto callbacks_tuple = cute::make_tuple(callbacks...);
         return ConsumerStoreCallbacks<decltype(callbacks_tuple)>{callbacks_tuple};
       }
@@ -586,10 +597,10 @@ struct Sm90TreeVisitor : Sm90VisitorImpl<ChildOps..., NodeOp> {
     visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
       constexpr int Rm1 = sizeof...(ChildOps);
       return cute::detail::tapply(callbacks_tuple,
-        [&] (auto& child_callbacks) {
+        [&] (auto& child_callbacks) CUTLASS_LAMBDA_FUNC_INLINE {
           return child_callbacks.visit(frg_acc, epi_v, epi_m, epi_n); // child ops must be nullary (e.g. loads, trees)
         },
-        [&] (auto&&... frg_inputs) {
+        [&] (auto&&... frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE {
           return get<Rm1>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
         },
         make_seq<Rm1>{} // restrict the transform to R-1 child ops, apply is for node op
@@ -637,7 +648,7 @@ struct Sm90SplitTreeVisitor : Sm90VisitorImpl<InputTree, AuxOutTrees..., OutputT
 
       constexpr int Rm2 = sizeof...(AuxOutTrees);
       cute::for_each(make_seq<Rm2>{}, // restrict the sequence to aux out trees
-        [&] (auto I) {
+        [&] (auto I) CUTLASS_LAMBDA_FUNC_INLINE {
           get<I+1>(callbacks_tuple).visit(frg_input, epi_v, epi_m, epi_n);
         }
       );
@@ -689,10 +700,10 @@ struct Sm90TopologicalVisitor : Sm90VisitorImpl<Ops...> {
 
       return cute::detail::tapply(EdgeTuple{}, callbacks_tuple, frg_compute_tuple,
         // Visit the first R-1 ops in topological order
-        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) {
+        [&] (auto&& edge_seq, auto& callbacks, auto& frg_compute) CUTLASS_LAMBDA_FUNC_INLINE {
           frg_compute = cute::detail::apply(frg_compute_tuple,
             // Compute the current op with children inputs
-            [&] (auto const&... frg_inputs) {
+            [&] (auto const&... frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE {
               auto frg_output = callbacks.visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
               using ElementOutput = typename decltype(frg_output)::Element;
               using ConvertOutput = NumericArrayConverter<ElementCompute, ElementOutput, FragmentSize>;
@@ -706,10 +717,10 @@ struct Sm90TopologicalVisitor : Sm90VisitorImpl<Ops...> {
           return frg_compute; // unused
         },
         // Visit the last op
-        [&] (auto const&...ops) {
+        [&] (auto const&...ops) CUTLASS_LAMBDA_FUNC_INLINE {
           return cute::detail::apply(frg_compute_tuple,
             // Compute the last op with children inputs
-            [&] (auto const&... frg_inputs) {
+            [&] (auto const&... frg_inputs) CUTLASS_LAMBDA_FUNC_INLINE {
               return get<Rm1>(callbacks_tuple).visit(frg_acc, epi_v, epi_m, epi_n, frg_inputs...);
             },
             // Get inputs in the sequence given by the children indices of the last op
diff --git a/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp b/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
index 53c0dce8..5ac64423 100644
--- a/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
+++ b/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h
index 186e9966..145488ae 100644
--- a/include/cutlass/epilogue/thread/activation.h
+++ b/include/cutlass/epilogue/thread/activation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -354,7 +354,11 @@ struct Sigmoid {
 
   CUTLASS_HOST_DEVICE
   T operator()(T const &value) const {
+#if defined(CUTLASS_USE_TANH_FOR_SIGMOID)
+    return fast_tanh(value * T(0.5)) * T(0.5) + T(0.5);
+#else
     return T(1) / (T(1) + fast_exp(-value));
+#endif
   }
 };
 
@@ -364,14 +368,15 @@ struct Sigmoid<Array<T, N>> {
 
   CUTLASS_HOST_DEVICE
   Array<T, N> operator()(Array<T, N> const& z) const {
-    plus<Array<T, N>> add;
-
 #if defined(CUTLASS_USE_TANH_FOR_SIGMOID)
     multiplies<Array<T, N>> mul;
+    multiply_add<Array<T, N>> fma;
     fast_tanh_op<Array<T, N>> tanh;
-    return mul(add(tanh(mul(z, cutlass::constants::half<T>())), cutlass::constants::one<T>()),
+    return fma(tanh(mul(z, cutlass::constants::half<T>())),
+               cutlass::constants::half<T>(),
                cutlass::constants::half<T>());
 #else
+    plus<Array<T, N>> add;
     divides<Array<T, N>> div;
     negate<Array<T, N>> neg;
     fast_exp_op<Array<T, N>> fast_exp;
diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h
index 86200b41..432906ac 100644
--- a/include/cutlass/epilogue/thread/conversion_op.h
+++ b/include/cutlass/epilogue/thread/conversion_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/detail.hpp b/include/cutlass/epilogue/thread/detail.hpp
index 77563002..a132134c 100644
--- a/include/cutlass/epilogue/thread/detail.hpp
+++ b/include/cutlass/epilogue/thread/detail.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h
index c3aa3ff4..05a1f79b 100644
--- a/include/cutlass/epilogue/thread/linear_combination.h
+++ b/include/cutlass/epilogue/thread/linear_combination.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h b/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
index c5ffdaa0..0b8d07a6 100644
--- a/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
+++ b/include/cutlass/epilogue/thread/linear_combination_bias_elementwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -515,6 +515,482 @@ public:
   }
 };
 
+
+/// This base class is meant to define the concept required of the
+/// EpilogueWithBroadcast::OutputOp
+template <
+  typename ElementC_,
+  typename ElementAccumulator_,
+  typename ElementCompute_,
+  typename ElementZ_,
+  typename ElementT_,
+  int ElementsPerAccess,
+  typename ElementwiseOp_ = Identity<ElementCompute_>,
+  typename BinaryOp_ = plus<ElementCompute_>,
+  bool StoreT_ = true,
+  typename ElementVector_ = ElementC_
+>
+class LinearCombinationPerChannelScalingBiasElementwise {
+public:
+
+  using ElementOutput = ElementC_;
+  using ElementD = ElementOutput;
+  using ElementC = ElementC_;
+  using ElementAccumulator = ElementAccumulator_;
+  using ElementCompute = ElementCompute_;
+  using ElementScalar = ElementCompute;
+  using ElementZ = ElementZ_;
+  using ElementT = ElementT_;
+  using ElementVector = ElementVector_;
+  static int const kElementsPerAccess = ElementsPerAccess;
+  static int const kCount = kElementsPerAccess;
+
+  /// Follow cutlass3x EVT aliases
+  static bool const IsEltActSupported = true;
+  static bool const IsPerChannelScalingSupported = true;
+
+  using ElementwiseOp = ElementwiseOp_;
+  using BinaryOp = BinaryOp_;
+
+  using ElementwiseOpDispatcher = detail::ElementwiseOpDispatcher<ElementwiseOp>;
+  using ElementwiseArguments = typename ElementwiseOpDispatcher::Arguments;
+
+  // Indicates that this epilogue applies only one binary operation
+  static bool const kIsSingleSource = true;
+
+
+  using FragmentAccumulator = Array<ElementAccumulator, kElementsPerAccess>;
+  using FragmentCompute = Array<ElementCompute, kElementsPerAccess>;
+  using FragmentC = Array<ElementC, kElementsPerAccess>;
+  using FragmentZ = Array<ElementZ, kElementsPerAccess>;
+  using FragmentT = Array<ElementT, kElementsPerAccess>;
+
+  // Definitions needed for collective epilogue
+  using FragmentSource = FragmentC;
+  using FragmentOutput = FragmentZ;
+  using ElementBias = ElementVector;
+  using FragmentBias = Array<ElementBias, kElementsPerAccess>;
+  using ActivationFn = ElementwiseOp;
+  static const ScaleType::Kind kScale = ScaleType::PerChannelScaling;
+
+  static bool const kIsHeavy = kIsHeavy_member_or_false<ElementwiseOp>::value;
+
+  /// If true, the 'Z' tensor is stored
+  static bool const kStoreZ = true;
+
+  /// If true, the 'T' tensor is stored
+  static bool const kStoreT = StoreT_;
+
+  /// Host-constructable parameters structure
+  struct Params {
+    ElementCompute const *alpha_ptr;       ///< pointer to accumulator scalar - if not null, loads it from memory
+    ElementCompute const *beta_ptr;        ///< pointer to source scalar - if not null, loads it from memory
+    ElementCompute beta;                   ///< scales source tensor
+    ElementwiseArguments  elementwise;     ///< Arguments for elementwise operation
+
+    //
+    // Methods (mem-initializers are written in member declaration order, the order C++ applies them in)
+    //
+
+    /// Null pointers, beta = 0, value-initialized elementwise arguments
+    CUTLASS_HOST_DEVICE
+    Params():
+      alpha_ptr(nullptr),
+      beta_ptr(nullptr),
+      beta(ElementCompute(0)),
+      elementwise() { }
+
+    /// Alpha and beta given by pointer, with optional elementwise arguments; scalar beta = 0
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr,
+      ElementCompute const *beta_ptr,
+      ElementwiseArguments  elementwise_ = ElementwiseArguments{}
+    ): alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), beta(ElementCompute(0)), elementwise(elementwise_) { }
+
+    /// Alpha pointer only: null beta pointer, scalar beta = 0, value-initialized elementwise arguments
+    CUTLASS_HOST_DEVICE
+    Params(
+      ElementCompute const *alpha_ptr
+    ): alpha_ptr(alpha_ptr), beta_ptr(nullptr), beta(ElementCompute(0)), elementwise() { }
+  };
+
+private:
+
+  //
+  // Data members
+  //
+
+  ElementCompute const* beta_ptr_ = nullptr;
+  ElementCompute beta_ = 0;
+  ElementwiseArguments const &elementwise_;
+  bool skip_elementwise_;
+
+public:
+
+  //
+  // Methods
+  //
+
+  /// Constructor from Params. elementwise_ is a reference member bound to params.elementwise, so params must outlive this object
+  CUTLASS_HOST_DEVICE
+  LinearCombinationPerChannelScalingBiasElementwise(Params const &params): elementwise_(params.elementwise) {
+    if (params.beta_ptr) {
+      beta_ptr_ = params.beta_ptr;  // a non-null beta pointer is used instead of the scalar params.beta
+    }
+    else {
+      beta_ = params.beta;
+    }
+    skip_elementwise_ = false;  // may later be set to true by set_k_partition()
+  }
+
+  /// Returns true if source is needed: a beta pointer was supplied or the scalar beta is non-zero
+  CUTLASS_HOST_DEVICE
+  bool is_source_needed() const {
+    return beta_ptr_ != nullptr || beta_ != ElementCompute(0);
+  }
+
+  CUTLASS_HOST_DEVICE
+  bool is_beta_vector() const {  // true when a non-null beta pointer was taken from Params
+    return beta_ptr_ != nullptr;
+  }
+
+  /// Functionally required for serial reduction in the epilogue
+  CUTLASS_HOST_DEVICE
+  void set_k_partition(int k_partition, int k_partition_count) {
+    if (k_partition) {  // any partition other than index 0 forces the scalar beta to 1
+      beta_ = ElementCompute(1);
+    }
+
+    if (k_partition != k_partition_count - 1) {  // every partition except the last skips the elementwise op
+      skip_elementwise_ = true;
+    }
+  }
+
+  /// Applies the operation when elementwise_op requires arguments and is_source_needed() is true (scalar beta)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const & valpha,
+    FragmentCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);  // accumulator in the compute type
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);  // source in the compute type
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + beta_ * tmp_C[i], vbias[i]);  // per-element alpha, scalar beta_, bias joined by binary_op
+      result_T[i] = z;  // T holds the value before the elementwise op
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // frag_T is left unmodified when T is not stored
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when elementwise_op requires arguments and is_source_needed() is true (per-element beta)
+  /// D = elementwise_op(vector_alpha * accumulator + vector_beta * source + bias)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const & valpha,
+    FragmentCompute const & vbeta,
+    FragmentCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);  // accumulator in the compute type
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);  // source in the compute type
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + vbeta[i] * tmp_C[i], vbias[i]);  // vbeta[i] is used here; the scalar beta_ is not read
+      result_T[i] = z;  // T holds the value before the elementwise op
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // frag_T is left unmodified when T is not stored
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when elementwise_op requires arguments and is_source_needed() is false: Z = elementwise_op(binary_op(valpha * AB, vbias), elementwise_args)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const & valpha,
+    FragmentCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;  // default-constructed; it is invoked with elementwise_args below
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(valpha[i] * tmp_Accum[i], vbias[i]);  // no source term in this overload
+      result_T[i] = z;  // value before elementwise_op
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);  // elementwise_op bypassed when skip_elementwise_ is set
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // frag_T is left untouched when kStoreT is false
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is true and elementwise_op takes no extra arguments: Z = elementwise_op(binary_op(valpha * AB + beta_ * C, vbias))
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentC const &frag_C,
+    FragmentCompute const & valpha,
+    FragmentCompute const & vbias) const {
+
+    ElementwiseOpDispatcher elementwise_op(elementwise_);  // built from the stored elementwise_ member
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute tmp_C = NumericArrayConverter<ElementCompute, ElementC, kElementsPerAccess>()(frag_C);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(valpha[i] * tmp_Accum[i] + beta_ * tmp_C[i], vbias[i]);  // per-element alpha and bias, scalar beta_
+      result_T[i] = z;  // value before elementwise_op
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);  // elementwise_op bypassed when skip_elementwise_ is set
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // frag_T is left untouched when kStoreT is false
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Applies the operation when is_source_needed() is false and elementwise_op takes no extra arguments: Z = elementwise_op(binary_op(valpha * AB, vbias))
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    FragmentZ &frag_Z,
+    FragmentT &frag_T,
+    FragmentAccumulator const &AB,
+    FragmentCompute const & valpha,
+    FragmentCompute const & vbias) const {
+
+    ElementwiseOpDispatcher elementwise_op(elementwise_);  // built from the stored elementwise_ member
+    BinaryOp binary_op;
+
+    FragmentCompute tmp_Accum = NumericArrayConverter<ElementCompute, ElementAccumulator, kElementsPerAccess>()(AB);
+    FragmentCompute result_Z;
+    FragmentCompute result_T;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < kElementsPerAccess; ++i) {
+      ElementCompute z = binary_op(valpha[i] * tmp_Accum[i], vbias[i]);  // no source term in this overload
+      result_T[i] = z;  // value before elementwise_op
+      result_Z[i] = skip_elementwise_ ? z : elementwise_op(z);  // elementwise_op bypassed when skip_elementwise_ is set
+    }
+
+    NumericArrayConverter<ElementZ, ElementCompute, kElementsPerAccess> convert_z;
+    frag_Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // frag_T is left untouched when kStoreT is false
+      NumericArrayConverter<ElementT, ElementCompute, kElementsPerAccess> convert_t;
+      frag_T = convert_t(result_T);
+    }
+  }
+
+  /// Single-element form: applies the operation when elementwise_op requires arguments and is_source_needed() is true: Z = elementwise_op(binary_op(valpha * AB + beta_ * C, vbias), elementwise_args)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementC const &C,
+    ElementCompute const & valpha,
+    ElementCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;  // default-constructed; it is invoked with elementwise_args below
+    BinaryOp binary_op;
+
+    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+    ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
+
+    ElementCompute z = binary_op(valpha * tmp_Accum + beta_ * tmp_C, vbias);
+    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);  // elementwise_op bypassed when skip_elementwise_ is set
+
+    NumericConverter<ElementZ, ElementCompute> convert_z;
+    Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // T is left untouched when kStoreT is false
+      ElementCompute result_T = z;  // value before elementwise_op
+      NumericConverter<ElementT, ElementCompute> convert_t;
+      T = convert_t(result_T);
+    }
+  }
+
+  /// Single-element form: applies the operation when elementwise_op requires arguments and is_source_needed() is true, scaling the source by vbeta instead of beta_
+  /// Z = elementwise_op(binary_op(vector_alpha * accumulator + vector_beta * source, bias), elementwise_args)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementC const &C,
+    ElementCompute const & valpha,
+    ElementCompute const & vbeta,
+    ElementCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;  // default-constructed; it is invoked with elementwise_args below
+    BinaryOp binary_op;
+
+    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+    ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
+
+    ElementCompute z = binary_op(valpha * tmp_Accum + vbeta * tmp_C, vbias);  // beta_ is not read in this overload
+    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);  // elementwise_op bypassed when skip_elementwise_ is set
+
+    NumericConverter<ElementZ, ElementCompute> convert_z;
+    Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // T is left untouched when kStoreT is false
+      ElementCompute result_T = z;  // value before elementwise_op
+      NumericConverter<ElementT, ElementCompute> convert_t;
+      T = convert_t(result_T);
+    }
+  }
+
+  /// Single-element form: applies the operation when elementwise_op requires arguments and is_source_needed() is false: Z = elementwise_op(binary_op(valpha * AB, vbias), elementwise_args)
+  template <typename ElementwiseArgs>
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementCompute const & valpha,
+    ElementCompute const & vbias,
+    ElementwiseArgs const &elementwise_args) const {
+
+    ElementwiseOp elementwise_op;  // default-constructed; it is invoked with elementwise_args below
+    BinaryOp binary_op;
+
+    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+
+    ElementCompute z = binary_op(valpha * tmp_Accum, vbias);  // no source term in this overload
+    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z, elementwise_args);  // elementwise_op bypassed when skip_elementwise_ is set
+
+    NumericConverter<ElementZ, ElementCompute> convert_z;
+    Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // T is left untouched when kStoreT is false
+      ElementCompute result_T = z;  // value before elementwise_op
+      NumericConverter<ElementT, ElementCompute> convert_t;
+      T = convert_t(result_T);
+    }
+  }
+
+  /// Single-element form: applies the operation when is_source_needed() is true and elementwise_op takes no extra arguments: Z = elementwise_op(binary_op(valpha * AB + beta_ * C, vbias))
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementC const &C,
+    ElementCompute const & valpha,
+    ElementCompute const & vbias) const {
+
+    ElementwiseOpDispatcher elementwise_op(elementwise_);  // built from the stored elementwise_ member
+    BinaryOp binary_op;
+
+    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+    ElementCompute tmp_C = NumericConverter<ElementCompute, ElementC>()(C);
+
+    ElementCompute z = binary_op(valpha * tmp_Accum + beta_ * tmp_C, vbias);
+    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);  // elementwise_op bypassed when skip_elementwise_ is set
+
+    NumericConverter<ElementZ, ElementCompute> convert_z;
+    Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // T is left untouched when kStoreT is false
+      ElementCompute result_T = z;  // value before elementwise_op
+      NumericConverter<ElementT, ElementCompute> convert_t;
+      T = convert_t(result_T);
+    }
+  }
+
+  /// Single-element form: applies the operation when is_source_needed() is false and elementwise_op takes no extra arguments: Z = elementwise_op(binary_op(valpha * AB, vbias))
+  CUTLASS_HOST_DEVICE
+  void operator()(
+    ElementZ &Z,
+    ElementT &T,
+    ElementAccumulator const &AB,
+    ElementCompute const & valpha,
+    ElementCompute const & vbias) const {
+
+    ElementwiseOpDispatcher elementwise_op(elementwise_);  // built from the stored elementwise_ member
+    BinaryOp binary_op;
+
+    ElementCompute tmp_Accum = NumericConverter<ElementCompute, ElementAccumulator>()(AB);
+
+    ElementCompute z = binary_op(valpha * tmp_Accum, vbias);  // no source term in this overload
+    ElementCompute result_Z = skip_elementwise_ ? z : elementwise_op(z);  // elementwise_op bypassed when skip_elementwise_ is set
+
+    NumericConverter<ElementZ, ElementCompute> convert_z;
+    Z = convert_z(result_Z);
+
+    if constexpr (kStoreT) {  // T is left untouched when kStoreT is false
+      ElementCompute result_T = z;  // value before elementwise_op
+      NumericConverter<ElementT, ElementCompute> convert_t;
+      T = convert_t(result_T);
+    }
+  }
+};
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace thread
diff --git a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
index ead1123c..76d80f29 100644
--- a/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_bias_relu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h
index aad9b523..ad8f5651 100644
--- a/include/cutlass/epilogue/thread/linear_combination_clamp.h
+++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_dgelu.h b/include/cutlass/epilogue/thread/linear_combination_dgelu.h
index 74eb8213..2aefe91e 100644
--- a/include/cutlass/epilogue/thread/linear_combination_dgelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_dgelu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_drelu.h b/include/cutlass/epilogue/thread/linear_combination_drelu.h
index aed17305..9ecb0155 100644
--- a/include/cutlass/epilogue/thread/linear_combination_drelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_drelu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_gelu.h b/include/cutlass/epilogue/thread/linear_combination_gelu.h
index 818b21aa..3e82d2ca 100644
--- a/include/cutlass/epilogue/thread/linear_combination_gelu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_gelu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_generic.h b/include/cutlass/epilogue/thread/linear_combination_generic.h
index a6bd9d67..a2acd493 100644
--- a/include/cutlass/epilogue/thread/linear_combination_generic.h
+++ b/include/cutlass/epilogue/thread/linear_combination_generic.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h b/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
index e1dde1a6..c8a8083e 100644
--- a/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
+++ b/include/cutlass/epilogue/thread/linear_combination_generic_with_scaling.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_hardswish.h b/include/cutlass/epilogue/thread/linear_combination_hardswish.h
index ef51a318..4315a9b2 100644
--- a/include/cutlass/epilogue/thread/linear_combination_hardswish.h
+++ b/include/cutlass/epilogue/thread/linear_combination_hardswish.h
@@ -1,5 +1,5 @@
 /*************************************************************************************************** 
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h b/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
index 5989f09e..24b507eb 100644
--- a/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_leaky_relu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_params.h b/include/cutlass/epilogue/thread/linear_combination_params.h
index 27105567..2a7136a6 100644
--- a/include/cutlass/epilogue/thread/linear_combination_params.h
+++ b/include/cutlass/epilogue/thread/linear_combination_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
index ff32f13b..212084ae 100644
--- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
+++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h
index bbdc4986..134ddded 100644
--- a/include/cutlass/epilogue/thread/linear_combination_relu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_relu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_relu0.h b/include/cutlass/epilogue/thread/linear_combination_relu0.h
index 76ad5924..bbfa4a3d 100644
--- a/include/cutlass/epilogue/thread/linear_combination_relu0.h
+++ b/include/cutlass/epilogue/thread/linear_combination_relu0.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_residual_block.h b/include/cutlass/epilogue/thread/linear_combination_residual_block.h
index ec4083de..219ab259 100644
--- a/include/cutlass/epilogue/thread/linear_combination_residual_block.h
+++ b/include/cutlass/epilogue/thread/linear_combination_residual_block.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
index 35251177..481eb00d 100644
--- a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
+++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_silu.h b/include/cutlass/epilogue/thread/linear_combination_silu.h
index fa346b06..438bfa6b 100644
--- a/include/cutlass/epilogue/thread/linear_combination_silu.h
+++ b/include/cutlass/epilogue/thread/linear_combination_silu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
index c3ceea0a..b36501b9 100644
--- a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
+++ b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h b/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
index 8a2ce5a2..7dd3b3e5 100644
--- a/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
+++ b/include/cutlass/epilogue/thread/linear_combination_with_elementwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h
index b24d4f95..c2474c08 100644
--- a/include/cutlass/epilogue/thread/reduction_op.h
+++ b/include/cutlass/epilogue/thread/reduction_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/thread/scale_type.h b/include/cutlass/epilogue/thread/scale_type.h
index d1a46621..beed8bf7 100644
--- a/include/cutlass/epilogue/thread/scale_type.h
+++ b/include/cutlass/epilogue/thread/scale_type.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
index 30af039b..2dd22651 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
index e86e4f92..effb49a2 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op_blas3.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h b/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
index 8770f619..45e36028 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_direct_store.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
index e38e0ff6..ed87a9e3 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
index f3119fa4..10719f18 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
index 1d62f4fc..fb016937 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
index e1ae5a24..68a98f3f 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op_blas3.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
index f73edfde..2039fe1d 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h b/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
index b0e89a4e..f260a5b4 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h b/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
index 16e045e1..ef4fc038 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h b/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
index 34ecfb74..0e023c66 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
index 3b1c5dc1..dd7a071e 100644
--- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
index 2092caf4..030a9c1b 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
index e39ca9d5..39297f14 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
index 1eac4a18..3c381162 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
index 0dccf652..5f5cd47e 100644
--- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h b/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
index 11f89b65..07115e69 100644
--- a/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
+++ b/include/cutlass/epilogue/threadblock/direct_store_epilogue_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h
index 4a0c67ba..49143cf7 100644
--- a/include/cutlass/epilogue/threadblock/epilogue.h
+++ b/include/cutlass/epilogue/threadblock/epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h
index 30432e80..57ba7aab 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_base.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h b/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
index 294e9a51..14aac161 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_base_streamk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_depthwise.h b/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
index 83cbc8ab..76967410 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_depthwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_direct_store.h b/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
index 02de00dd..187d40c9 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_direct_store.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h b/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
index 486c0304..e8d6fbcc 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_gemm_k_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
index b294244c..7eb68f22 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h b/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
index 85ddae7c..73213557 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_smem_accumulator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h b/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
index aff05485..6a50a500 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_streamk_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h b/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
index 8202284b..8459a72a 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_visitor_with_softmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h b/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
index df5bbc5c..751ce50f 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h b/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
index d69f43c4..312d43c9 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h b/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
index 7f82bac7..5699a23e 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h b/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
index 6ab9cf06..e3e5abd0 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h b/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
index 027830c2..377524f7 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_with_visitor_callbacks.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h
index d41a0fa4..65bf32a5 100644
--- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h
+++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp b/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
index 8b1cd4fd..a5b26e08 100644
--- a/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
+++ b/include/cutlass/epilogue/threadblock/fusion/visitor_2x.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp b/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
index 69a0feab..6275a2ff 100644
--- a/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
+++ b/include/cutlass/epilogue/threadblock/fusion/visitor_compute.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp b/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
index 28d482b7..ab877d4d 100644
--- a/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
+++ b/include/cutlass/epilogue/threadblock/fusion/visitor_load.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp b/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
index dcec7ac8..7bc7f80f 100644
--- a/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
+++ b/include/cutlass/epilogue/threadblock/fusion/visitor_store.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/fusion/visitors.hpp b/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
index 96fbc01d..f1936f25 100644
--- a/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
+++ b/include/cutlass/epilogue/threadblock/fusion/visitors.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
index 305f5d78..ec717fbc 100644
--- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
+++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/output_iterator_parameter.h b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
index 73008827..6f6d101d 100644
--- a/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
+++ b/include/cutlass/epilogue/threadblock/output_iterator_parameter.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
index 8a88c0ab..2c011c1d 100644
--- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
+++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
index 9943ea25..7c4692ff 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
index 2b86ac0e..7068c394 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
index 7f7f17b5..9990dbdb 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_affine_layout_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
index c2583674..518ad090 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_blas3.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
index a59437c0..49ee22ef 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_conv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
index 8d7bf7ed..0d1f1711 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_direct_conv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
index 5e9aa22b..11ec3d72 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
index 2fbbc9a4..a4ed371f 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_predicates.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
index 94b71b9b..dfe9571e 100644
--- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
+++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator_strided_dgrad.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h
index ccdb4a9f..a321f1b6 100644
--- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h
+++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
index eef4d22b..66cc17f7 100644
--- a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
+++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
index 5af6997e..74d040ba 100644
--- a/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
+++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_pitch_linear.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
index 84a096c6..58ccbfac 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
index 13b00762..b03cab83 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h
index 92d3bf58..404be79f 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
index a69f0fd2..4c6f10b0 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
index 4979a380..fede5586 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
index 955409f3..bdd75a69 100644
--- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h
index b30bf19d..a1fa65ca 100644
--- a/include/cutlass/epilogue/warp/simt_policy.h
+++ b/include/cutlass/epilogue/warp/simt_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h
index b3f3a4f5..002d8591 100644
--- a/include/cutlass/epilogue/warp/tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/tensor_op_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h
index 0f470ff7..be7af135 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_simt.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
index 0bef0310..7cfa072c 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
index 3322a4c6..134e6686 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
index 8ce4750c..a18a9ac8 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
index 951833d4..8dbb1282 100644
--- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
+++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
index f6df868e..c108fc91 100644
--- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
index a09c1f79..01b1e72e 100644
--- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
+++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/experimental/distributed/device/detail.hpp b/include/cutlass/experimental/distributed/device/detail.hpp
new file mode 100644
index 00000000..129f7337
--- /dev/null
+++ b/include/cutlass/experimental/distributed/device/detail.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Distributed gemm device layer helpers.
+*/
+
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::device::detail {
+
+// Maps a CUDA runtime result to cutlass::Status. `inline`: this header may be included by multiple TUs.
+inline cutlass::Status check_cuda_status(cudaError_t status) {
+  if (status != cudaSuccess) {
+    auto result = cudaGetLastError();  // reads and clears the runtime's last recorded error
+    CUTLASS_TRACE_HOST("  error message: " << cudaGetErrorString(result));
+    return cutlass::Status::kErrorInternal;
+  }
+  return cutlass::Status::kSuccess;
+}
+
+// DistGemmBufferHelper: sizes and offsets of the per-operand buffers inside the buffer space.
+// Tiler_ supplies NumBuffers{A,B,C,D} and get_local_{a,b,c,d}_shape(problem_shape).
+template <
+  typename Tiler_,
+  typename ElementA_,
+  typename ElementB_,
+  typename ElementC_,
+  typename ElementD_>
+struct DistGemmBufferHelper {
+
+  using Tiler = Tiler_;
+
+  using ElementA = ElementA_;
+  using ElementB = ElementB_;
+  using ElementC = ElementC_;
+  using ElementD = ElementD_;
+
+  // Number of buffers kept per operand, as declared by the tiler.
+  static constexpr int NumBuffersA = Tiler::NumBuffersA;
+  static constexpr int NumBuffersB = Tiler::NumBuffersB;
+  static constexpr int NumBuffersC = Tiler::NumBuffersC;
+  static constexpr int NumBuffersD = Tiler::NumBuffersD;
+
+  // Size of buffer_A: NumBuffersA x (extents of the local A shape) x sizeof(ElementA).
+  template <typename ProblemShape>
+  static auto
+  get_buffer_size_a(ProblemShape problem_shape) {
+    auto shape_a = cute::make_shape(
+        NumBuffersA, Tiler::get_local_a_shape(problem_shape), sizeof(ElementA));
+    return size(cute::make_layout(shape_a));
+  }
+
+  // Size of buffer_B: NumBuffersB x (extents of the local B shape) x sizeof(ElementB).
+  template <typename ProblemShape>
+  static auto
+  get_buffer_size_b(ProblemShape problem_shape) {
+    auto shape_b = cute::make_shape(
+        NumBuffersB, Tiler::get_local_b_shape(problem_shape), sizeof(ElementB));
+    return size(cute::make_layout(shape_b));
+  }
+
+  // Size of buffer_C: NumBuffersC x (extents of the local C shape) x sizeof(ElementC).
+  template <typename ProblemShape>
+  static auto
+  get_buffer_size_c(ProblemShape problem_shape) {
+    auto shape_c = cute::make_shape(
+        NumBuffersC, Tiler::get_local_c_shape(problem_shape), sizeof(ElementC));
+    return size(cute::make_layout(shape_c));
+  }
+
+  // Size of buffer_D: NumBuffersD x (extents of the local D shape) x sizeof(ElementD).
+  template <typename ProblemShape>
+  static auto
+  get_buffer_size_d(ProblemShape problem_shape) {
+    auto shape_d = cute::make_shape(
+        NumBuffersD, Tiler::get_local_d_shape(problem_shape), sizeof(ElementD));
+    return size(cute::make_layout(shape_d));
+  }
+
+  // Sum of the sizes of every operand buffer whose buffer count is positive.
+  template <typename ProblemShape>
+  static auto
+  get_buffer_size(ProblemShape problem_shape) {
+    size_t total = 0;
+    if constexpr (NumBuffersA > 0) { total += get_buffer_size_a(problem_shape); }
+    if constexpr (NumBuffersB > 0) { total += get_buffer_size_b(problem_shape); }
+    if constexpr (NumBuffersC > 0) { total += get_buffer_size_c(problem_shape); }
+    if constexpr (NumBuffersD > 0) { total += get_buffer_size_d(problem_shape); }
+    return total;
+  }
+
+  // Layout of the buffer space (the offsets below are relative to its start):
+  //   |  buffer_A  |  buffer_B  |  buffer_C  |  buffer_D  |
+  // and each buffer_{A,B,C,D} is subdivided as:
+  //   |  iter 1  |  iter 2  | ... |  iter TP - 1 |
+  //
+  // Offset of buffer_A: it sits at the very start of the buffer space.
+  template <typename ProblemShape>
+  static size_t
+  get_buffer_offset_A(ProblemShape problem_shape) {
+    return 0;
+  }
+
+  // Offset of buffer_B: everything occupied by buffer_A.
+  template <typename ProblemShape>
+  static size_t
+  get_buffer_offset_B(ProblemShape problem_shape) {
+    return get_buffer_size_a(problem_shape);
+  }
+
+  // Offset of buffer_C: buffer_B's offset plus buffer_B's size.
+  template <typename ProblemShape>
+  static size_t
+  get_buffer_offset_C(ProblemShape problem_shape) {
+    return get_buffer_offset_B(problem_shape) + get_buffer_size_b(problem_shape);
+  }
+
+  // Offset of buffer_D: buffer_C's offset plus buffer_C's size.
+  template <typename ProblemShape>
+  static size_t
+  get_buffer_offset_D(ProblemShape problem_shape) {
+    return get_buffer_offset_C(problem_shape) + get_buffer_size_c(problem_shape);
+  }
+};
+
+} // namespace cutlass::distributed::device::detail
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/experimental/distributed/device/dist_gemm_universal_wrapper.hpp b/include/cutlass/experimental/distributed/device/dist_gemm_universal_wrapper.hpp
new file mode 100644
index 00000000..ad9abfb3
--- /dev/null
+++ b/include/cutlass/experimental/distributed/device/dist_gemm_universal_wrapper.hpp
@@ -0,0 +1,717 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Distributed GEMM Device Adapter
+
+  Sets up local GEMM stages, the CUDA graph, manages buffer and barrier spaces,
+  and maps arguments to per-stage arguments.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"
+#include "cutlass/gemm/gemm.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+
+#include "cutlass/experimental/distributed/device/full_barrier.hpp"
+#include "cutlass/experimental/distributed/device/detail.hpp"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::device {
+
+template <class GemmKernel_>
+class DistributedGemmUniversalAdapter {
+public:
+  using DeviceGemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel_>;
+  using GemmKernel = GemmKernel_;
+  using TileShape = typename GemmKernel::TileShape;
+  using ElementA = typename GemmKernel::ElementA;
+  using ElementB = typename GemmKernel::ElementB;
+  using ElementC = typename GemmKernel::ElementC;
+  using ElementD = typename GemmKernel::ElementD;
+  using ElementAccumulator = typename GemmKernel::ElementAccumulator;
+  using DispatchPolicy = typename GemmKernel::DispatchPolicy;
+  using CollectiveMainloop = typename GemmKernel::CollectiveMainloop;
+  using CollectiveEpilogue = typename GemmKernel::CollectiveEpilogue;
+
+  // "Inherit" type decls and static values from device GEMM
+  using LayoutA = typename DeviceGemm::LayoutA;
+  using LayoutB = typename DeviceGemm::LayoutB;
+  using LayoutC = typename DeviceGemm::LayoutC;
+  using LayoutD = typename DeviceGemm::LayoutD;
+
+  using StrideA = typename GemmKernel::StrideA;
+  using StrideB = typename GemmKernel::StrideB;
+  using StrideC = typename GemmKernel::StrideC;
+  using StrideD = typename GemmKernel::StrideD;
+
+  static bool const kEnableCudaHostAdapter = DeviceGemm::kEnableCudaHostAdapter;
+
+  static ComplexTransform const kTransformA = DeviceGemm::kTransformA;
+  static ComplexTransform const kTransformB = DeviceGemm::kTransformB;
+
+  using MathOperator = typename DeviceGemm::MathOperator;
+  using OperatorClass = typename DeviceGemm::OperatorClass;
+  using ArchTag = typename DeviceGemm::ArchTag;
+
+  using ThreadblockSwizzle = typename DeviceGemm::ThreadblockSwizzle;
+  using ThreadblockShape = typename DeviceGemm::ThreadblockShape;
+  using ClusterShape = typename DeviceGemm::ClusterShape;
+  using InstructionShape = typename DeviceGemm::InstructionShape;
+
+  static int const kThreadCount = DeviceGemm::kThreadCount;
+  static constexpr int WarpsInMma = DeviceGemm::WarpsInMma;
+  static constexpr int WarpsInMmaM = DeviceGemm::WarpsInMmaM;
+  static constexpr int WarpsInMmaN = DeviceGemm::WarpsInMmaN;
+
+  using WarpCount = typename DeviceGemm::WarpCount;
+  using WarpShape = typename DeviceGemm::WarpShape;
+
+  static int constexpr kStages = DeviceGemm::kStages;
+
+  static int constexpr kAlignmentA = DeviceGemm::kAlignmentA;
+  static int constexpr kAlignmentB = DeviceGemm::kAlignmentB;
+  static int constexpr kAlignmentC = DeviceGemm::kAlignmentC;
+  static int constexpr kAlignmentD = DeviceGemm::kAlignmentD;
+
+  using EpilogueOutputOp = typename DeviceGemm::EpilogueOutputOp;
+
+  static int constexpr kSplitKAlignment = DeviceGemm::kSplitKAlignment;
+
+  // Distributed GEMM types and defs
+  using DistSchedule = typename GemmKernel::DistSchedule;
+  static constexpr bool HasMemcpy = DistSchedule::HasMemcpy;
+  using TP = typename DistSchedule::TP;
+  static constexpr int TP_ = TP{};
+  using ElementFlag = typename GemmKernel::ElementFlag;
+  using ElementBarrier = uint32_t;
+
+  using BufferHelper = detail::DistGemmBufferHelper<
+    DistSchedule,
+    ElementA,
+    ElementB,
+    ElementC,
+    ElementD>;
+
+  /// Argument structure
+  using Arguments = typename GemmKernel::BaseArguments;
+  using DistributedArguments = typename GemmKernel::DistributedArguments;
+  using PackedArguments = typename GemmKernel::PackedArguments;
+
+  /// Argument structure: Kernel API
+  using Params = typename GemmKernel::PackedParams;
+
+  struct DistributedGemmState {
+    int device_idx = -1;                             // -1 until initialize() stores the device index
+
+    Params params_array[TP_];                        // kernel params, one entry per stage/iteration
+
+    cudaGraph_t graph = nullptr;                     // destroyed by construct_graph() when graph_created is set
+    cudaGraphExec_t graph_executable = nullptr;      // presumably valid once graph_instantiated -- TODO confirm
+
+    bool graph_created = false;
+    bool graph_instantiated = false;
+
+    void * memcpy_source_ptr_array[TP_] = {};        // local pointer per iteration; initialize() skips entry 0
+    void const * memcpy_remote_ptr_array[TP_] = {};  // peer pointer per iteration; initialize() skips entry 0
+    size_t memcpy_bytes[TP_] = {};                   // byte count per iteration; initialize() skips entry 0
+
+    cutlass::Array<ElementBarrier*, TP_> device_barrier_ptrs;  // start of each device's exclusive workspace
+
+    bool is_initialized = false;                     // set by initialize() before it constructs the graph
+  };
+
+private:
+
+  DistributedGemmState state_;
+
+public:
+  /// True once state is initialized and the CUDA graph is both created and instantiated.
+  bool is_initialized() const {
+    return state_.is_initialized && state_.graph_created && state_.graph_instantiated;
+  }
+
+  /// Determines whether the GEMM can execute the given problem.
+  ///
+  /// Rejects a non-zero beta when the schedule uses Remote C, problem shapes the schedule cannot
+  /// implement globally, and local problems the underlying kernel cannot implement.
+  static Status
+  can_implement(Arguments const& args) {
+    if (args.epilogue.thread.beta != 0.0 && DistSchedule::RemoteC) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Selected TP uses Remote C to communicate " <<
+          "partial results, which do not support non-zero values for beta yet " <<
+          "(epilogue must be sourceless.)\n");
+      return Status::kInvalid;
+    }
+
+    if (not DistSchedule::can_implement_global(args.problem_shape)) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem shape not divisible by TP.\n");
+      return Status::kInvalid;
+    }
+
+    // The kernel-level check receives the same local arguments for every one of the TP_ stages,
+    // so a single query suffices.
+    Arguments local_args = args;
+    local_args.problem_shape = DistSchedule::get_local_gemm_shape(args.problem_shape);
+    return GemmKernel::can_implement(local_args) ? Status::kSuccess : Status::kInvalid;
+  }
+
+  /// Size of the buffer space for the given problem: the total reported by BufferHelper,
+  /// passed through round_nearest with MinWorkspaceAlignment.
+  static size_t
+  get_buffer_space_size(Arguments const& args) {
+    // Unpadded total over the operand buffers.
+    size_t const raw_bytes = BufferHelper::get_buffer_size(args.problem_shape);
+
+    // initialize() places the per-stage kernel workspaces right after this many bytes.
+    return round_nearest(raw_bytes, MinWorkspaceAlignment);
+  }
+
+  /// Returns DistSchedule's A tensor for (device_idx, iteration), built from the device's A and buffer_A.
+  static auto get_tensor_A_for_iter(
+      Arguments const* args_array, void** buffer_space, int device_idx, int iteration) {
+    auto args = args_array[device_idx];
+    // View over the device's A pointer with the local A shape and the caller-provided stride.
+    auto local_shape_a = DistSchedule::get_local_a_shape(args.problem_shape);
+    auto tensor_A = make_tensor(args.mainloop.ptr_A, make_layout(local_shape_a, args.mainloop.dA));
+    // Start of buffer_A inside this device's buffer space.
+    auto* buffer_a = reinterpret_cast<uint8_t*>(buffer_space[device_idx]) +
+      BufferHelper::get_buffer_offset_A(args.problem_shape);
+    return DistSchedule::get_tensor_A(tensor_A, buffer_a, device_idx, iteration);
+  }
+
+  /// Returns DistSchedule's B tensor for (device_idx, iteration), built from the device's B and buffer_B.
+  static auto get_tensor_B_for_iter(
+      Arguments const* args_array, void** buffer_space, int device_idx, int iteration) {
+    auto args = args_array[device_idx];
+    // View over the device's B pointer with the local B shape and the caller-provided stride.
+    auto local_shape_b = DistSchedule::get_local_b_shape(args.problem_shape);
+    auto tensor_B = make_tensor(args.mainloop.ptr_B, make_layout(local_shape_b, args.mainloop.dB));
+    // Start of buffer_B inside this device's buffer space.
+    auto* buffer_b = reinterpret_cast<uint8_t*>(buffer_space[device_idx]) +
+      BufferHelper::get_buffer_offset_B(args.problem_shape);
+    return DistSchedule::get_tensor_B(tensor_B, buffer_b, device_idx, iteration);
+  }
+
+  /// Returns DistSchedule's C tensor for (device_idx, iteration). With Remote C the buffer space
+  /// used is that of the schedule's remote peer for this iteration, otherwise this device's own.
+  static auto get_tensor_C_for_iter(
+      Arguments const* args_array, void** buffer_space, int device_idx, int iteration) {
+    auto args = args_array[device_idx];
+    auto local_shape_c = DistSchedule::get_local_c_shape(args.problem_shape);
+    auto tensor_C = make_tensor(args.epilogue.ptr_C, make_layout(local_shape_c, args.epilogue.dC));
+
+    // Pick whose buffer space provides buffer_C for this iteration.
+    auto peer_idx_iter = DistSchedule::get_remote_peer_id(device_idx, iteration);
+    void* owner_space = DistSchedule::RemoteC ? buffer_space[peer_idx_iter] : buffer_space[device_idx];
+    auto* buffer_c = reinterpret_cast<uint8_t*>(owner_space) +
+      BufferHelper::get_buffer_offset_C(args.problem_shape);
+    return DistSchedule::get_tensor_C(tensor_C, buffer_c, device_idx, iteration);
+  }
+
+  /// Returns DistSchedule's D tensor for (device_idx, iteration), built from the device's D and buffer_D.
+  static auto get_tensor_D_for_iter(
+      Arguments const* args_array, void** buffer_space, int device_idx, int iteration) {
+    auto args = args_array[device_idx];
+    auto local_shape_d = DistSchedule::get_local_d_shape(args.problem_shape);
+    auto tensor_D = make_tensor(args.epilogue.ptr_D, make_layout(local_shape_d, args.epilogue.dD));
+
+    // NOTE(review): the original note here read "support remoteD" -- presumably a TODO; as written,
+    // only this device's own buffer space is used for D.
+    auto* buffer_d = reinterpret_cast<uint8_t*>(buffer_space[device_idx]) +
+      BufferHelper::get_buffer_offset_D(args.problem_shape);
+    return DistSchedule::get_tensor_D(tensor_D, buffer_d, device_idx, iteration);
+  }
+
+  /// Workspace bytes needed per device: the buffer space followed by one kernel workspace
+  /// for each of the TP_ stages.
+  static size_t
+  get_workspace_size(Arguments const& args) {
+    // NOTE: assumes underlying kernels align up to alignment requirements on their own,
+    // and that the alignment requirements of the individual kernels match.
+    size_t const per_stage_bytes = GemmKernel::get_workspace_size(args);
+
+    size_t total_bytes = get_buffer_space_size(args);
+    for (int stage = 0; stage < TP_; ++stage) {
+      total_bytes += per_stage_bytes;
+    }
+    return total_bytes;
+  }
+
+  /// Bytes reserved for one ElementBarrier, passed through round_nearest with 32.
+  static size_t get_barrier_bytes() {
+    return round_nearest(sizeof(ElementBarrier), 32);
+  }
+
+  /// Bytes reserved for TP_ ElementFlag entries, passed through round_nearest with 32.
+  static size_t get_flag_bytes() {
+    return round_nearest(sizeof(ElementFlag) * TP_, 32);
+  }
+
+  /// Address of the flag for `iteration`: flags follow the barrier bytes in the exclusive workspace.
+  static void *
+  exclusive_workspace_ptr_to_flag_ptr(void * exclusive_workspace_ptr, int iteration) {
+    auto* flags_begin = static_cast<uint8_t*>(exclusive_workspace_ptr) + get_barrier_bytes();
+    auto* flag = flags_begin + (sizeof(ElementFlag) * iteration);
+    return static_cast<void*>(flag);
+  }
+
+  /// Exclusive workspace bytes per device: the barrier bytes followed by the flag bytes.
+  static size_t get_exclusive_workspace_size() {
+    return get_barrier_bytes() + get_flag_bytes();
+  }
+
+  /// Initializes GEMM state from arguments: zeroes this device's exclusive workspace, builds the
+  /// kernel params of all TP_ stages, records peer memcpy info, then constructs the CUDA graph.
+  /// `args`, `workspace_ptrs` and `exclusive_workspace_ptrs` are arrays indexed by device.
+  Status
+  initialize(
+    Arguments const* args,
+    void** workspace_ptrs,
+    void** exclusive_workspace_ptrs,
+    int device_idx,
+    cudaStream_t stream = nullptr,
+    bool launch_with_pdl = false) {
+
+    CUTLASS_TRACE_HOST("DistributedGemm::initialize() - stream: " << (stream ? "non-null" : "null"));
+
+    state_.device_idx = device_idx;
+
+    for (int device = 0; device < TP_; ++device) {
+      state_.device_barrier_ptrs[device] = reinterpret_cast<ElementBarrier*>(exclusive_workspace_ptrs[device]);
+    }
+
+    // Zero out exclusive workspace; a failure would leave stale barrier/flag values behind.
+    Status zero_status = zero_workspace(exclusive_workspace_ptrs[device_idx], get_exclusive_workspace_size(), stream, nullptr);
+    if (zero_status != Status::kSuccess) {
+      return zero_status;
+    }
+
+    for (int iteration = 0; iteration < TP_; ++iteration) {
+
+      size_t workspace_iteration_offset = GemmKernel::get_workspace_size(args[device_idx]);
+      uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace_ptrs[device_idx]) +
+        get_buffer_space_size(args[device_idx]) +
+        (iteration * workspace_iteration_offset);
+
+      void * workspace_iter = reinterpret_cast<void*>(workspace_ptr);
+      void** buffer_space = workspace_ptrs;
+
+      // Set up GEMM arguments for the current stage/iteration
+      auto tensor_a_iter = get_tensor_A_for_iter(args, buffer_space, device_idx, iteration);
+      auto tensor_b_iter = get_tensor_B_for_iter(args, buffer_space, device_idx, iteration);
+      auto tensor_c_iter = get_tensor_C_for_iter(args, buffer_space, device_idx, iteration);
+      auto tensor_d_iter = get_tensor_D_for_iter(args, buffer_space, device_idx, iteration);
+
+      Arguments base_args = args[device_idx];
+      base_args.problem_shape = DistSchedule::get_local_gemm_shape(args[device_idx].problem_shape);
+      base_args.mainloop = {
+        reinterpret_cast<const ElementA*>(tensor_a_iter.data()),
+        tensor_a_iter.stride(),
+        reinterpret_cast<const ElementB*>(tensor_b_iter.data()),
+        tensor_b_iter.stride()
+      };
+      base_args.epilogue = {
+        base_args.epilogue.thread,
+        reinterpret_cast<const ElementC*>(tensor_c_iter.data()),
+        tensor_c_iter.stride(),
+        reinterpret_cast<const ElementD*>(tensor_d_iter.data()),
+        tensor_d_iter.stride()
+      };
+
+      if constexpr (DistSchedule::RemoteC) {
+        if (iteration > 0) {
+          base_args.epilogue.thread.beta = 1.0;
+        }
+        else if (iteration == 0){
+          base_args.epilogue.thread.beta = 0.0;
+        }
+      }
+
+      auto [left_peer_idx, right_peer_idx] = DistSchedule::get_peers_for_device(device_idx);
+      auto flag_peer_idx = DistSchedule::KernelWritesArrivalFlag ? right_peer_idx : device_idx;
+
+      void * self_flag_ptr = exclusive_workspace_ptr_to_flag_ptr(exclusive_workspace_ptrs[device_idx], iteration);
+      void * peer_flag_ptr = exclusive_workspace_ptr_to_flag_ptr(exclusive_workspace_ptrs[flag_peer_idx], iteration);
+
+      DistributedArguments distributed_args = {
+        device_idx,
+        iteration,
+        self_flag_ptr,
+        peer_flag_ptr
+      };
+      PackedArguments args_iter = {base_args, distributed_args};
+
+      // Initialize the workspace
+      Status status = GemmKernel::initialize_workspace(args_iter, workspace_iter, stream);
+      if (status != Status::kSuccess) {
+        return status;
+      }
+
+      // Initialize the Params structure
+      state_.params_array[iteration] = GemmKernel::to_underlying_arguments(args_iter, workspace_iter);
+
+      // Set up peer buffer ptrs
+      if (iteration > 0 && HasMemcpy) {
+        auto peer_idx_iter = DistSchedule::get_remote_peer_id(device_idx, iteration);
+
+        void * local_ptr_itr = nullptr;
+        void const * remote_ptr_itr = nullptr;
+        size_t local_size = 0;
+        size_t remote_size = 0;
+
+        static_assert(not DistSchedule::HasMemcpy || (
+              DistSchedule::MemcpyA || DistSchedule::MemcpyB),
+            "Expected to either memcpy A or B when scheduler requires memcpy.");
+        if constexpr (DistSchedule::MemcpyA) {
+          local_size = cute::cosize(tensor_a_iter.layout()) * sizeof(ElementA);
+          local_ptr_itr = reinterpret_cast<void*>(tensor_a_iter.data());
+
+          // Copy peer's slice in the first iteration (direct access memcpy instead of logical ring)
+          auto remote_tensor_iter = get_tensor_A_for_iter(args, buffer_space, peer_idx_iter, 0);
+          remote_ptr_itr = reinterpret_cast<void const*>(remote_tensor_iter.data());
+          remote_size = cute::cosize(remote_tensor_iter.layout()) * sizeof(ElementA);
+        }
+        else if constexpr (DistSchedule::MemcpyB) {
+          local_size = cute::cosize(tensor_b_iter.layout()) * sizeof(ElementB);
+          local_ptr_itr = reinterpret_cast<void*>(tensor_b_iter.data());
+
+          // Copy peer's slice in the first iteration (direct access memcpy instead of logical ring)
+          auto remote_tensor_iter = get_tensor_B_for_iter(args, buffer_space, peer_idx_iter, 0);
+          remote_ptr_itr = reinterpret_cast<void const*>(remote_tensor_iter.data());
+          remote_size = cute::cosize(remote_tensor_iter.layout()) * sizeof(ElementB);
+        }
+
+        assert(local_size == remote_size && local_size > 0);
+
+        state_.memcpy_source_ptr_array[iteration] = local_ptr_itr;
+        state_.memcpy_remote_ptr_array[iteration] = remote_ptr_itr;
+        state_.memcpy_bytes[iteration] = local_size;
+      }
+    }
+
+    //
+    // Account for dynamic smem capacity if needed
+    //
+    int smem_size = GemmKernel::SharedStorageSize;
+
+    if (smem_size >= (48 << 10)) {
+      CUTLASS_TRACE_HOST("  Setting smem size to " << smem_size);
+      cudaError_t result = cudaFuncSetAttribute(
+          device_kernel<GemmKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size);
+      if (cudaSuccess != result) {
+        result = cudaGetLastError(); // to clear the error bit
+        CUTLASS_TRACE_HOST("  cudaFuncSetAttribute() returned error: " << cudaGetErrorString(result));
+        return Status::kErrorInternal;
+      }
+    }
+
+    state_.is_initialized = true;
+
+    // Instantiate graph; its status is the result of initialization.
+    return construct_graph(launch_with_pdl);
+  }
+
+  /// Builds (or rebuilds) the CUDA graph that executes every iteration of this Distributed GEMM.
+  ///
+  /// The graph is assembled with stream capture on a temporary stream:
+  ///   1. a single full-barrier kernel node,
+  ///   2. when HasMemcpy is set, one memcpy + flag memset per iteration > 0, depending on the barrier,
+  ///   3. the per-iteration GEMM launches, attached to the barrier through a programmatic edge.
+  /// The graph is then instantiated into state_.graph_executable.
+  ///
+  /// Any graph / executable left over from a previous call is destroyed first. The temporary
+  /// stream is released on every exit path.
+  ///
+  /// \param launch_with_pdl forwarded to the barrier launch and to every DeviceGemm::run call.
+  /// \return Status::kSuccess when the executable graph is ready; otherwise the first failing status.
+  ///         Always Status::kErrorInternal when compiled with a CUDA toolkit older than 12.4.
+  Status
+  construct_graph(bool launch_with_pdl) {
+// NOTE: compare (major, minor) as a pair; testing "minor >= 4" on its own rejects e.g. CUDA 13.0.
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 4)))
+    Status status = Status::kSuccess;
+
+    // Destroy existing executable graph, if instantiated, so that it is not leaked when
+    // state_.graph_executable is overwritten below.
+    if (state_.graph_instantiated) {
+      state_.graph_instantiated = false;
+      status = detail::check_cuda_status(cudaGraphExecDestroy(state_.graph_executable));
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+
+    // Destroy existing graph, if created
+    if (state_.graph_created) {
+      state_.graph_created = false;
+      status = detail::check_cuda_status(cudaGraphDestroy(state_.graph));
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+
+    // Owns the temporary capture stream so that early returns do not leak it.
+    struct ScopedStream {
+      cudaStream_t handle = nullptr;
+      ~ScopedStream() {
+        if (handle != nullptr) {
+          cudaStreamDestroy(handle);
+        }
+      }
+    } capture_stream;
+
+    // Create dummy stream
+    status = detail::check_cuda_status(cudaStreamCreate(&capture_stream.handle));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    cudaStream_t stream = capture_stream.handle;
+
+    // Create graph
+    status = detail::check_cuda_status(cudaGraphCreate(&state_.graph, 0));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    // Only mark the graph as created once the handle is actually valid.
+    state_.graph_created = true;
+
+    // 1. Full barrier node
+    status = detail::check_cuda_status(cudaStreamBeginCaptureToGraph(
+          stream,
+          state_.graph,
+          nullptr, nullptr, 0,
+          cudaStreamCaptureModeRelaxed));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    cutlass::Array<ElementFlag*, TP_> self_flag_ptrs;
+    for (int iteration = 0; iteration < TP_; ++iteration) {
+      self_flag_ptrs[iteration] = state_.params_array[iteration].distributed.self_flag_ptr_;
+    }
+
+    // A failed launch captures no node, which is caught by the node count check below.
+    launch_full_barrier<TP_, ElementBarrier, TP_, ElementFlag>(
+        state_.device_barrier_ptrs, self_flag_ptrs, state_.device_idx, stream, launch_with_pdl);
+
+    status = detail::check_cuda_status(cudaStreamEndCapture(stream, &state_.graph));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    // The barrier must be the one and only node at this point; fetch its handle.
+    size_t num_nodes = 0;
+    status = detail::check_cuda_status(cudaGraphGetNodes(state_.graph, nullptr, &num_nodes));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    if (num_nodes != 1) {
+      CUTLASS_TRACE_HOST("  construct_graph() failure: expected a single node in the graph, got " << num_nodes << ".");
+      return Status::kErrorInternal;
+    }
+
+    cudaGraphNode_t full_barrier_node = nullptr;
+    status = detail::check_cuda_status(cudaGraphGetNodes(state_.graph, &full_barrier_node, &num_nodes));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    // 2. Optional mem copy branch
+    if constexpr (HasMemcpy) {
+
+      status = detail::check_cuda_status(cudaStreamBeginCaptureToGraph(
+            stream,
+            state_.graph,
+            &full_barrier_node,
+            /* dependencyData = */ nullptr,
+            1,
+            cudaStreamCaptureModeRelaxed));
+
+      if (status != Status::kSuccess) {
+        return status;
+      }
+
+      // No copies for first iter; we assume the data is already there.
+      for (int iteration = 1; iteration < TP_; ++iteration) {
+
+        status = detail::check_cuda_status(cudaMemcpyAsync(
+              state_.memcpy_source_ptr_array[iteration],
+              state_.memcpy_remote_ptr_array[iteration],
+              state_.memcpy_bytes[iteration],
+              cudaMemcpyDeviceToDevice, stream));
+
+        if (status != Status::kSuccess) {
+          return status;
+        }
+
+        // Set flag to non zero (memset is byte-wise, so every byte of the flag becomes 0xFF)
+        status = detail::check_cuda_status(cudaMemsetAsync(
+              reinterpret_cast<void *>(state_.params_array[iteration].distributed.peer_flag_ptr_),
+              0b11111111,
+              sizeof(ElementFlag),
+              stream));
+
+        if (status != Status::kSuccess) {
+          return status;
+        }
+      }
+
+      status = detail::check_cuda_status(cudaStreamEndCapture(stream, &state_.graph));
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+
+    // 3. Run local GEMMs
+    // 3.1. Create edge between full barrier and the correct gemm stage/iteration
+    cudaGraphEdgeData barrier_to_gemm_edge = {};
+    barrier_to_gemm_edge.from_port = HasMemcpy ? cudaGraphKernelNodePortLaunchCompletion: cudaGraphKernelNodePortProgrammatic;
+    barrier_to_gemm_edge.type = cudaGraphDependencyTypeProgrammatic;
+
+    status = detail::check_cuda_status(cudaStreamBeginCaptureToGraph(
+          stream,
+          state_.graph,
+          &full_barrier_node,
+          /* dependencyData = */ &barrier_to_gemm_edge,
+          1,
+          cudaStreamCaptureModeRelaxed));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    for (int iteration = 0; iteration < TP_; ++iteration) {
+      status = DeviceGemm::run(
+            state_.params_array[iteration],
+            stream,
+            /* cuda_adapter = */ nullptr,
+            /* launch_with_pdl = */ launch_with_pdl);
+
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+
+    status = detail::check_cuda_status(cudaStreamEndCapture(stream, &state_.graph));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    // 4. Cleanup.
+    //// Destroy dummy stream explicitly so that a failure is reported; disarm the guard first.
+    capture_stream.handle = nullptr;
+    status = detail::check_cuda_status(cudaStreamDestroy(stream));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+
+    // 5. Instantiate graph
+    status = detail::check_cuda_status(cudaGraphInstantiate(
+          &state_.graph_executable,
+          state_.graph,
+          /* flags = */ 0));
+    if (status != Status::kSuccess) {
+      return status;
+    }
+    state_.graph_instantiated = true;
+
+    return Status::kSuccess;
+#else
+      CUTLASS_TRACE_HOST("  construct_graph() failure: target was compiled with an incompatible " <<
+          "version of the CUDA toolkit. Please compile Distributed GEMM with CUDA toolkit 12.4 or later.");
+      return Status::kErrorInternal;
+#endif
+  }
+
+  /// Argument updates are not implemented: logs a trace message and always reports
+  /// Status::kErrorInternal. Neither parameter is read.
+  Status
+  update(Arguments const& args, void* workspace = nullptr) {
+    static_cast<void>(args);
+    static_cast<void>(workspace);
+
+    CUTLASS_TRACE_HOST("  DistributedGemm does not support updating arguments yet.");
+
+    Status const unsupported = Status::kErrorInternal;
+    return unsupported;
+  }
+
+  // NOTE: the interface for run() is different in Distributed Gemm:
+  //   1. launch_with_pdl is specified in `initialize`, where the cuda graph is being constructed,
+  //   2. the state of distributed gemm is an array of params for different iterations, and a
+  //      cuda graph.
+  //   3. Custom cuda adapters aren't supported for simplicity.
+  /// Launches the already-instantiated executable graph held by `state` on `stream`.
+  ///
+  /// \return Status::kErrorInternal when the state is not initialized, when its graph has not
+  ///         been instantiated, or when cudaGraphLaunch fails; Status::kSuccess otherwise.
+  static Status
+  run(DistributedGemmState& state,
+      cudaStream_t stream = nullptr) {
+    CUTLASS_TRACE_HOST("DistributedGemm::run()");
+
+    // Guard: initialize() must have completed.
+    if (!state.is_initialized) {
+      CUTLASS_TRACE_HOST("  Distributed gemm was not initialized. Did you forget to call initialize()?");
+      return Status::kErrorInternal;
+    }
+
+    // Guard: the graph must have been instantiated.
+    if (!state.graph_instantiated) {
+      CUTLASS_TRACE_HOST("  Distributed gemm graph was not instantiated. Did you forget to call initialize()/construct_graph()?");
+      return Status::kErrorInternal;
+    }
+
+    cudaError_t launch_err = cudaGraphLaunch(state.graph_executable, stream);
+    if (launch_err == cudaSuccess) {
+      return Status::kSuccess;
+    }
+
+    launch_err = cudaGetLastError(); // to clear the error bit
+    CUTLASS_TRACE_HOST("  cudaGraphLaunch() returned error: " << cudaGetErrorString(launch_err));
+    return Status::kErrorInternal;
+  }
+
+  //
+  // Non-static launch overloads that first create and set the internal params struct of this kernel handle.
+  //
+
+  /// Re-launches the graph stored in this handle's internal state on `stream`;
+  /// the internal params are left untouched.
+  Status
+  run(
+    cudaStream_t stream = nullptr) {
+    Status const launch_status = run(state_, stream);
+    return launch_status;
+  }
+
+  /// Call-operator form of run(stream): re-launches the graph stored in this handle's
+  /// internal state without touching the internal params.
+  Status
+  operator()(cudaStream_t stream = nullptr) {
+    Status const launch_status = run(state_, stream);
+    return launch_status;
+  }
+
+  /// Initializes the internal state from the supplied arguments and, if that succeeds,
+  /// launches on `stream`. Returns the initialization status on failure, otherwise the
+  /// launch status.
+  Status
+  run(
+    Arguments const* args,
+    void** workspace_ptrs,
+    void** exclusive_workspace_ptrs,
+    int device_idx,
+    cudaStream_t stream = nullptr) {
+    Status const init_status = initialize(
+        args,
+        workspace_ptrs,
+        exclusive_workspace_ptrs,
+        device_idx,
+        stream);
+
+    // Do not launch anything when initialization did not succeed.
+    if (init_status != Status::kSuccess) {
+      return init_status;
+    }
+
+    return run(stream);
+  }
+
+  /// Call-operator form of run(args, ...): forwards every argument unchanged and returns
+  /// whatever that overload returns.
+  Status
+  operator()(
+    Arguments const* args,
+    void** workspace_ptrs,
+    void** exclusive_workspace_ptrs,
+    int device_idx,
+    cudaStream_t stream = nullptr) {
+    Status const result = run(
+        args,
+        workspace_ptrs,
+        exclusive_workspace_ptrs,
+        device_idx,
+        stream);
+    return result;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::distributed::device
+
+////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/experimental/distributed/device/full_barrier.hpp b/include/cutlass/experimental/distributed/device/full_barrier.hpp
new file mode 100644
index 00000000..8ac9940e
--- /dev/null
+++ b/include/cutlass/experimental/distributed/device/full_barrier.hpp
@@ -0,0 +1,74 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Device layer interface for Distributed GEMM barrier kernel.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/experimental/distributed/kernel/full_barrier.hpp"
+
+namespace cutlass::distributed::device {
+
+/// Enqueues full_barrier_kernel on `stream` with a 1x1 launch configuration (one block, one thread).
+///
+/// \param device_arrival_ptrs  forwarded to the kernel as its arrival-counter pointers (NP entries).
+/// \param iteration_flag_ptrs  forwarded to the kernel as its per-iteration flag pointers.
+/// \param device_idx           forwarded to the kernel as the index of the calling device.
+/// \param stream               stream the kernel is launched (or captured) on.
+/// \param launch_with_pdl      when true, the launch carries the programmatic stream
+///                             serialization attribute; when false no attribute is passed.
+///
+/// The function returns nothing; the result of cudaLaunchKernelEx is not inspected here.
+/// When compiled with a CUDA toolkit older than 12.4 the function is a no-op.
+template <int NP, typename IntType, int Iterations, typename FlagType>
+void launch_full_barrier(
+    cutlass::Array<IntType*, NP> device_arrival_ptrs,
+    cutlass::Array<FlagType*, Iterations> iteration_flag_ptrs,
+    IntType device_idx,
+    cudaStream_t stream,
+    bool launch_with_pdl) {
+
+// NOTE: compare (major, minor) as a pair; testing "minor >= 4" on its own rejects e.g. CUDA 13.0.
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 4)))
+  // Legacy (kernel) launch with PDL
+  cudaLaunchAttribute attributes[1] = {};
+  attributes[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attributes[0].val.programmaticStreamSerializationAllowed = 1;
+
+  // Value-initialize so that no field is left indeterminate.
+  cudaLaunchConfig_t launch_config = {};
+  launch_config.gridDim = 1;
+  launch_config.blockDim = 1;
+  launch_config.dynamicSmemBytes = 0;
+  launch_config.stream = stream;
+  launch_config.attrs = attributes;
+  // The attribute array is always populated; it is only exposed when PDL is requested.
+  launch_config.numAttrs = launch_with_pdl ? 1 : 0;
+
+  cudaLaunchKernelEx(
+      &launch_config,
+      cutlass::distributed::kernel::full_barrier_kernel<NP, IntType, Iterations, FlagType>,
+      device_arrival_ptrs,
+      iteration_flag_ptrs,
+      device_idx);
+#endif
+}
+
+} // namespace cutlass::distributed::device
+
diff --git a/include/cutlass/experimental/distributed/kernel/detail.hpp b/include/cutlass/experimental/distributed/kernel/detail.hpp
new file mode 100644
index 00000000..0445567e
--- /dev/null
+++ b/include/cutlass/experimental/distributed/kernel/detail.hpp
@@ -0,0 +1,72 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Distributed gemm kernel layer helpers.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::kernel::detail {
+
+// Ld with CV cache hint (don’t cache and fetch again)
+// Reference:
+// https://docs.nvidia.com/cuda/parallel-thread-execution/#cache-operators
+// Used for loading arrival counts from peer devices
+
+/// Loads a 64-bit value from global memory at `ptr` into `val` using `ld.global.cv`.
+///
+/// The "memory" clobber makes the statement a compiler-level barrier, so ordinary memory
+/// accesses are not moved across this polling load by the compiler. It is not a hardware fence.
+CUTLASS_DEVICE
+void ld_without_cache(uint64_t& val, void const * ptr) {
+  asm volatile(
+      "{\n"
+      "  ld.global.cv.u64 %0, [%1];\n"
+      "}\n"
+      : "=l"(val)
+      : "l"(ptr)
+      : "memory");
+}
+
+/// Loads a 32-bit value from global memory at `ptr` into `val` using `ld.global.cv`.
+///
+/// The "memory" clobber makes the statement a compiler-level barrier, so ordinary memory
+/// accesses are not moved across this polling load by the compiler. It is not a hardware fence.
+CUTLASS_DEVICE
+void ld_without_cache(uint32_t& val, void const * ptr) {
+  asm volatile(
+      "{\n"
+      "  ld.global.cv.u32 %0, [%1];\n"
+      "}\n"
+      : "=r"(val)
+      : "l"(ptr)
+      : "memory");
+}
+
+} // namespace cutlass::distributed::kernel::detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+
diff --git a/include/cutlass/experimental/distributed/kernel/dist_gemm_kernel_wrapper.hpp b/include/cutlass/experimental/distributed/kernel/dist_gemm_kernel_wrapper.hpp
new file mode 100644
index 00000000..a9a40cfe
--- /dev/null
+++ b/include/cutlass/experimental/distributed/kernel/dist_gemm_kernel_wrapper.hpp
@@ -0,0 +1,235 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file Distributed GEMM Kernel Wrapper
+
+  Prepends CUTLASS 3 GEMM kernels with barriers and other necessary instructions to execute
+  a Distributed GEMM stage.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/grid_dependency_control.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/experimental/distributed/kernel/detail.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::kernel {
+
+namespace detail {
+
+// Allow all CUTLASS 3.X GEMM kernels
+// Trait used by DistributedGemmKernelWrapper to enable its specialization: it simply
+// inherits from cutlass::gemm::detail::IsCutlass3GemmKernel for the given kernel type.
+template <typename GemmKernel_>
+struct SupportsDistributedGemm: cutlass::gemm::detail::IsCutlass3GemmKernel<GemmKernel_> {};
+
+} // namespace detail
+
+/*!
+  DistributedGemmKernelWrapper is a wrapper around a GEMM kernel.
+
+  Depending on the underlying distribution policy/schedule, it prepends the underlying local GEMM
+  kernel with a few additional instructions that gate the execution of the GEMM on buffers being
+  ready for stages/iterations > 0.
+*/
+
+template <class GemmKernel_, class DistSchedule_, class Enable = void>
+struct DistributedGemmKernelWrapper;
+
+template <class GemmKernel_, class DistSchedule_>
+struct DistributedGemmKernelWrapper<
+  GemmKernel_,
+  DistSchedule_,
+  cute::enable_if_t<detail::SupportsDistributedGemm<GemmKernel_>::value>
+  >: GemmKernel_
+{
+  using DistSchedule = DistSchedule_;
+  using TP = typename DistSchedule::TP;
+
+  static constexpr bool KernelWritesArrivalFlag = DistSchedule::KernelWritesArrivalFlag;
+
+  using BaseKernel = GemmKernel_;
+  using BaseArguments = typename BaseKernel::Arguments;
+  using BaseParams = typename BaseKernel::Params;
+
+  static_assert(BaseKernel::ArchTag::kMinComputeCapability == 90, "DistGEMM only supports Hopper GEMMs for now.");
+  static_assert(not cute::is_same_v<typename BaseKernel::ElementC, void>, "DistributedGEMM epilogues must have a source.");
+
+  using ElementFlag = uint32_t;
+
+  // Device side arguments
+  struct DistributedArguments {
+    int device_idx = 0;
+    int iteration = 0;
+
+    void* self_flag_ptr{nullptr};
+    void* peer_flag_ptr{nullptr};
+  };
+
+  struct PackedArguments {
+    BaseArguments base{};
+    DistributedArguments distributed{};
+  };
+
+  struct DistributedParams {
+    int device_idx = 0;
+    int iteration = 0;
+
+    ElementFlag* self_flag_ptr_{nullptr};
+    ElementFlag* peer_flag_ptr_{nullptr};
+  };
+
+  // Kernel entry point API
+  struct PackedParams {
+    BaseParams base{};
+    DistributedParams distributed{};
+  };
+
+  using Params = PackedParams;
+
+  // Convert to underlying arguments. In this case, a simple copy for the aliased type.
+  static
+  PackedParams
+  to_underlying_arguments(PackedArguments const& args, void* workspace) {
+    CUTLASS_TRACE_HOST("distributed::to_underlying_arguments():");
+
+    auto kernel_params = BaseKernel::to_underlying_arguments(args.base, workspace);
+
+    DistributedParams dist_params = {
+        args.distributed.device_idx,
+        args.distributed.iteration,
+        reinterpret_cast<ElementFlag*>(args.distributed.self_flag_ptr),
+        reinterpret_cast<ElementFlag*>(args.distributed.peer_flag_ptr)
+    };
+
+    return {kernel_params, dist_params};
+  }
+
+  static bool
+  can_implement(BaseArguments const& args) {
+    return BaseKernel::can_implement(args);
+  }
+
+  static bool
+  can_implement(PackedArguments const& args) {
+    return BaseKernel::can_implement(args.base);
+  }
+
+  static size_t
+  get_workspace_size(BaseArguments const& args) {
+    return BaseKernel::get_workspace_size(args);
+  }
+
+  static size_t
+  get_workspace_size(PackedArguments const& args) {
+    return BaseKernel::get_workspace_size(args.base);
+  }
+
+  static cutlass::Status
+  initialize_workspace(BaseArguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return BaseKernel::initialize_workspace(args, workspace, stream, cuda_adapter);
+  }
+
+  static cutlass::Status
+  initialize_workspace(PackedArguments const& args, void* workspace = nullptr, cudaStream_t stream = nullptr,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return BaseKernel::initialize_workspace(args.base, workspace, stream, cuda_adapter);
+  }
+
+  /// Computes the grid shape
+  static dim3
+  get_grid_shape(PackedParams const& params) {
+    return BaseKernel::get_grid_shape(params.base);
+  }
+  
+  static dim3
+  get_grid_shape(BaseParams const& params) {
+    return BaseKernel::get_grid_shape(params);
+  }
+
+  CUTLASS_DEVICE
+  void
+  barrier_buffer(PackedParams const& params) {
+    if (params.distributed.iteration > 0) {
+
+      ElementFlag comm_iter = 0;
+      detail::ld_without_cache(comm_iter, params.distributed.self_flag_ptr_);
+      while (comm_iter == 0) {
+        detail::ld_without_cache(comm_iter, params.distributed.self_flag_ptr_);
+        __nanosleep(40);
+      }
+
+    }
+  }
+
+  CUTLASS_DEVICE
+  void
+  maybe_signal_arrival(PackedParams const& params) {
+    if constexpr (KernelWritesArrivalFlag) {
+      if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 &&
+          threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0 &&
+          params.distributed.iteration > 0) {
+        *reinterpret_cast<ElementFlag*>(params.distributed.peer_flag_ptr_) = 1;
+      }
+    }
+  }
+
+  CUTLASS_DEVICE
+  void
+  operator()(PackedParams const& params, char* smem_buf) {
+    // Launch next grid as soon as possible
+    arch::launch_dependent_grids();
+
+    // Wait on previous kernels to flush their memory.
+    arch::wait_on_dependent_grids();
+
+    // Optionally write arrivals for the previous stage/iteration.
+    maybe_signal_arrival(params);
+
+    // Spin-wait on an arrival flag, make sure the respective buffers are ready.
+    // If the buffered operand is memcpied into, it would wait on its local flag.
+    // If it's a remote buffer that is accessed directly, it would wait on its remote flag.
+    barrier_buffer(params);
+
+    // Perform local gemm
+    BaseKernel gemm;
+    gemm(params.base, smem_buf);
+  }
+
+};
+
+} // namespace cutlass::distributed::kernel
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/experimental/distributed/kernel/full_barrier.hpp b/include/cutlass/experimental/distributed/kernel/full_barrier.hpp
new file mode 100644
index 00000000..0ec620a5
--- /dev/null
+++ b/include/cutlass/experimental/distributed/kernel/full_barrier.hpp
@@ -0,0 +1,82 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Distributed GEMM barrier kernel.
+
+    The kernel resets the per-stage arrival flags, performs a full barrier (any-to-any),
+    and also atomically resets the local barrier arrival count.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/grid_dependency_control.h"
+
+#include "cutlass/experimental/distributed/kernel/detail.hpp"
+
+namespace cutlass::distributed::kernel {
+
+/// Any-to-any barrier across NP devices, executed by every thread that is launched
+/// (launch_full_barrier launches it with a single thread in a single block).
+///
+/// Steps:
+///   1. zero every flag in `iteration_flag_ptrs`,
+///   2. add 1 to the arrival counter of every other device,
+///   3. spin until the counter at `device_arrival_ptrs[device_idx]` reaches NP - 1,
+///   4. subtract NP - 1 from that counter.
+///
+/// \param device_arrival_ptrs  one arrival counter pointer per device; entries other than
+///                             `device_idx` are written by this kernel and, presumably, refer to
+///                             peer-device memory (NOTE(review): confirm against the caller).
+/// \param iteration_flag_ptrs  per-iteration flags that are reset to zero.
+/// \param device_idx           index of the calling device within `device_arrival_ptrs`.
+///
+/// The counters are updated concurrently by several devices, so system-scope atomics are used
+/// (the plain atomicAdd/atomicSub are only atomic at device scope). Requires SM60+.
+template <int NP, typename IntType, int Iterations, typename FlagType>
+__global__ void full_barrier_kernel(
+    cutlass::Array<IntType*, NP> device_arrival_ptrs,
+    cutlass::Array<FlagType*, Iterations> iteration_flag_ptrs,
+    IntType device_idx) {
+
+  arch::launch_dependent_grids();
+  arch::wait_on_dependent_grids();
+
+  // Index with a plain int: the bound is an int template parameter, not a flag value.
+  CUTLASS_PRAGMA_UNROLL
+  for (int i = 0; i < Iterations; ++i) {
+    iteration_flag_ptrs[i][0] = static_cast<FlagType>(0);
+  }
+
+  // Order the flag resets before this device's arrival becomes visible, so that a flag written
+  // after the barrier cannot be overwritten by a late reset.
+  __threadfence_system();
+
+  IntType const val = 1;
+  IntType const max_val = static_cast<IntType>(NP - 1);
+
+  // Announce arrival to every other device.
+  CUTLASS_PRAGMA_UNROLL
+  for (int d = 0; d < NP; ++d) {
+    if (static_cast<IntType>(d) != device_idx) {
+      atomicAdd_system(device_arrival_ptrs[d], val);
+    }
+  }
+
+  // Wait until all NP - 1 other devices have arrived at this device's counter.
+  IntType curr_val = 0;
+  detail::ld_without_cache(curr_val, device_arrival_ptrs[device_idx]);
+  while (curr_val < max_val) {
+    __nanosleep(40);
+    detail::ld_without_cache(curr_val, device_arrival_ptrs[device_idx]);
+  }
+
+  // Subtract instead of storing zero: arrivals that were already added on top of NP - 1
+  // are kept in the counter.
+  atomicSub_system(device_arrival_ptrs[device_idx], max_val);
+}
+
+} // namespace cutlass::distributed::kernel
+
diff --git a/include/cutlass/experimental/distributed/schedules/dist_gemm_1d_schedules.hpp b/include/cutlass/experimental/distributed/schedules/dist_gemm_1d_schedules.hpp
new file mode 100644
index 00000000..73d52adc
--- /dev/null
+++ b/include/cutlass/experimental/distributed/schedules/dist_gemm_1d_schedules.hpp
@@ -0,0 +1,324 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file 1-D Distributed GEMM Schedules
+
+  NOTE: This API is __experimental__ and will change heavily over time. Particularly the use of
+  CuTe layouts as integer functions in defining iteration-to-tile mappings is over-expressive and
+  leaves plenty of room for incorrect/unexpected behavior.
+  Please proceed with caution when modifying these schedules or defining new ones.
+
+  Device/iteration mappings are defined with CuTe layouts, 
+  since they are functions from integers to integers as well.
+  
+  Each mapping is defined as a linear function of 2 variables (rank-2 layout):
+   First variable (mode) is device index, second variable (mode) is iteration.
+   A constant is also added to the final result as an offset value. This is a temporary workaround
+   so that identity ownership mappings in the final iteration can be guaranteed for the schedules
+   currently implemented.
+  How are these mappings defined?
+    Each schedule represents a unique parallel matrix multiplication algorithm, which describes how
+    matrices/tensors are distributed among TP GPUs.
+
+    Depending on the algorithm, access patterns (GPU-to-tile or (GPU, iteration)-to-tile mappings)
+    are not necessarily going to be the identity function.
+
+  Pitfalls:
+    The current representation uses CuTe layouts as arbitrary linear functions that map
+    (GPU, iteration) to tile indices.
+    This approach is over-expressive, and therefore makes a lot of assumptions on the part of the
+    developer in how these mappings are defined. This can easily lead to incorrect implementations
+    if not handled carefully.
+
+  
+  Assumption made in all schedules: TP == number of iterations (stages)
+*/
+
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+
+#include "cutlass/experimental/distributed/schedules/dist_gemm_base_schedule.hpp"
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::schedules {
+
+// GEMM + Reduce Scatter
+// A and B are tiled along the K mode, which means each GPU gets an [M, K / TP]-shaped slice of A,
+// and an [N, K / TP] slice of B.
+// A is further tiled along the M mode, so that each stage/iteration computes a GEMM of shape
+// [M / TP, N, K / TP], and the epilogue will perform the reduction by reading its C tensor directly
+// from the left peer's previous D buffer.
+//
+// Below is an illustration of the tiling and iteration mappings for this pattern in the TP=4 case:
+//
+//   Rows correspond to the M mode, columns correspond to the K mode for A and B and N mode for 
+//   C and D.  Because sharding is done along K, each column of tiles is owned by one GPU.
+//   Values in the grid correspond to the iteration/stage accessing the tile.
+//   * means the same tile is accessed in all iterations/stages.
+//
+//         Tensor A                             Tensor B              
+//                                                                    
+//  GPU0  GPU1  GPU2  GPU3              GPU0  GPU1  GPU2  GPU3        
+// |-----|-----|-----|-----|           |-----|-----|-----|-----|      
+// |     |     |     |     |           |     |     |     |     |      
+// |  3  |  0  |  1  |  2  |           |     |     |     |     |      
+// |_____|_____|_____|_____|           |     |     |     |     |      
+// |     |     |     |     |           |     |     |     |     |      
+// |  2  |  3  |  0  |  1  |           |     |     |     |     |      
+// |_____|_____|_____|_____|           |  *  |  *  |  *  |  *  |      
+// |     |     |     |     |           |     |     |     |     |      
+// |  1  |  2  |  3  |  0  |           |     |     |     |     |      
+// |_____|_____|_____|_____|           |     |     |     |     |      
+// |     |     |     |     |           |     |     |     |     |      
+// |  0  |  1  |  2  |  3  |           |     |     |     |     |      
+// |_____|_____|_____|_____|           |_____|_____|_____|_____|      
+//                                                                    
+//                          M x K                               N x K 
+//
+//
+//              Tensor C                            Tensor D              
+//              (Peer's D)
+//                                         
+//                                                                        
+//      |-----------------------|           |-----------------------|     
+//      |                       |           |                       |     
+// GPU0 |         1,2,3         |      GPU0 |           *           |     
+//      |_______________________|           |_______________________|     
+//      |                       |           |                       |     
+// GPU1 |         1,2,3         |      GPU1 |           *           |     
+//      |_______________________|           |_______________________|     
+//      |                       |           |                       |     
+// GPU2 |         1,2,3         |      GPU2 |           *           |     
+//      |_______________________|           |_______________________|     
+//      |                       |           |                       |     
+// GPU3 |         1,2,3         |      GPU3 |           *           |     
+//      |_______________________|           |_______________________|     
+//                                                                        
+//                               M x N                               M x N
+//
+//
+//  Tensor A's access pattern can be expressed as follows as a function of GPU index and iteration:
+//    tile_idx = ((device_idx - 1) - iter + TP) % TP
+//  
+//  and can be expressed with the following CuTe layout:
+//    (TP, TP) : (1, -1)
+//  with ProcessorOffset = -1
+//
+//
+//  Note: Since this schedule does not expose any communication, iteration 0 has no reduction step,
+//  therefore epilogue is sourceless in iteration 0, and in the rest of the iterations the epilogue
+//  source is a remote pointer to Tensor D owned by its left peer.
+//
+//  Left peer is simply (device_idx - 1 + TP) % TP, which is expressed with the following CuTe layout:
+//    (TP, TP) : (1, 0)
+//
+template <class TP_>  // TP_: static CuTe integer, the number of GPUs (and of iterations)
+struct ReduceScatter1D_TilingA_RotatingC: BaseSchedule<
+    TP_,
+    /* ProcessorTiler_ = */ cute::Shape<_1, _1, TP_, _1>,  // (M, N, K, L): only K is split across GPUs
+    /* IterationTiler_ = */ cute::Shape<TP_, _1, _1, _1>,  // (M, N, K, L): only M is split across iterations
+    /* PeerDeviceMapping_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _0>>,                             // (left neighbor) = (device_idx + ProcessorOffset + TP) % TP, with ProcessorOffset = -1
+    /* IterationMappingM_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _m1>>,                            // = (device_idx + ProcessorOffset - iter + TP) % TP, with ProcessorOffset = -1
+    /* IterationMappingN_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::N == 1) = 0
+    /* IterationMappingK_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::K == 1) = 0
+    /* IterationMappingL_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::L == 1) = 0
+    /* ProcessorOffset_ = */ _m1,  // added to device_idx before every mapping above is evaluated
+    /* MemcpyA_ = */ false,  // neither A nor B is memcpied in this schedule
+    /* MemcpyB_ = */ false,
+    /* KernelWritesArrivalFlag_ = */ true,  // nothing is memcpied, so the kernel writes the arrival flags
+    /* NumBuffersA_ = */ 0,
+    /* NumBuffersB_ = */ 0,
+    /* NumBuffersC_ = */ 0,
+    /* NumBuffersD_  = */ TP_{} - 1> {};  // D is the only buffered tensor (TP - 1 buffers)
+
+// This schedule is similar to ReduceScatter1D_TilingA_RotatingC, but with the second tiling
+// done along N instead of M. All other details remain unchanged.
+template <class TP_>  // TP_: static CuTe integer, the number of GPUs (and of iterations)
+struct ReduceScatter1D_TilingB_RotatingC: BaseSchedule<
+    TP_,
+    /* ProcessorTiler_ = */ cute::Shape<_1, _1, TP_, _1>,  // (M, N, K, L): only K is split across GPUs
+    /* IterationTiler_ = */ cute::Shape<_1, TP_, _1, _1>,  // (M, N, K, L): only N is split across iterations
+    /* PeerDeviceMapping_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _0>>,                             // (left neighbor) = (device_idx + ProcessorOffset + TP) % TP, with ProcessorOffset = -1
+    /* IterationMappingM_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::M == 1) = 0
+    /* IterationMappingN_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _m1>>,                            // = (device_idx + ProcessorOffset - iter + TP) % TP, with ProcessorOffset = -1
+    /* IterationMappingK_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::K == 1) = 0
+    /* IterationMappingL_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::L == 1) = 0
+    /* ProcessorOffset_ = */ _m1,  // added to device_idx before every mapping above is evaluated
+    /* MemcpyA_ = */ false,  // neither A nor B is memcpied in this schedule
+    /* MemcpyB_ = */ false,
+    /* KernelWritesArrivalFlag_ = */ true,  // nothing is memcpied, so the kernel writes the arrival flags
+    /* NumBuffersA_ = */ 0,
+    /* NumBuffersB_ = */ 0,
+    /* NumBuffersC_ = */ 0,
+    /* NumBuffersD_  = */ TP_{} - 1> {};  // D is the only buffered tensor (TP - 1 buffers)
+
+
+// AllGather + GEMM
+// The problem is tiled along the N mode across GPUs, which means each GPU allgathers A (which has
+// no N mode), and operates with an [N / TP, K] slice of B.
+// For pipelining, A is further tiled along the M mode, so that each stage/iteration computes a
+// GEMM of shape [M / TP, N / TP, K], and concurrently we copy a peer's A slice into a local buffer
+// for the next stage/iteration.
+//
+// Below is an illustration of the tiling and iteration mappings for this pattern in the TP=4 case:
+//
+//   Rows correspond to the M mode, columns correspond to the K mode for A and B and N mode for 
+//   C and D.
+//
+//   Since this is a pipelined schedule without exposed communication, the first iteration starts
+//   off immediately and operates on local slices of A and B. In the rest of the iterations, each
+//   GPU accesses a slice of A copied from a peer GPU while it was busy with the last stage.
+//
+//   Values in the following grids correspond to the peer buffer accessed by each GPU during
+//   different iterations:
+//
+//              Tensor A                         Tensor A               
+//               iter 0                           iter 1                
+//                                                                      
+//      |-----------------------|        |-----------------------|      
+//      |                       |        |                       |      
+// GPU0 |           0           |        |           1           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU1 |           1           |        |           2           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU2 |           2           |        |           3           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU3 |           3           |        |           0           |      
+//      |_______________________|        |_______________________|      
+//                                                                      
+//                               M x K                            M x K 
+//
+//              Tensor A                         Tensor A               
+//               iter 2                           iter 3                
+//                                                                      
+//      |-----------------------|        |-----------------------|      
+//      |                       |        |                       |      
+// GPU0 |           2           |        |           3           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU1 |           3           |        |           0           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU2 |           0           |        |           1           |      
+//      |_______________________|        |_______________________|      
+//      |                       |        |                       |      
+// GPU3 |           1           |        |           2           |      
+//      |_______________________|        |_______________________|      
+//                                                                      
+//                               M x K                            M x K 
+//
+//   Values in the following grids correspond to the tile accessed during each iteration.
+//   * means the same tile is accessed in all iterations/stages.
+//
+//              Tensor B                             Tensor C/D               
+//                                                                          
+//                                                                          
+//      |-----------------------|            |-----|-----|-----|-----|      
+//      |                       |            |     |     |     |     |      
+// GPU0 |           *           |       GPU0 |  0  |  1  |  2  |  3  |      
+//      |_______________________|            |_____|_____|_____|_____|      
+//      |                       |            |     |     |     |     |      
+// GPU1 |           *           |       GPU1 |  3  |  0  |  1  |  2  |      
+//      |_______________________|            |_____|_____|_____|_____|      
+//      |                       |            |     |     |     |     |      
+// GPU2 |           *           |       GPU2 |  2  |  3  |  0  |  1  |      
+//      |_______________________|            |_____|_____|_____|_____|      
+//      |                       |            |     |     |     |     |      
+// GPU3 |           *           |       GPU3 |  1  |  2  |  3  |  0  |      
+//      |_______________________|            |_____|_____|_____|_____|      
+//                                                                          
+//                               N x K                                M x N 
+//
+//
+//  Tensor C/D's access pattern can be expressed as follows as a function of GPU index and iteration:
+//    tile_idx = (device_idx + iter) % TP
+//  
+//  and can be expressed with the following CuTe layout:
+//    (TP, TP) : (1, 1)
+//
+//  This schedule does not need a ProcessorOffset constant.
+//
+//  The peer device from which each A slice is copied is also expressed with the same function and
+//  CuTe layout.
+//
+template <class TP_>  // TP_: static CuTe integer, the number of GPUs (and of iterations)
+struct AllGather1D_TilingCD_RotatingA: BaseSchedule<
+    TP_,
+    /* ProcessorTiler_ = */ cute::Shape<_1, TP_, _1, _1>,  // (M, N, K, L): only N is split across GPUs
+    /* IterationTiler_ = */ cute::Shape<TP_, _1, _1, _1>,  // (M, N, K, L): only M is split across iterations
+    /* PeerDeviceMapping_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _1>>,                             // = (device_idx + iter) % TP; the modulo is applied by BaseSchedule
+    /* IterationMappingM_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _1>>,                             // = (device_idx + iter) % TP; the modulo is applied by BaseSchedule
+    /* IterationMappingN_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::N == 1) = 0
+    /* IterationMappingK_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::K == 1) = 0
+    /* IterationMappingL_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::L == 1) = 0
+    /* ProcessorOffset_ = */ _0,  // no device index offset is needed by this schedule
+    /* MemcpyA_ = */ true,  // A is the tensor that is memcpied
+    /* MemcpyB_ = */ false,
+    /* KernelWritesArrivalFlag_ = */ false,  // the kernel does not write arrival flags in this schedule
+    /* NumBuffersA_ = */ TP_{} - 1,  // A is the only buffered tensor (TP - 1 buffers)
+    /* NumBuffersB_ = */ 0,
+    /* NumBuffersC_ = */ 0,
+    /* NumBuffersD_ = */ 0>{};
+
+// This schedule is similar to AllGather1D_TilingCD_RotatingA, but with the order of tiling
+// swapped from N then M to M then N. This means slices of B are rotated around GPUs instead of
+// slices of A. All other details remain unchanged.
+template <class TP_>  // TP_: static CuTe integer, the number of GPUs (and of iterations)
+struct AllGather1D_TilingCD_RotatingB: BaseSchedule<
+    TP_,
+    /* ProcessorTiler_ = */ cute::Shape<TP_, _1, _1, _1>,  // (M, N, K, L): only M is split across GPUs
+    /* IterationTiler_ = */ cute::Shape<_1, TP_, _1, _1>,  // (M, N, K, L): only N is split across iterations
+    /* PeerDeviceMapping_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _1>>,                             // = (device_idx + iter) % TP; the modulo is applied by BaseSchedule
+    /* IterationMappingM_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::M == 1) = 0
+    /* IterationMappingN_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_1, _1>>,                             // = (device_idx + iter) % TP; the modulo is applied by BaseSchedule
+    /* IterationMappingK_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::K == 1) = 0
+    /* IterationMappingL_ = */ cute::Layout<cute::Shape<TP_, TP_>, cute::Stride<_0, _0>>,                             // (IterationTiler::L == 1) = 0
+    /* ProcessorOffset_ = */ _0,  // no device index offset is needed by this schedule
+    /* MemcpyA_ = */ false,
+    /* MemcpyB_ = */ true,  // B is the tensor that is memcpied
+    /* KernelWritesArrivalFlag_ = */ false,  // the kernel does not write arrival flags in this schedule
+    /* NumBuffersA_ = */ 0,
+    /* NumBuffersB_ = */ TP_{} - 1,  // B is the only buffered tensor (TP - 1 buffers)
+    /* NumBuffersC_ = */ 0,
+    /* NumBuffersD_ = */ 0>{};
+
+
+} // namespace cutlass::distributed::schedules
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/experimental/distributed/schedules/dist_gemm_base_schedule.hpp b/include/cutlass/experimental/distributed/schedules/dist_gemm_base_schedule.hpp
new file mode 100644
index 00000000..3a2d3328
--- /dev/null
+++ b/include/cutlass/experimental/distributed/schedules/dist_gemm_base_schedule.hpp
@@ -0,0 +1,538 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file Base Schedule for Distributed GEMM
+
+  Templates Distributed GEMM schedules so that they can be expressed as a set of CuTe primitives and
+  other static values.
+
+  NOTE: This API is __experimental__ and will change heavily over time. Particularly the use of
+  CuTe layouts as integer functions in defining iteration-to-tile mappings is over-expressive and
+  leaves plenty of room for incorrect/unexpected behavior.
+  Please proceed with caution when modifying these schedules or defining new ones.
+*/
+
+#pragma once
+
+#include "cute/layout.hpp"
+#include "cute/tensor.hpp"
+#include "cutlass/cutlass.h"
+
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::distributed::schedules {
+
+/*
+ * Distributed GEMM schedules define exactly how operand tensors are tiled and sliced across 
+ * processors (GPUs) and stages/iterations.
+ *
+ * BaseSchedule's role is to ease the implementation of arbitrary Distributed GEMM schedules
+ * and reduce code repetition, simply by reducing the implementation to CuTe primitives and a few
+ * other static values (buffer sizes, whether tensors are rotated using memcpies or not, and the
+ * like.)
+ */
+template <
+  class TP_,                      // CuTe constant defining the number of processors / GPUs / TP value
+  class ProcessorTiler_,          // CuTe tiler defining how fully materialized tensors are sharded across devices
+  class IterationTiler_,          // CuTe tiler defining how local tensors are tiled across stages/iterations
+  class PeerDeviceMapping_,       // CuTe layout mapping device index and stage/iteration to the device's peer index for that stage/iteration
+  class IterationMappingM_,       // CuTe layout mapping device index and stage/iteration to M tile index
+  class IterationMappingN_,       // CuTe layout mapping device index and stage/iteration to N tile index
+  class IterationMappingK_,       // CuTe layout mapping device index and stage/iteration to K tile index
+  class IterationMappingL_,       // CuTe layout mapping device index and stage/iteration to L tile index
+  class ProcessorOffset_,         // Constant offset for processor / GPU index in iteration mapping
+  bool MemcpyA_,                  // Whether tensor A is memcpied
+  bool MemcpyB_,                  // Whether tensor B is memcpied
+  bool KernelWritesArrivalFlag_,  // Whether the kernel writes arrival flags (when tensors are directly accessed from peer and not memcpied)
+  int NumBuffersA_,               // Number of buffers required for tensor A
+  int NumBuffersB_,               // Number of buffers required for tensor B
+  int NumBuffersC_,               // Number of buffers required for tensor C
+  int NumBuffersD_>               // Number of buffers required for tensor D
+struct BaseSchedule {
+
+  using TP = TP_;
+  // TP has to be a single static CuTe integer: static, integral, rank 1 and depth 0.
+  static_assert(
+      cute::is_static<TP>::value && cute::is_integral<TP>::value && cute::rank(TP{}) == 1 && cute::depth(TP{}) == 0,
+      "Only integers allowed for TP at this time.");
+
+  static_assert(cute::rank(ProcessorTiler_{}) == 4, "Expected rank-4 processor tiler.");
+  static_assert(cute::rank(IterationTiler_{}) == 4, "Expected rank-4 iteration tiler.");
+
+  static_assert(cute::rank(PeerDeviceMapping_{}) == 2,
+      "PeerDeviceMapping must be rank-2 (device_idx, iter)");
+
+  static_assert(cute::rank(IterationMappingM_{}) == 2,
+      "IterationMappingM must be rank-2 (device_idx, iter).");
+  static_assert(cute::rank(IterationMappingN_{}) == 2,
+      "IterationMappingN must be rank-2 (device_idx, iter).");
+  static_assert(cute::rank(IterationMappingK_{}) == 2,
+      "IterationMappingK must be rank-2 (device_idx, iter).");
+  static_assert(cute::rank(IterationMappingL_{}) == 2,
+      "IterationMappingL must be rank-2 (device_idx, iter).");
+
+  using ProcessorTiler = ProcessorTiler_;
+  using IterationTiler = IterationTiler_;
+
+  using PeerDeviceMapping = PeerDeviceMapping_;
+  using IterationMappingM = IterationMappingM_;
+  using IterationMappingN = IterationMappingN_;
+  using IterationMappingK = IterationMappingK_;
+  using IterationMappingL = IterationMappingL_;
+
+  using ProcessorOffset = ProcessorOffset_;
+
+  static constexpr bool KernelWritesArrivalFlag = KernelWritesArrivalFlag_;
+  static constexpr bool MemcpyA = MemcpyA_;
+  static constexpr bool MemcpyB = MemcpyB_;
+  static constexpr bool HasMemcpy = MemcpyA || MemcpyB;  // at least one of A / B is memcpied
+
+  static constexpr int NumBuffersA = NumBuffersA_;
+  static constexpr int NumBuffersB = NumBuffersB_;
+  static constexpr int NumBuffersC = NumBuffersC_;
+  static constexpr int NumBuffersD = NumBuffersD_;
+
+  // Exactly one of A/B/C/D may be buffered. The buffered tensors are counted instead of chaining
+  // `^`: XOR over four predicates only checks parity, so it would also accept three of them.
+  static_assert(
+      static_cast<int>(NumBuffersA > 0) + static_cast<int>(NumBuffersB > 0) +
+      static_cast<int>(NumBuffersC > 0) + static_cast<int>(NumBuffersD > 0) == 1,
+      "Only one of the ABCD tensors can be buffered!");
+
+  static constexpr bool BufferedOutput = NumBuffersC > 0 || NumBuffersD > 0;  // C or D is buffered
+  static constexpr bool RemoteC = NumBuffersC == 0 && NumBuffersD > 0;        // only D is buffered
+  static constexpr bool RemoteD = NumBuffersD == 0 && NumBuffersC > 0;        // only C is buffered
+
+  static_assert(not RemoteD, "Remote D is not supported yet.");
+
+  // Host-side check on the GLOBAL (unsharded) problem shape: returns true only when every mode of
+  // (M, N, K, L) is evenly divisible by the product of its processor and iteration tile counts.
+  // The shape is passed through append<4>(..., 1) first, so a missing trailing mode counts as 1.
+  template <typename ProblemShape>
+  static bool
+  can_implement_global(ProblemShape const& global_problem_shape) {
+    auto [size_m, size_n, size_k, size_l] = append<4>(global_problem_shape, 1);
+    auto [proc_m, proc_n, proc_k, proc_l] = ProcessorTiler{};
+    auto [iter_m, iter_n, iter_k, iter_l] = IterationTiler{};
+
+    bool const divides_m = size_m % (proc_m * iter_m) == 0;
+    bool const divides_n = size_n % (proc_n * iter_n) == 0;
+    bool const divides_k = size_k % (proc_k * iter_k) == 0;
+    bool const divides_l = size_l % (proc_l * iter_l) == 0;
+    return divides_m && divides_n && divides_k && divides_l;
+  }
+
+  // Shape of the GEMM that one device runs in one iteration: the global (M, N, K, L) shape divided
+  // first by ProcessorTiler and then by IterationTiler.
+  template <typename ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_local_gemm_shape(ProblemShape const& global_problem_shape) {
+    // append<4>(..., 1) lines the shape up with the rank-4 tilers.
+    auto global_mnkl = append<4>(global_problem_shape, 1);
+    // Share owned by a single device, then the share of that handled per iteration.
+    auto per_device_mnkl = shape_div(global_mnkl, ProcessorTiler{});
+    return shape_div(per_device_mnkl, IterationTiler{});
+  }
+
+  // Host-side API: (left, right) neighbours of device_idx; indices wrap around at both ends.
+  static auto
+  get_peers_for_device(int device_idx) {
+    auto const last_device = TP{} - 1;
+    auto const at_front = !(device_idx > 0), at_back = !(device_idx < last_device);
+    return cute::make_tuple(at_front ? last_device : device_idx - 1,
+                            at_back ? 0 : device_idx + 1);
+  }
+
+  // Peer device for (device_idx, iteration): PeerDeviceMapping evaluated at the offset device
+  // index, with TP added before the modulo so that a raw value as low as -TP lands in [0, TP).
+  static int
+  get_remote_peer_id(int device_idx, int iteration) {
+    // ProcessorOffset shifts the device index before the mapping sees it.
+    auto const offset_device_idx = device_idx + ProcessorOffset{};
+    auto const unwrapped_peer_idx = PeerDeviceMapping{}(offset_device_idx, iteration);
+    return (unwrapped_peer_idx + TP{}) % TP{};
+  }
+
+  // Construct tilers and index mappers for sharding across processors.
+  // A: divided by IterationTiler's (M, K, L) modes when buffered, by ProcessorTiler's otherwise.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_processor_tiler_a(Tensor tensor) {
+    auto const full_shape = tensor.shape();
+    if constexpr (NumBuffersA <= 0) {
+      return shape_div(full_shape, select<0,2,3>(ProcessorTiler{}));
+    } else {
+      return shape_div(full_shape, select<0,2,3>(IterationTiler{}));
+    }
+  }
+
+  // B: divided by IterationTiler's (N, K, L) modes when buffered, by ProcessorTiler's otherwise.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_processor_tiler_b(Tensor tensor) {
+    auto const full_shape = tensor.shape();
+    if constexpr (NumBuffersB <= 0) {
+      return shape_div(full_shape, select<1,2,3>(ProcessorTiler{}));
+    } else {
+      return shape_div(full_shape, select<1,2,3>(IterationTiler{}));
+    }
+  }
+
+  // C: divided by IterationTiler's (M, N, L) modes when the output is buffered (BufferedOutput),
+  // and by ProcessorTiler's otherwise.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_processor_tiler_c(Tensor tensor) {
+    if constexpr (not BufferedOutput) {
+      return shape_div(tensor.shape(), select<0,1,3>(ProcessorTiler{}));
+    } else {
+      return shape_div(tensor.shape(), select<0,1,3>(IterationTiler{}));
+    }
+  }
+
+  // D is tiled exactly like C, so this simply forwards to the C tiler.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_processor_tiler_d(Tensor tensor) {
+    auto tiler_shape = get_processor_tiler_c(tensor);
+    return tiler_shape;
+  }
+
+  // Construct tilers and index mappers for tiling and iterating on device.
+  // A: tensor's shape divided by IterationTiler's (M, K, L) modes; only valid when A is unbuffered.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_device_tiler_a(Tensor tensor) {
+    static_assert(NumBuffersA == 0, "Buffered tensors don't have device tilers!");
+    auto const iteration_tiler_mkl = select<0,2,3>(IterationTiler{});
+    return shape_div(tensor.shape(), iteration_tiler_mkl);
+  }
+
+  // B: tensor's shape divided by IterationTiler's (N, K, L) modes; only valid when B is unbuffered.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_device_tiler_b(Tensor tensor) {
+    static_assert(NumBuffersB == 0, "Buffered tensors don't have device tilers!");
+    auto const iteration_tiler_nkl = select<1,2,3>(IterationTiler{});
+    return shape_div(tensor.shape(), iteration_tiler_nkl);
+  }
+
+  // C: shape divided by IterationTiler's (M, N, L) modes; requires both C and D to be unbuffered.
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE static auto get_device_tiler_c(Tensor tensor) {
+    static_assert(NumBuffersC == 0 && NumBuffersD == 0, "Buffered tensors don't have device tilers!");
+    auto const iteration_tiler_mnl = select<0,1,3>(IterationTiler{});
+    return shape_div(tensor.shape(), iteration_tiler_mnl);
+  }
+
+  // D uses the same device tiler as C, under the same unbuffered-C/D requirement, so this forwards
+  // to get_device_tiler_c instead of repeating it (as get_processor_tiler_d does for its C variant).
+  template <typename Tensor>
+  CUTLASS_HOST_DEVICE
+  static auto get_device_tiler_d(Tensor tensor) {
+    return get_device_tiler_c(tensor);
+  }
+
+  // Map device index and iteration to tile coordinate. The result is fully determined by the
+  // schedule's IterationMapping* layouts and ProcessorOffset.
+  // A: (M, K, L) coordinate. Each entry is its IterationMapping evaluated at
+  // (device_idx + ProcessorOffset, iteration), plus TP, modulo TP.
+  CUTLASS_HOST_DEVICE static auto
+  get_device_tile_idx_a(int device_idx, int iteration) {
+    // The ProcessorOffset shift is computed once and shared by all three mappings.
+    auto const offset_device_idx = device_idx + ProcessorOffset{};
+    auto tile_m = (IterationMappingM{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_k = (IterationMappingK{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_l = (IterationMappingL{}(offset_device_idx, iteration) + TP{}) % TP{};
+    return make_coord(tile_m, tile_k, tile_l);
+  }
+
+  // B: (N, K, L) tile coordinate for (device_idx, iteration). Each entry is its IterationMapping
+  // evaluated at (device_idx + ProcessorOffset, iteration), plus TP, modulo TP.
+  CUTLASS_HOST_DEVICE static auto
+  get_device_tile_idx_b(int device_idx, int iteration) {
+    // The ProcessorOffset shift is computed once and shared by all three mappings.
+    auto const offset_device_idx = device_idx + ProcessorOffset{};
+    auto tile_n = (IterationMappingN{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_k = (IterationMappingK{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_l = (IterationMappingL{}(offset_device_idx, iteration) + TP{}) % TP{};
+    return make_coord(tile_n, tile_k, tile_l);
+  }
+
+  // C: (M, N, L) tile coordinate for (device_idx, iteration). Each entry is its IterationMapping
+  // evaluated at (device_idx + ProcessorOffset, iteration), plus TP, modulo TP.
+  CUTLASS_HOST_DEVICE static auto
+  get_device_tile_idx_c(int device_idx, int iteration) {
+    // The ProcessorOffset shift is computed once and shared by all three mappings.
+    auto const offset_device_idx = device_idx + ProcessorOffset{};
+    auto tile_m = (IterationMappingM{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_n = (IterationMappingN{}(offset_device_idx, iteration) + TP{}) % TP{};
+    auto tile_l = (IterationMappingL{}(offset_device_idx, iteration) + TP{}) % TP{};
+    return make_coord(tile_m, tile_n, tile_l);
+  }
+
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_device_tile_idx_d(int device_idx, int iteration) {  // returns the (m, n, l) tile coordinate for D
+    auto mapping_m = IterationMappingM{};
+    auto mapping_n = IterationMappingN{};
+    auto mapping_l = IterationMappingL{};
+    auto crd_m = (mapping_m(device_idx + ProcessorOffset{}, iteration) + TP{}) % TP{};  // TP{} added before the modulo;
+    auto crd_n = (mapping_n(device_idx + ProcessorOffset{}, iteration) + TP{}) % TP{};  // presumably wraps negative mapping
+    auto crd_l = (mapping_l(device_idx + ProcessorOffset{}, iteration) + TP{}) % TP{};  // results into [0, TP) - confirm
+    return make_coord(crd_m, crd_n, crd_l);
+  }
+
+  // Device Partitioners: partition non-buffered processor-resident operands.
+  // Processor-resident operands fall into two categories: buffered, and not buffered.
+  // Buffered operands are returned whole (not partitioned further); unbuffered ones are tiled per iteration.
+  template <typename Tensor>
+  static auto
+  get_tensor_A(Tensor original_tensor, void * tensor_buffer_ptr, int device_idx, int iteration) {
+    static_assert(rank(original_tensor) == 3);  // operand must be a rank-3 tensor
+
+    using Element = typename Tensor::value_type;
+    // Recreate tensor without constness so every return below deduces the same `auto` type.
+    Element* ptr = const_cast<Element*>(original_tensor.data());
+    auto shape = original_tensor.shape();
+    auto layout = original_tensor.layout();
+    auto tensor = make_tensor(ptr, layout);
+
+    if constexpr (NumBuffersA  == 0) {  // unbuffered: return the tile selected for (device_idx, iteration)
+      auto tiler = get_device_tiler_a(tensor);
+      auto idx = get_device_tile_idx_a(device_idx, iteration);
+      return inner_partition(tensor, tiler, idx);
+    } else {  // buffered: device_idx is not used on this path
+      Element* ptr_buffer = reinterpret_cast<Element*>(tensor_buffer_ptr);  // buffer viewed as Element storage
+      if (iteration == 0) {  // iteration 0 reads the original tensor itself
+        return tensor;
+      }
+      ptr_buffer += size(shape) * (iteration - 1);  // iteration i >= 1 uses slot i - 1, size(shape) elements apart
+
+      return make_tensor(ptr_buffer, layout);  // same layout as the original, backed by the buffer slot
+    }
+  }
+
+  template <typename Tensor>
+  static auto
+  get_tensor_B(Tensor original_tensor, void * tensor_buffer_ptr, int device_idx, int iteration) {
+    static_assert(rank(original_tensor) == 3);  // operand must be a rank-3 tensor
+
+    using Element = typename Tensor::value_type;
+    // Recreate tensor without constness so every return below deduces the same `auto` type.
+    Element * ptr = const_cast<Element *>(original_tensor.data());
+    auto shape = original_tensor.shape();
+    auto layout = original_tensor.layout();
+    auto tensor = make_tensor(ptr, layout);
+
+    if constexpr (NumBuffersB  == 0) {  // unbuffered: return the tile selected for (device_idx, iteration)
+      auto tiler = get_device_tiler_b(tensor);
+      auto idx = get_device_tile_idx_b(device_idx, iteration);
+      return inner_partition(tensor, tiler, idx);
+    } else {  // buffered: device_idx is not used on this path
+      Element * ptr_buffer = reinterpret_cast<Element *>(tensor_buffer_ptr);  // buffer viewed as Element storage
+      if (iteration == 0) {  // iteration 0 reads the original tensor itself
+        return tensor;
+      }
+      ptr_buffer += size(shape) * (iteration - 1);  // iteration i >= 1 uses slot i - 1, size(shape) elements apart
+
+      return make_tensor(ptr_buffer, layout);  // same layout as the original, backed by the buffer slot
+    }
+  }
+
+  template <typename Tensor>
+  static auto
+  get_tensor_C(Tensor original_tensor, void * tensor_buffer_ptr, int device_idx, int iteration) {
+    static_assert(rank(original_tensor) == 3);  // operand must be a rank-3 tensor
+
+    using Element = typename Tensor::value_type;
+    // Recreate tensor without constness so every return below deduces the same `auto` type.
+    Element * ptr = const_cast<Element *>(original_tensor.data());
+    auto shape = original_tensor.shape();
+    auto layout = original_tensor.layout();
+    auto tensor = make_tensor(ptr, layout);
+
+    if constexpr (not BufferedOutput) {  // unbuffered: return the tile selected for (device_idx, iteration)
+      auto tiler = get_device_tiler_c(tensor);
+      auto idx = get_device_tile_idx_c(device_idx, iteration);
+      return inner_partition(tensor, tiler, idx);
+    } else {
+      // Buffered path: only compiles when RemoteC is set (enforced by the assert below).
+      static_assert(RemoteC, "");
+
+      Element * ptr_buffer = reinterpret_cast<Element *>(tensor_buffer_ptr);  // buffer viewed as Element storage
+      if (iteration == 0) {  // iteration 0 reads the original tensor itself
+        return tensor;
+      }
+      ptr_buffer += size(shape) * (iteration - 1);  // iteration i >= 1 uses slot i - 1, size(shape) elements apart
+
+      return make_tensor(ptr_buffer, layout);  // same layout as the original, backed by the buffer slot
+    }
+  }
+
+  template <typename Tensor>
+  static auto
+  get_tensor_D(Tensor original_tensor, void * tensor_buffer_ptr, int device_idx, int iteration) {
+    static_assert(rank(original_tensor) == 3);  // operand must be a rank-3 tensor
+
+    using Element = typename Tensor::value_type;
+    // Recreate tensor without constness so every return below deduces the same `auto` type.
+    Element * ptr = const_cast<Element *>(original_tensor.data());
+    auto shape = original_tensor.shape();
+    auto layout = original_tensor.layout();
+    auto tensor = make_tensor(ptr, layout);
+
+    if constexpr (not BufferedOutput) {  // unbuffered: return the tile selected for (device_idx, iteration)
+      auto tiler = get_device_tiler_d(tensor);
+      auto idx = get_device_tile_idx_d(device_idx, iteration);
+      return inner_partition(tensor, tiler, idx);
+    } else {
+      // Buffered path, gated on RemoteC. NOTE(review): confirm RemoteC (not a D-specific flag) is intended here.
+      static_assert(RemoteC, "");
+
+      Element * ptr_buffer = reinterpret_cast<Element *>(tensor_buffer_ptr);  // buffer viewed as Element storage
+      // last iteration is the local tensor, the rest are buffers
+      if (iteration == TP{} - 1) {
+        return tensor;
+      }
+      ptr_buffer += size(shape) * iteration; // note: iteration, not iteration - 1
+
+      return make_tensor(ptr_buffer, layout);  // same layout as the original, backed by the buffer slot
+    }
+  }
+
+  template <typename ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_local_a_shape(ProblemShape problem_shape) {  // Shape of A derived from the problem shape and the tilers.
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // extend to rank 4 (M, N, K, L), filling with 1
+    if constexpr (NumBuffersA == 0) {  // unbuffered: (M, K, L) divided by ProcessorTiler only
+      return shape_div(
+            select<0,2,3>(problem_shape_MNKL),
+            select<0,2,3>(ProcessorTiler{}));
+    } else {  // buffered: divided by ProcessorTiler, then by IterationTiler
+      return shape_div(
+          shape_div(
+            select<0,2,3>(problem_shape_MNKL),
+            select<0,2,3>(ProcessorTiler{})),
+          select<0,2,3>(IterationTiler{}));
+    }
+  }
+
+  template <typename ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_local_b_shape(ProblemShape problem_shape) {  // Shape of B derived from the problem shape and the tilers.
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // extend to rank 4 (M, N, K, L), filling with 1
+    if constexpr (NumBuffersB == 0) {  // unbuffered: (N, K, L) divided by ProcessorTiler only
+      return shape_div(
+            select<1,2,3>(problem_shape_MNKL),
+            select<1,2,3>(ProcessorTiler{}));
+    } else {  // buffered: divided by ProcessorTiler, then by IterationTiler
+      return shape_div(
+          shape_div(
+            select<1,2,3>(problem_shape_MNKL),
+            select<1,2,3>(ProcessorTiler{})),
+          select<1,2,3>(IterationTiler{}));
+    }
+  }
+
+  template <typename ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_local_c_shape(ProblemShape problem_shape) {  // Shape of C derived from the problem shape and the tilers.
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // extend to rank 4 (M, N, K, L), filling with 1
+    if constexpr (not BufferedOutput) {  // unbuffered: (M, N, L) divided by ProcessorTiler only
+      return shape_div(
+            select<0,1,3>(problem_shape_MNKL),
+            select<0,1,3>(ProcessorTiler{}));
+    } else {  // buffered: divided by ProcessorTiler, then by IterationTiler
+      return shape_div(
+          shape_div(
+            select<0,1,3>(problem_shape_MNKL),
+            select<0,1,3>(ProcessorTiler{})),
+          select<0,1,3>(IterationTiler{}));
+    }
+  }
+
+  template <typename ProblemShape>
+  CUTLASS_HOST_DEVICE
+  static auto
+  get_local_d_shape(ProblemShape problem_shape) {  // Shape of D derived from the problem shape and the tilers.
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);  // extend to rank 4 (M, N, K, L), filling with 1
+    if constexpr (not BufferedOutput) {  // unbuffered: (M, N, L) divided by ProcessorTiler only
+      return shape_div(
+            select<0,1,3>(problem_shape_MNKL),
+            select<0,1,3>(ProcessorTiler{}));
+    } else {  // buffered: divided by ProcessorTiler, then by IterationTiler
+      return shape_div(
+          shape_div(
+            select<0,1,3>(problem_shape_MNKL),
+            select<0,1,3>(ProcessorTiler{})),
+          select<0,1,3>(IterationTiler{}));
+    }
+  }
+
+  // Host-side APIs: get_device_slice_{A,B,C,D}
+  // Slice off a view of the GLOBAL tensor that corresponds to the shard that
+  // is going to be owned by a specific device. This helps with the initial
+  // distribution of the GLOBAL operands among devices.
+  template <typename Tensor>
+  static auto
+  get_device_slice_A(Tensor tensor, int device_idx) {
+    auto tiler = get_processor_tiler_a(tensor);  // tiler used to split the global A across devices
+    return inner_partition(tensor, tiler, device_idx);  // the tile of A selected by device_idx
+  }
+
+  template <typename Tensor>
+  static auto
+  get_device_slice_B(Tensor tensor, int device_idx) {
+    auto tiler = get_processor_tiler_b(tensor);  // tiler used to split the global B across devices
+    return inner_partition(tensor, tiler, device_idx);  // the tile of B selected by device_idx
+  }
+
+  template <typename Tensor>
+  static auto
+  get_device_slice_C(Tensor tensor, int device_idx) {
+    auto tiler = get_processor_tiler_c(tensor);  // tiler used to split the global C across devices
+    return inner_partition(tensor, tiler, device_idx);  // the tile of C selected by device_idx
+  }
+
+  template <typename Tensor>
+  static auto
+  get_device_slice_D(Tensor tensor, int device_idx) {
+    auto tiler = get_processor_tiler_d(tensor);  // tiler used to split the global D across devices
+    return inner_partition(tensor, tiler, device_idx);  // the tile of D selected by device_idx
+  }
+};
+
+
+
+} // namespace cutlass::gemm::distributed
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h
index 4ca8e113..279c3aa6 100644
--- a/include/cutlass/fast_math.h
+++ b/include/cutlass/fast_math.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -38,7 +38,9 @@
 #include <cmath>
 #include <type_traits>
 #endif
+#if !defined(__QNX__)
 #include <cuda/std/utility>
+#endif
 #include "cutlass/cutlass.h"
 #include "cutlass/array.h"
 #include "cutlass/uint128.h"
@@ -53,8 +55,16 @@
 namespace cutlass {
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
-
+#if !defined(__QNX__)
 using ::cuda::std::swap;
+#else
+template <typename T>  // Fallback swap used when ::cuda::std::swap is unavailable (__QNX__ builds).
+CUTLASS_HOST_DEVICE void swap(T &lhs, T &rhs) {  // Exchanges the values of lhs and rhs.
+  T tmp = static_cast<T &&>(lhs);  // move rather than copy (spelled out so no extra header is needed)
+  lhs = static_cast<T &&>(rhs);    // copy-only types still work: the rvalue binds to their copy operations
+  rhs = static_cast<T &&>(tmp);
+}
+#endif
 
 /******************************************************************************
  * Static math utilities
diff --git a/include/cutlass/float8.h b/include/cutlass/float8.h
index cfb6b8bb..2f462286 100644
--- a/include/cutlass/float8.h
+++ b/include/cutlass/float8.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/floating_point_nvrtc.h b/include/cutlass/floating_point_nvrtc.h
index c08396aa..6496fea0 100644
--- a/include/cutlass/floating_point_nvrtc.h
+++ b/include/cutlass/floating_point_nvrtc.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h
index 3c4d5c76..52a4d142 100644
--- a/include/cutlass/functional.h
+++ b/include/cutlass/functional.h
@@ -1,5 +1,5 @@
   /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/builders/sm90_common.inl b/include/cutlass/gemm/collective/builders/sm90_common.inl
index 8d95967f..3a7bb842 100644
--- a/include/cutlass/gemm/collective/builders/sm90_common.inl
+++ b/include/cutlass/gemm/collective/builders/sm90_common.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -182,7 +182,7 @@ rs_smem_selector() {
   static_assert(BLK_MN0 % 8 == 0, "BLK_MN0 must be a multiple of 8.");
   static_assert(BLK_K0 % 8 == 0,  "BLK_K0 must be a multiple of 8.");
   if constexpr (major == GMMA::Major::MN) {
-    if constexpr (sizeof(ElementType) == 4){
+    if constexpr (sizeof(ElementType) % 4 == 0) { // Whole-word types
       if constexpr (is_ws_transposed_B) {
         // only optimized transpositionB(SW32 and SW128 for tf32) can be used, but prefer SW32 due to free bank conflict
         if constexpr (BLK_MN0 % size<0>(GMMA::Layout_MN_SW32_Atom<ElementType>{}) == 0) {
diff --git a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
index b93cf306..68eba52e 100644
--- a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
+++ b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -456,6 +456,7 @@ public:
   static_assert(SmemAlignment == static_cast<int>(cute::max(CollectiveOp::SmemAlignmentA, CollectiveOp::SmemAlignmentB)));
 };
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 // GMMA_TMA_WS_FP8_FAST_ACCUM_SS
diff --git a/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl b/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
index f9aa7bab..744402e5 100644
--- a/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
+++ b/include/cutlass/gemm/collective/builders/sm90_sparse_config.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
index 9b608fe0..541b45ea 100644
--- a/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
+++ b/include/cutlass/gemm/collective/builders/sm90_sparse_gmma_builder.inl
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -124,7 +124,6 @@ struct CollectiveBuilder<
                 "Should meet TMA alignment requirement\n");
 
   static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-  static_assert(!IsFP8Input, "FP8 sparse collective currently only supports FastAccum schedules");
 
   // For fp32 types, map to tf32 MMA value type
   using ElementAMmaRaw = cute::conditional_t<cute::is_same_v<ElementA, float>, tfloat32_t, ElementA>;
@@ -160,7 +159,10 @@ struct CollectiveBuilder<
 
   static constexpr int PipelineStages = detail::compute_stage_count_or_override_sparse<detail::sm90_smem_capacity_bytes,
       ElementAMma, ElementBMma, ElementEMma, TileShape_MNK>(StageCountType{});
-  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>;
+
+  using DispatchPolicy = cute::conditional_t<IsFP8Input,
+      MainloopSm90TmaGmmaWarpSpecializedSparseFP8<PipelineStages, ClusterShape_MNK, KernelScheduleType>,
+      MainloopSm90TmaGmmaWarpSpecializedSparse<PipelineStages, ClusterShape_MNK, KernelScheduleType>>;
 
   using SmemCopyAtomA = void; 
   using SmemCopyAtomB = void; 
@@ -354,15 +356,14 @@ struct CollectiveBuilder<
   static_assert(cutlass::detail::dependent_false<ElementA>, "Unsupported Toolkit for SM90 Collective Builder\n");
 #endif
 
-  static constexpr bool IsFP8Input = detail::is_input_fp8<ElementA, ElementB>();
-
+#if ((__CUDACC_VER_MAJOR__ > 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ >= 1)))
+  // Persistent schedules perform best for CUDA Toolkits with version >= 12.1
+  // KernelTmaWarpSpecializedCooperative requires TileShape_M to be at least 128
   using KernelSchedule = cute::conditional_t<size<0>(TileShape_MNK{}) == Int<64>{},
-                                             cute::conditional_t<IsFP8Input,
-                                                                 KernelTmaWarpSpecializedPingpongFP8FastAccum,
-                                                                 KernelTmaWarpSpecializedPingpong>,
-                                             cute::conditional_t<IsFP8Input,
-                                                                 KernelTmaWarpSpecializedCooperativeFP8FastAccum,
-                                                                 KernelTmaWarpSpecializedCooperative>>;
+      KernelTmaWarpSpecializedPingpong, KernelTmaWarpSpecializedCooperative>;
+#else
+  using KernelSchedule = KernelTmaWarpSpecialized;
+#endif
 
   using CollectiveOp = typename CollectiveBuilder<
       arch::Sm90,
diff --git a/include/cutlass/gemm/collective/collective_builder.hpp b/include/cutlass/gemm/collective/collective_builder.hpp
index ccd8d8b3..6ec4daca 100644
--- a/include/cutlass/gemm/collective/collective_builder.hpp
+++ b/include/cutlass/gemm/collective/collective_builder.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/collective_builder_decl.hpp b/include/cutlass/gemm/collective/collective_builder_decl.hpp
index 674e09fc..aae73348 100644
--- a/include/cutlass/gemm/collective/collective_builder_decl.hpp
+++ b/include/cutlass/gemm/collective/collective_builder_decl.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -97,3 +97,4 @@ struct CollectiveBuilder {
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
 } // namespace cutlass::gemm::collective
+
diff --git a/include/cutlass/gemm/collective/collective_mma.hpp b/include/cutlass/gemm/collective/collective_mma.hpp
index cb34a5b9..9d8a1ba2 100644
--- a/include/cutlass/gemm/collective/collective_mma.hpp
+++ b/include/cutlass/gemm/collective/collective_mma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -44,8 +44,8 @@
 #include "cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp" 
 #include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp"
 #include "cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp"
+#include "cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized_fp8.hpp"
 #include "cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp"
 #include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp"
 #include "cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp"
-
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/collective/collective_mma_decl.hpp b/include/cutlass/gemm/collective/collective_mma_decl.hpp
index feef5496..a2faa1ff 100644
--- a/include/cutlass/gemm/collective/collective_mma_decl.hpp
+++ b/include/cutlass/gemm/collective/collective_mma_decl.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/fp8_accumulation.hpp b/include/cutlass/gemm/collective/fp8_accumulation.hpp
index 6db236a4..bca742c3 100644
--- a/include/cutlass/gemm/collective/fp8_accumulation.hpp
+++ b/include/cutlass/gemm/collective/fp8_accumulation.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -37,7 +37,7 @@
 //////////////////////////////////////////////////////////////////////////////
 ///////////////////////////////////FP8 Accumulation///////////////////////////
 //////////////////////////////////////////////////////////////////////////////
-/// This calss provides API to promote (add) or scale (multiply_add) the results 
+/// This class provides API to promote (add) or scale (multiply_add) the results 
 /// from the tensor core accumulators to the main accumulators when the number 
 /// of MMAs reaches the max number of MMA interval specified by user, after that
 /// the tensor core accumulators are zeroed.
diff --git a/include/cutlass/gemm/collective/sm70_mma_twostage.hpp b/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
index 3d9e03ed..c2eda0ab 100644
--- a/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
+++ b/include/cutlass/gemm/collective/sm70_mma_twostage.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm80_mma_multistage.hpp b/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
index a129b56e..e8de7c31 100644
--- a/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
+++ b/include/cutlass/gemm/collective/sm80_mma_multistage.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp
index ed223a56..ad1e0525 100644
--- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
index 5264aa4c..789a3cb1 100644
--- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
index 69b31fda..fd13c409 100644
--- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_rs_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
index e336bd47..fd02d043 100644
--- a/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_multistage_gmma_ss_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
index bc9ee563..898fe828 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
index 0bacd3b5..acc183d0 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized_mixed_input.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -217,7 +217,7 @@ public:
   using PipelineParams = typename MainloopPipeline::Params;
 
   // One threads per CTA are producers (1 for operand tile)
-  static constexpr int NumProducerThreadEvents = 1; 
+  static constexpr int NumProducerThreadEvents = 1;
 
   using SmemLayoutAtomScale = Layout<Shape<decltype(cute::shape<0>(SwappedSmemLayoutAtomA{})), cute::Int<1>>>;
   using ScaleTileShape = decltype(make_shape(shape<0>(TileShape{}), shape<1>(SmemLayoutAtomScale{})));
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
index daaed621..324b397b 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
index ef5fec66..dbde656e 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -111,7 +111,9 @@ struct CollectiveMma<
   using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
   using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
   using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
   using PipelineParams = typename MainloopPipeline::Params;
+
   // One threads per CTA are producers (1 for operand tile)
   static constexpr int NumProducerThreadEvents = 1; 
 
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
index 1ce00687..e29bd60d 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -113,10 +113,11 @@ struct CollectiveMma<
   using CtaShape_MNK = decltype(shape_div(TileShape{}, ClusterShape{}));
   using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
   using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
   using PipelineParams = typename MainloopPipeline::Params;
 
   // One threads per CTA are producers (1 for operand tile)
-  static constexpr int NumProducerThreadEvents = 1; 
+  static constexpr int NumProducerThreadEvents = 1;
 
   static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)");
   static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
index ceed71d8..8ba64b28 100644
--- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -135,10 +135,10 @@ struct CollectiveMma<
       SmemLayoutAtomB{},
       make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
       cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-  
+
   // Block scaling gmem-to-smem copy atom 
   using SmemBlockScalingCopyAtom = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ElementBlockScale>, ElementBlockScale>;
-  
+
   // Block scaling smem layout
   using SmemLayoutScaleA = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>;
   using SmemLayoutScaleB = Layout<Shape<Int<DispatchPolicy::Stages>>, Stride<_1>>;
@@ -261,7 +261,7 @@ struct CollectiveMma<
     constexpr int tma_alignment_bits = 128;
     auto problem_shape_MNKL = append<4>(problem_shape, 1);
     auto [M,N,K,L] = problem_shape_MNKL;
-    
+
     bool implementable = true;
     constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
     implementable = implementable && cutlass::detail::check_alignment<min_tma_aligned_elements_A>(cute::make_shape(M,K,L), StrideA{});
@@ -381,13 +381,13 @@ struct CollectiveMma<
 
       Tensor gScaleA = mScaleA_mkl(m_coord,_,l_coord);                                           // (1,k,1)
       Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord);                                           // (1,k,1)
-      
+
       TiledCopy scale_copy = make_tiled_copy(SmemBlockScalingCopyAtom{}, Layout<Shape<_1>>{}, Layout<Shape<_1>>{}); // (1,1,1)
       ThrCopy thr_scale_copy = scale_copy.get_slice(threadIdx.x);
-      
+
       Tensor tAgA_ScaleA = thr_scale_copy.partition_S(gScaleA);
       Tensor tAsA_ScaleA = thr_scale_copy.partition_D(sScaleA);
-      
+
       Tensor tBgB_ScaleB = thr_scale_copy.partition_S(gScaleB);
       Tensor tBsB_ScaleB = thr_scale_copy.partition_D(sScaleB);
 
@@ -491,7 +491,7 @@ struct CollectiveMma<
 
     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-    
+
     // Block scaling
     Tensor sScaleA = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), SmemLayoutScaleA{}); // (k)
     Tensor sScaleB = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_B.data()), SmemLayoutScaleB{}); // (k)
@@ -499,7 +499,7 @@ struct CollectiveMma<
     //
     // Define C accumulators and A/B partitioning
     //
-    
+
     // Layout of warp group to thread mapping
 
     static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
@@ -539,7 +539,7 @@ struct CollectiveMma<
 
     // We release buffers to producer warps(dma load) with some mmas in flight
     PipelineState smem_pipe_release = smem_pipe_read;
-    
+
     // Per block scale values for operand A and B
     ElementBlockScale scale_a;
     ElementBlockScale scale_b;
@@ -564,7 +564,7 @@ struct CollectiveMma<
       }
 
       int read_stage = smem_pipe_read.index();
-      
+
       // Load per block scale values from shared memory to registers.
       scale_a = sScaleA[read_stage];
       scale_b = sScaleB[read_stage];
@@ -636,7 +636,7 @@ struct CollectiveMma<
       ++smem_pipe_read;
       ++smem_pipe_release;
     }
-    
+
     accumulation.scale_residue_if_needed(scale);
 
     warpgroup_fence_operand(accumulation());
diff --git a/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
index 06ba20f2..d4f58c66 100644
--- a/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
+++ b/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -215,11 +215,15 @@ struct CollectiveMma<
   static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
   static constexpr int K_PIPE_MMAS = 0;
 
-  static constexpr uint32_t TmaTransactionBytes =
+  static constexpr uint32_t TmaTransactionBytesMK = 
         cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutA{})) * cute::sizeof_bits_v<ElementAMma>) +
-        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutE{})) * cute::sizeof_bits_v<ElementEMma>) +
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutE{})) * cute::sizeof_bits_v<ElementEMma>);
+
+  static constexpr uint32_t TmaTransactionBytesNK =
         cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutB{})) * cute::sizeof_bits_v<ElementBMma>);
 
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
   // Host side kernel arguments
   struct Arguments {
     ElementA const* ptr_A{};
@@ -233,26 +237,26 @@ struct CollectiveMma<
   // Device side kernel params
   struct Params {
 
-    using TMA_A = decltype(make_tma_copy<typename TmaInternalElementA::raw_type>(
+    using TMA_A = decltype(make_tma_copy_A_sm90<typename TmaInternalElementA::raw_type>(
         GmemTiledCopyA{},
         make_tensor(recast_ptr<TmaInternalElementA>(nullptr), LayoutA{}),
         SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+        TileShape{},
+        ClusterShape{}));  // mcast along N mode for this M load, if any
 
-    using TMA_E = decltype(make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
+    using TMA_E = decltype(make_tma_copy_A_sm90<uint64_t>( // use uint64_t to get the largest loading box.
         GmemCopyAtomE{},
-        make_tensor(recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(nullptr), LayoutE{}),
+        make_tensor(recast_ptr<ElementEMma>(nullptr), LayoutE{}),
         SmemLayoutE{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})));  // mcast along N mode for this M load, if any
+        TileShape{},
+        ClusterShape{}));  // mcast along N mode for this M load, if any
 
-    using TMA_B = decltype(make_tma_copy<TmaInternalElementB>(
+    using TMA_B = decltype(make_tma_copy_B_sm90<TmaInternalElementB>(
         GmemTiledCopyB{},
-        make_tensor(static_cast<TmaInternalElementB const*>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        make_tensor(recast_ptr<TmaInternalElementB>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
         SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+        TileShape{},
+        ClusterShape{}));  // mcast along M mode for this N load, if any
 
     TMA_A tma_load_a;
     TMA_E tma_load_e;
@@ -276,40 +280,45 @@ struct CollectiveMma<
     auto [M,N,K,L] = problem_shape_MNKL;
 
     auto ptr_A = recast_ptr<TmaInternalElementA>(args.ptr_A);
+    auto ptr_E = recast_ptr<ElementEMma>(args.ptr_E);
     auto ptr_B = recast_ptr<TmaInternalElementB>(args.ptr_B);
-    auto ptr_E = recast_ptr<sparse_elem<ElementEMmaSparsity, ElementE>>(args.ptr_E);
 
     Tensor tensor_a = make_tensor(ptr_A, args.layout_a);
-    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
     Tensor tensor_e = make_tensor(ptr_E, args.layout_e);
+    Tensor tensor_b = make_tensor(ptr_B, make_layout(make_shape(N,K,L), args.dB));
 
-    typename Params::TMA_A tma_load_a = make_tma_copy<typename TmaInternalElementA::raw_type>(
+    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90<typename TmaInternalElementA::raw_type>(
         GmemTiledCopyA{},
         tensor_a,
         SmemLayoutA{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+        TileShape{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
 
-    typename Params::TMA_E tma_load_e = make_tma_copy<uint64_t>( // use uint64_t to get the largest loading box.
+    typename Params::TMA_E tma_load_e = make_tma_copy_A_sm90<uint64_t>( // use uint64_t to get the largest loading box.
         GmemCopyAtomE{},
         tensor_e,
         SmemLayoutE{}(_,_,cute::Int<0>{}),
-        make_shape(shape<0>(TileShape{}), shape<2>(TileShape{})),
-        size<1>(ClusterShape{})); // mcast along N mode for this M load, if any
+        TileShape{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
 
-    typename Params::TMA_B tma_load_b = make_tma_copy<TmaInternalElementB>(
+    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90<TmaInternalElementB>(
         GmemTiledCopyB{},
         tensor_b,
         SmemLayoutB{}(_,_,cute::Int<0>{}),
-        make_shape(shape<1>(TileShape{}), shape<2>(TileShape{})),
-        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+        TileShape{},
+        ClusterShape{}); // mcast along M mode for this N load, if any
+
+    uint32_t transaction_bytes_mk = TmaTransactionBytesMK;
+    uint32_t transaction_bytes_nk = TmaTransactionBytesNK;
+    uint32_t transaction_bytes = transaction_bytes_mk + transaction_bytes_nk;
 
     return {
       tma_load_a,
       tma_load_e,
       tma_load_b,
       args.layout_a,
-      args.layout_e
+      args.layout_e,
+      transaction_bytes
     };
   }
 
@@ -505,16 +514,29 @@ struct CollectiveMma<
 
     Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
     Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
-
-    Tensor sE_ = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});         // (BLK_M,BLK_K,PIPE)
-    Tensor sE = as_position_independent_swizzle_tensor(sE_);
+    Tensor sE = as_position_independent_swizzle_tensor(
+      make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{}));                   // (BLK_M,BLK_K,PIPE)
 
     //
     // Define C accumulators and A/B partitioning
     //
 
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
+
     TiledMma tiled_mma;
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+    auto thread_mma = tiled_mma.get_thread_slice(warp_group_thread_layout(warp_group_idx));
 
     Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
     Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
diff --git a/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized_fp8.hpp b/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized_fp8.hpp
new file mode 100644
index 00000000..0a57d31e
--- /dev/null
+++ b/include/cutlass/gemm/collective/sm90_sparse_mma_tma_gmma_ss_warpspecialized_fp8.hpp
@@ -0,0 +1,775 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/collective/builders/sm90_sparse_config.inl"
+#include "cutlass/gemm/collective/fp8_accumulation.hpp"
+#include "cutlass/gemm/dispatch_policy.hpp"
+#include "cutlass/numeric_types.h"
+#include "cutlass/pipeline/pipeline.hpp"
+#include "cutlass/trace.h"
+
+#include "cute/arch/cluster_sm90.hpp"
+#include "cute/arch/copy_sm90.hpp"
+#include "cute/algorithm/functional.hpp"
+#include "cute/atom/mma_atom.hpp"
+#include "cute/algorithm/gemm.hpp"
+#include "cute/tensor_predicate.hpp"
+#include "cute/numeric/arithmetic_tuple.hpp"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass::gemm::collective {
+using namespace cute;
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// WarpSpecialized Mainloop
+template <
+  int Stages,
+  class ClusterShape,
+  class KernelSchedule,
+  class TileShape_,
+  class ElementA_,
+  class LayoutPairAE_,
+  class ElementB_,
+  class StrideB_,
+  class TiledMma_,
+  class GmemTiledCopyA_,
+  class SmemLayoutAtomA_,
+  class SmemCopyAtomA_,
+  class TransformA_,
+  class GmemTiledCopyB_,
+  class SmemLayoutAtomB_,
+  class SmemCopyAtomB_,
+  class TransformB_>
+struct CollectiveMma<
+    MainloopSm90TmaGmmaWarpSpecializedSparseFP8<Stages, ClusterShape, KernelSchedule>,
+    TileShape_,
+    ElementA_,
+    LayoutPairAE_,
+    ElementB_,
+    StrideB_,
+    TiledMma_,
+    GmemTiledCopyA_,
+    SmemLayoutAtomA_,
+    SmemCopyAtomA_,
+    TransformA_,
+    GmemTiledCopyB_,
+    SmemLayoutAtomB_,
+    SmemCopyAtomB_,
+    TransformB_>
+{
+  //
+  // Type Aliases
+  //
+  using DispatchPolicy = MainloopSm90TmaGmmaWarpSpecializedSparseFP8<Stages, ClusterShape, KernelSchedule>;
+  using TileShape = TileShape_;
+  using TiledMma = TiledMma_;
+  using ElementA = ElementA_;
+  using ElementAMma = typename TiledMma::ValTypeA;
+  using ElementAMmaRaw = typename ElementAMma::raw_type;
+  using LayoutPairAE = LayoutPairAE_;
+  using LayoutA = remove_cvref_t<decltype(get<0>(LayoutPairAE{}))>;
+  using LayoutE = remove_cvref_t<decltype(get<1>(LayoutPairAE{}))>;
+  using StrideA = decltype(cute::stride(LayoutA{}));
+  using ElementB = ElementB_;
+  using ElementBMma = typename TiledMma::ValTypeB;
+  using StrideB = StrideB_;
+  using ElementEMma = typename TiledMma::ValTypeE;
+  using ElementE = typename ElementEMma::raw_type;
+  using ElementAccumulator = typename TiledMma::ValTypeC;
+  using GmemTiledCopyA = GmemTiledCopyA_;
+  using GmemTiledCopyB = GmemTiledCopyB_;
+  using SmemLayoutAtomA = SmemLayoutAtomA_;
+  using SmemLayoutAtomB = SmemLayoutAtomB_;
+  using SmemCopyAtomA = SmemCopyAtomA_;
+  using SmemCopyAtomB = SmemCopyAtomB_;
+  using TransformA = TransformA_;
+  using TransformB = TransformB_;
+  using ArchTag = typename DispatchPolicy::ArchTag;
+  using ArrayElementA = ElementA;
+  using ArrayElementB = ElementB;
+
+  static_assert(is_sparse<ElementAMma>::value, "ElementAMma is sparse");
+  static_assert(!is_sparse<ElementA>::value, "ElementA is not sparse");
+
+  static constexpr int ElementAMmaSparsity = ElementAMma::sparsity;
+  static constexpr int ElementEMmaSparsity = ElementEMma::sparsity;
+
+  // LayoutA is nested in the stride due to the sparsity.
+  static constexpr bool is_A_mn_major = cute::is_same_v<decltype(get<0>(LayoutA{}.stride())), Int<ElementAMmaSparsity>>;
+  static constexpr bool is_B_mn_major = cutlass::gemm::detail::is_major<0,StrideB>();
+
+  using SparseConfig = cutlass::Sm90GemmSparseConfig<ElementAMma,
+                                                     (is_A_mn_major ? GMMA::Major::MN : GMMA::Major::K),
+                                                     ElementEMma,
+                                                     decltype(cute::min(size<2>(TileShape{}),_128{}))>;
+
+  // The offline permutation for the metadata.
+  using SmemLayoutAtomE_ = typename SparseConfig::TensorEAtom;
+  using SmemLayoutAtomE  = ComposedLayout<Swizzle<0,4,3>,
+                                          smem_sparse_ptr_flag_bits<ElementEMmaSparsity, sizeof_bits_v<ElementE>>,
+                                          SmemLayoutAtomE_>;
+
+  // Metadata pathways
+  using SmemCopyAtomE = AutoVectorizingCopy;
+  using GmemCopyAtomE = GmemTiledCopyA;
+
+  using CtaShape_MNK = TileShape;
+  using MainloopPipeline = cutlass::PipelineTmaAsync<DispatchPolicy::Stages>;
+  using PipelineState = cutlass::PipelineState<DispatchPolicy::Stages>;
+
+  using PipelineParams = typename MainloopPipeline::Params;
+
+  // One thread per CTA is a producer (1 for operand tile)
+  static constexpr int NumProducerThreadEvents = 1;
+
+  static_assert(cute::rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M,K)");
+  static_assert((size<0>(TileShape{}) % size<0>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomA{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  static_assert(cute::rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (N,K)");
+  static_assert((size<1>(TileShape{}) % size<0>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+  static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomB{})) == 0, "SmemLayoutAtom must evenly divide tile shape.");
+
+  // Tile along modes in a way that maximizes the TMA box size.
+  using SmemLayoutA = decltype(tile_to_shape(
+      SmemLayoutAtomA{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<is_A_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+  using SmemLayoutE = decltype(tile_to_shape(
+      SmemLayoutAtomE{},
+      make_shape(shape<0>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{})));
+  using SmemLayoutB = decltype(tile_to_shape(
+      SmemLayoutAtomB{},
+      make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int<DispatchPolicy::Stages>{}),
+      cute::conditional_t<is_B_mn_major, Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
+
+  static_assert(DispatchPolicy::Stages >= 2, "Specialization requires Stages set to value 2 or more.");
+  static_assert(cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeA>::value &&
+                cute::is_base_of<cute::GMMA::DescriptorIterator, typename TiledMma::FrgTypeB>::value,
+                "MMA atom must source both A and B operand from smem_desc for this mainloop.");
+  static_assert(cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyA, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+  static_assert(cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD> || cute::is_same_v<GmemTiledCopyB, SM90_TMA_LOAD_MULTICAST>,
+      "GmemTiledCopy - invalid SM90 TMA copy atom specified.");
+
+  static_assert(cute::is_void_v<SmemCopyAtomA>,
+    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+  static_assert(cute::is_void_v<SmemCopyAtomB>,
+    "SM90 GMMA mainloops cannot have a non-void copy atom for smem sourced instructions.");
+
+  // TMA converts f32 input to tf32 when copying from GMEM to SMEM
+  // For all other types, cast to size equivalent uint type to avoid any rounding by TMA.
+  using TmaInternalElementA = cute::sparse_elem<ElementAMmaSparsity,
+                                                cute::conditional_t<cute::is_same_v<ElementA, float>,
+                                                                    cutlass::tfloat32_t,
+                                                                    uint_bit_t<sizeof_bits_v<ElementAMmaRaw>>>>;
+  using TmaInternalElementB = cute::conditional_t<cute::is_same_v<float, ElementB>, 
+                                                  tfloat32_t,
+                                                  uint_bit_t<sizeof_bits_v<ElementBMma>>>;
+
+  struct SharedStorage  // Tensor buffers for A, B and E plus the storage required by MainloopPipeline.
+  {
+    struct TensorStorage {  // Each buffer is 128-byte aligned and holds cosize(SmemLayoutX) elements.
+      alignas(128) cute::ArrayEngine<ElementAMma, cute::cosize_v<SmemLayoutA>> smem_A;  // A tiles, all pipeline stages
+      alignas(128) cute::ArrayEngine<ElementBMma, cute::cosize_v<SmemLayoutB>> smem_B;  // B tiles, all pipeline stages
+      alignas(128) cute::ArrayEngine<ElementEMma, cute::cosize_v<SmemLayoutE>> smem_E;  // E (metadata) tiles, all pipeline stages
+    } tensors;
+
+    using PipelineStorage = typename MainloopPipeline::SharedStorage;
+    PipelineStorage pipeline;  // its type is dictated entirely by MainloopPipeline
+  };
+  using TensorStorage = typename SharedStorage::TensorStorage;
+  using PipelineStorage = typename SharedStorage::PipelineStorage;
+
+  static constexpr int K_PIPE_MAX = DispatchPolicy::Stages;
+  static constexpr int K_PIPE_MMAS = 0;
+
+  static constexpr uint32_t TmaTransactionBytesMK = 
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutA{})) * cute::sizeof_bits_v<ElementAMma>) +
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutE{})) * cute::sizeof_bits_v<ElementEMma>);
+
+  static constexpr uint32_t TmaTransactionBytesNK =
+        cutlass::bits_to_bytes(cosize(take<0,2>(SmemLayoutB{})) * cute::sizeof_bits_v<ElementBMma>);
+
+  static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+  // Host side kernel arguments (converted to Params by to_underlying_arguments)
+  struct Arguments {
+    ElementA const* ptr_A{};  // A operand; paired with layout_a when the TMA copy for A is built
+    LayoutA layout_a{};  // compared in can_implement with SparseConfig::fill_layoutA(problem shape)
+    ElementB const* ptr_B{};  // B operand
+    StrideB dB{};  // stride of B, applied to the (N,K,L) shape
+    ElementE const* ptr_E{};  // E operand (metadata); paired with layout_e when the TMA copy for E is built
+    LayoutE layout_e{};  // compared in can_implement with SparseConfig::fill_layoutE(problem shape)
+    uint32_t mma_promotion_interval = 4;  // can_implement requires a multiple of the MMAs issued per mainloop iteration
+  };
+
+  // Device side kernel params (field order is relied upon by the brace-init in to_underlying_arguments)
+  struct Params {
+
+    using TMA_A = decltype(make_tma_copy_A_sm90<typename TmaInternalElementA::raw_type>(
+        GmemTiledCopyA{},
+        make_tensor(recast_ptr<TmaInternalElementA>(nullptr), LayoutA{}),
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}));  // mcast along N mode for this M load, if any
+
+    using TMA_E = decltype(make_tma_copy_A_sm90<uint64_t>( // use uint64_t to get the largest loading box.
+        GmemCopyAtomE{},
+        make_tensor(recast_ptr<ElementEMma>(nullptr), LayoutE{}),
+        SmemLayoutE{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}));  // mcast along N mode for this M load, if any
+
+    using TMA_B = decltype(make_tma_copy_B_sm90<TmaInternalElementB>(
+        GmemTiledCopyB{},
+        make_tensor(recast_ptr<TmaInternalElementB>(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}),
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{}));  // mcast along M mode for this N load, if any
+
+    TMA_A tma_load_a;  // TMA copy object for the A operand
+    TMA_E tma_load_e;  // TMA copy object for the E (metadata) operand
+    TMA_B tma_load_b;  // TMA copy object for the B operand
+    LayoutA layout_a;  // copied from Arguments::layout_a
+    LayoutE layout_e;  // copied from Arguments::layout_e
+    uint32_t tma_transaction_bytes = TmaTransactionBytes;  // A + E + B bytes over the first two smem layout modes (stage mode excluded)
+    uint32_t mma_promotion_interval = 4;  // copied from Arguments::mma_promotion_interval
+  };
+
+  //
+  // Methods
+  //
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    // The workspace pointer is not used by this collective.
+    (void) workspace;
+
+    // Pad a rank-3 (MNK) problem shape with a trailing 1 so that it is always rank-4 (MNKL).
+    auto [M,N,K,L] = append<4>(problem_shape, 1);
+
+    // Views of A, E and B, with each pointer recast to the element type used to build its TMA copy.
+    Tensor tensor_a = make_tensor(recast_ptr<TmaInternalElementA>(args.ptr_A), args.layout_a);
+    Tensor tensor_e = make_tensor(recast_ptr<ElementEMma>(args.ptr_E), args.layout_e);
+    Tensor tensor_b = make_tensor(recast_ptr<TmaInternalElementB>(args.ptr_B),
+                                  make_layout(make_shape(N,K,L), args.dB));
+
+    // A operand: multicast along the N mode for this M load, if any.
+    typename Params::TMA_A tma_load_a = make_tma_copy_A_sm90<typename TmaInternalElementA::raw_type>(
+        GmemTiledCopyA{},
+        tensor_a,
+        SmemLayoutA{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    // E operand: uint64_t is used to get the largest loading box; multicast along N, if any.
+    typename Params::TMA_E tma_load_e = make_tma_copy_A_sm90<uint64_t>(
+        GmemCopyAtomE{},
+        tensor_e,
+        SmemLayoutE{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    // B operand: multicast along the M mode for this N load, if any.
+    typename Params::TMA_B tma_load_b = make_tma_copy_B_sm90<TmaInternalElementB>(
+        GmemTiledCopyB{},
+        tensor_b,
+        SmemLayoutB{}(_,_,cute::Int<0>{}),
+        TileShape{},
+        ClusterShape{});
+
+    // Sum of the (M,K)-side bytes (A and E) and the (N,K)-side bytes (B).
+    uint32_t const transaction_bytes = TmaTransactionBytesMK + TmaTransactionBytesNK;
+
+    return {
+      tma_load_a,
+      tma_load_e,
+      tma_load_b,
+      args.layout_a,
+      args.layout_e,
+      transaction_bytes,
+      args.mma_promotion_interval
+    };
+  }
+
+  template<class ProblemShape>
+  CUTLASS_HOST_DEVICE static bool
+  can_implement(
+      ProblemShape const& problem_shape,
+      [[maybe_unused]] Arguments const& args) {
+    // Minimum TMA alignment is 128 bits; convert it to an element count for A and for B.
+    constexpr int tma_alignment_bits = 128;
+    constexpr int min_elems_A = tma_alignment_bits / cutlass::sizeof_bits<ElementA>::value;
+    constexpr int min_elems_B = tma_alignment_bits / cutlass::sizeof_bits<ElementB>::value;
+    auto problem_shape_MNKL = append<4>(problem_shape, 1);
+    auto [M,N,K,L] = problem_shape_MNKL;
+
+    // A is validated with its K extent halved; the stride used depends on which mode is contiguous.
+    bool size_check = false;
+    if constexpr (is_A_mn_major) {
+      size_check = cutlass::detail::check_alignment<min_elems_A>(cute::make_shape(M,K/2,L), cute::make_stride(_1{}, M, M*K/2));
+    }
+    else { // A is K-major
+      size_check = cutlass::detail::check_alignment<min_elems_A>(cute::make_shape(M,K/2,L), cute::make_stride(K/2, _1{}, M*K/2));
+    }
+    size_check = size_check && cutlass::detail::check_alignment<min_elems_B>(cute::make_shape(N,K,L), StrideB{});
+    if (!size_check) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n");
+    }
+
+    // The caller-provided layouts must equal the ones SparseConfig builds for this problem shape.
+    auto expected_layout_a = SparseConfig::fill_layoutA(problem_shape_MNKL);
+    auto expected_layout_e = SparseConfig::fill_layoutE(problem_shape_MNKL);
+    bool const layout_a_ok = (expected_layout_a == args.layout_a);
+    bool const layout_e_ok = (expected_layout_e == args.layout_e);
+    bool const layout_check = layout_a_ok && layout_e_ok;
+    if (!layout_check) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: Layout_a/e mismatch.\n");
+    }
+
+    // The promotion interval must be a multiple of the MMA instructions issued by each mainloop iteration.
+    auto const mmas_per_iteration = size<2>(TileShape{}) / TiledMma().template tile_size_mnk<2>();
+    bool const interval_check = (args.mma_promotion_interval % mmas_per_iteration) == 0;
+    if (!interval_check) {
+      CUTLASS_TRACE_HOST("  CAN IMPLEMENT: MMA promotion interval is not a multiple of number of MMA instructions per tile.\n");
+    }
+
+    // Each failed check has already been traced above; the kernel is implementable only if all pass.
+    return size_check && layout_check && interval_check;
+  }
+
+  /// Issue a prefetch of the A, E and B TMA descriptors -- ideally from a single thread for best performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& mainloop_params) {
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_a.get_tma_descriptor());  // A operand
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_e.get_tma_descriptor());  // E (metadata) operand
+    cute::prefetch_tma_descriptor(mainloop_params.tma_load_b.get_tma_descriptor());  // B operand
+  }
+
+  /// Set up the data needed by this collective for load and mma, returned as a tuple of tensors.
+  /// Contract with the kernel layer: the tuple holds at least two elements, and the first two are
+  /// gA_mkl - The tma tensor, A after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  /// gB_nkl - The tma tensor, B after a local tile so it has shape  (BLK_N,BLK_K,n,k,l)
+  /// Further elements are specific to the collective; here the third one is
+  /// gE_mkl - The tma tensor, E after a local tile so it has shape  (BLK_M,BLK_K,m,k,l)
+  template <class ProblemShape_MNKL>
+  CUTLASS_DEVICE auto
+  load_init(ProblemShape_MNKL const& problem_shape_MNKL, Params const& mainloop_params) const {
+    // Unpack the problem shape; only N, K and L are used below (for the extents of B).
+    auto [M,N,K,L] = problem_shape_MNKL;
+    auto const& params = mainloop_params;
+
+    // Full tensors obtained from TMA (TMA requires special handling of strides to deal with coord
+    // codomain mapping). A and E take their extents from the stored layouts, B from the problem shape.
+    Tensor full_A = params.tma_load_a.get_tma_tensor(params.layout_a.shape());                   // (m,k,l)
+    Tensor full_E = params.tma_load_e.get_tma_tensor(params.layout_e.shape());                   // (m,k,l)
+    Tensor full_B = params.tma_load_b.get_tma_tensor(make_shape(N,K,L));                         // (n,k,l)
+
+    // Tile each full tensor by TileShape; the Step projection keeps the (M,K) or (N,K) modes. Slicing is deferred.
+    Tensor gA_mkl = local_tile(full_A, TileShape{}, make_coord(_,_,_), Step<_1,Underscore,_1>{}); // (BLK_M,BLK_K,m,k,l)
+    Tensor gE_mkl = local_tile(full_E, TileShape{}, make_coord(_,_,_), Step<_1,Underscore,_1>{}); // (BLK_M,BLK_K,m,k,l)
+    Tensor gB_nkl = local_tile(full_B, TileShape{}, make_coord(_,_,_), Step<Underscore,_1,_1>{}); // (BLK_N,BLK_K,n,k,l)
+
+    return cute::make_tuple(gA_mkl, gB_nkl, gE_mkl);
+  }
+
+  /// Producer Perspective of the collective mainloop: for each of k_tile_count k-tiles, issue the
+  /// TMA copies of the A, E and B tiles from gmem into the smem stage selected by smem_pipe_write
+  template <
+    class TensorA, class TensorB, class TensorE,
+    class KTileIterator, class BlockCoord
+  >
+  CUTLASS_DEVICE void
+  load(
+      Params const& mainloop_params,
+      MainloopPipeline pipeline, 
+      PipelineState smem_pipe_write,
+      cute::tuple<TensorA, TensorB, TensorE> const& load_inputs,
+      BlockCoord const& blk_coord,
+      KTileIterator k_tile_iter, int k_tile_count,
+      int thread_idx,  // not referenced in this function
+      uint32_t block_rank_in_cluster,
+      TensorStorage& shared_tensors) {
+    int lane_predicate = cute::elect_one_sync();  // all work below is done only by the thread for which this is nonzero
+
+    if (lane_predicate) {
+      Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sE = make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{});        // (BLK_M,BLK_K,PIPE)
+      Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});        // (BLK_N,BLK_K,PIPE)
+
+      auto [gA_mkl, gB_nkl, gE_mkl] = load_inputs;  // same order as the tuple returned by load_init()
+
+      // Define the CTA-in-cluster Layout and Coord
+      Layout cta_layout_mnk = make_layout(ClusterShape{});
+      auto cta_coord_mnk = cta_layout_mnk.get_flat_coord(block_rank_in_cluster);
+
+      // TMA Multicast Masks (E uses the same multicast mode and slice coordinate as A)
+      uint16_t mcast_mask_a = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
+      uint16_t mcast_mask_e = create_tma_multicast_mask<1>(cta_layout_mnk, cta_coord_mnk);
+      uint16_t mcast_mask_b = create_tma_multicast_mask<0>(cta_layout_mnk, cta_coord_mnk);
+
+      auto block_tma_a = mainloop_params.tma_load_a.get_slice(get<1>(cta_coord_mnk));
+      auto block_tma_e = mainloop_params.tma_load_e.get_slice(get<1>(cta_coord_mnk));
+      auto block_tma_b = mainloop_params.tma_load_b.get_slice(get<0>(cta_coord_mnk));
+
+      // Partition the inputs based on the current block coordinates.
+      auto [m_coord, n_coord, k_coord, l_coord] = blk_coord;  // k_coord is unused; the k mode is indexed by *k_tile_iter below
+      Tensor gA = gA_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gE = gE_mkl(_,_,m_coord,_,l_coord);                                                     // (BLK_M,BLK_K,k)
+      Tensor gB = gB_nkl(_,_,n_coord,_,l_coord);                                                     // (BLK_N,BLK_K,k)
+
+      // Apply each block_tma_* mapping to its gmem source (partition_S) and smem destination (partition_D)
+      Tensor tAgA = block_tma_a.partition_S(gA);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tAsA = block_tma_a.partition_D(sA);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tEgE = block_tma_e.partition_S(gE);                                                 // (TMA,TMA_M,TMA_K,k)
+      Tensor tEsE = block_tma_e.partition_D(sE);                                              // (TMA,TMA_M,TMA_K,PIPE)
+
+      Tensor tBgB = block_tma_b.partition_S(gB);                                                 // (TMA,TMA_N,TMA_K,k)
+      Tensor tBsB = block_tma_b.partition_D(sB);                                              // (TMA,TMA_N,TMA_K,PIPE)
+
+      // Mainloop
+      CUTLASS_PRAGMA_NO_UNROLL
+      for ( ; k_tile_count > 0; --k_tile_count)
+      {
+        // LOCK smem_pipe_write for _writing_
+        pipeline.producer_acquire(smem_pipe_write);
+
+        //
+        // Copy gmem to smem for *k_tile_iter
+        //
+
+        using BarrierType = typename MainloopPipeline::ProducerBarrierType;
+        BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write);  // shared by all three copies of this stage
+
+        int write_stage = smem_pipe_write.index();
+        copy(mainloop_params.tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_e.with(*tma_barrier, mcast_mask_e), tEgE(_,_,_,*k_tile_iter), tEsE(_,_,_,write_stage));
+        copy(mainloop_params.tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage));
+        ++k_tile_iter;
+
+        // Advance smem_pipe_write
+        ++smem_pipe_write;
+      }
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void
+  load_tail(MainloopPipeline pipeline, PipelineState smem_pipe_write) {
+    int lane_predicate = cute::elect_one_sync();  // as in load(), only the elected thread proceeds
+
+    // Issue the epilogue waits
+    if (lane_predicate) {
+      /* This helps avoid early exit of blocks in a Cluster.
+       * Waits for every stage to either be released (all
+       * Consumer UNLOCKs) or, if the stage was never used,
+       * to simply be acquired since its phase is
+       * still inverted from make_producer_start_state.
+       */
+      pipeline.producer_tail(smem_pipe_write);
+    }
+  }
+
+  /// Perform a collective-scoped matrix multiply-accumulate
+  /// Consumer Perspective: consumes k_tile_count smem stages of A/B/E and accumulates into accum via GmmaFP8Accumulation
+  template <
+    class FrgTensorC
+  >
+  CUTLASS_DEVICE void
+  mma(MainloopPipeline pipeline,
+      PipelineState smem_pipe_read,
+      FrgTensorC& accum,
+      int k_tile_count,
+      int thread_idx,
+      TensorStorage& shared_tensors,
+      Params const& mainloop_params) {
+    static_assert(is_rmem<FrgTensorC>::value, "C tensor must be rmem resident.");
+    static_assert(cute::rank(SmemLayoutA{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutE{}) == 3, "Smem layout must be rank 3.");
+    static_assert(cute::rank(SmemLayoutB{}) == 3, "Smem layout must be rank 3.");
+
+    Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.begin()), SmemLayoutA{});          // (BLK_M,BLK_K,PIPE)
+    Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.begin()), SmemLayoutB{});          // (BLK_N,BLK_K,PIPE)
+    Tensor sE = as_position_independent_swizzle_tensor(
+      make_tensor(make_smem_ptr(shared_tensors.smem_E.begin()), SmemLayoutE{}));                   // (BLK_M,BLK_K,PIPE)
+
+    //
+    // Define C accumulators and A/B partitioning
+    //
+
+    // Layout of warp group to thread mapping
+
+    static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and 
+                  stride<0>(typename TiledMma::BLayout{}) == 0 and
+                  size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
+                  size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, 
+                  "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+
+    constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, 
+                                                  Int<NumThreadsPerWarpGroup>{});
+
+    int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);  // take lane 0's value for the whole warp
+
+    TiledMma tiled_mma;
+    auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    Tensor tCsA = thread_mma.partition_A(sA);                                                 // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCsB = thread_mma.partition_B(sB);                                                 // (MMA,MMA_N,MMA_K,PIPE)
+
+    // Allocate "fragments/descriptors"
+    Tensor tCrA = thread_mma.make_fragment_A(tCsA);                                           // (MMA,MMA_M,MMA_K,PIPE)
+    Tensor tCrB = thread_mma.make_fragment_B(tCsB);                                           // (MMA,MMA_N,MMA_K,PIPE)
+
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(accum));                                                         // M
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<2>(accum));                                                         // N
+    CUTE_STATIC_ASSERT_V(size<2>(tCsA) == size<2>(tCsB));                                                          // K
+    CUTE_STATIC_ASSERT_V(size<3>(tCsA) == size<3>(tCsB));                                                       // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sA));                                         // PIPE
+    CUTE_STATIC_ASSERT_V(Int<DispatchPolicy::Stages>{} == size<2>(sB));                                         // PIPE
+
+    auto copy_atom_E = Copy_Atom<SmemCopyAtomE, uint32_t>{};
+
+    Tensor tCsE = partition_E(thread_mma, sE(_,_,Int<0>{}));            // (MMA,MMA_M,MMA_K), stage 0 only: used just to shape tCrE
+    Tensor tCrE = make_fragment_like<ElementEMma>(tCsE);                // (MMA,MMA_M,MMA_K)
+
+    auto smem_tiled_copy_E = make_tiled_copy_E(copy_atom_E, tiled_mma);
+    auto smem_thr_copy_E   = smem_tiled_copy_E.get_thread_slice(thread_idx);
+
+    Tensor tEsE  = smem_thr_copy_E.partition_S(sE);                     // (ECPY,ECPY_M,ECPY_K,PIPE)
+    Tensor tErE  = smem_thr_copy_E.retile_D(tCrE);                      // (ECPY,ECPY_M,ECPY_K)
+
+    //
+    // PIPELINED MAIN LOOP
+    //
+    static_assert((0 <= K_PIPE_MMAS) && (K_PIPE_MMAS <  K_PIPE_MAX),
+        "ERROR : Incorrect number of MMAs in flight");
+
+    // We release buffers to producer warps(dma load) with some mmas in flight
+    PipelineState smem_pipe_release = smem_pipe_read;
+
+    // Prologue GMMAs
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+
+    tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+
+    GmmaFP8Accumulation accumulation(accum, mainloop_params.mma_promotion_interval, size<2>(tCrA));
+    warpgroup_fence_operand(accumulation());
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_tile_prologue = prologue_mma_count; k_tile_prologue > 0; --k_tile_prologue)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+      int read_stage = smem_pipe_read.index();
+
+      // Load metadata smem->rmem for one stage
+      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+
+      warpgroup_commit_batch();
+
+      accumulation.promote_if_needed();
+
+      ++smem_pipe_read;  // no consumer_release in the prologue: smem_pipe_release stays behind by prologue_mma_count stages
+    }
+
+    warpgroup_fence_operand(accumulation());
+    // Mainloop GMMAs
+    k_tile_count -= prologue_mma_count;
+
+    CUTLASS_PRAGMA_NO_UNROLL
+    for ( ; k_tile_count > 0; --k_tile_count)
+    {
+      // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value)
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+      int read_stage = smem_pipe_read.index();
+
+      // Load metadata smem->rmem for one stage
+      copy(smem_tiled_copy_E, tEsE(_,_,_,read_stage), tErE);
+
+      if (accumulation.prepare_if_needed()) {
+        tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+      }
+
+      warpgroup_fence_operand(accumulation());
+      warpgroup_arrive();
+      // Unroll the K mode manually to set scale D to 1
+      CUTLASS_PRAGMA_UNROLL
+      for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+        cute::gemm(tiled_mma, make_zip_tensor(tCrA(_,_,k_block,read_stage), tErE(_,_,k_block)), tCrB(_,_,k_block,read_stage), accumulation());
+        tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+      }
+      warpgroup_commit_batch();
+
+      /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure the stage at smem_pipe_release is consumed
+      warpgroup_wait<K_PIPE_MMAS>();
+      warpgroup_fence_operand(accumulation());
+
+      accumulation.promote_if_needed();
+
+      // UNLOCK smem_pipe_release, done _computing_ on it
+      pipeline.consumer_release(smem_pipe_release);
+
+      // Advance smem_pipe_read and smem_pipe_release
+      ++smem_pipe_read;
+      ++smem_pipe_release;
+    }
+
+    accumulation.promote_residue_if_needed();
+
+    warpgroup_fence_operand(accumulation());
+  }
+
+  /// Perform a Consumer Epilogue to release all buffers: waits for all GMMAs, then releases the stages mma() left unreleased
+  CUTLASS_DEVICE void
+  mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) {
+    // Same prologue count as computed in mma(); those stages were read there but not released
+    int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count);
+    k_tile_count -= prologue_mma_count;
+
+    smem_pipe_release.advance(k_tile_count);  // skip the stages that mma()'s mainloop already released
+    
+    // Wait on all GMMAs to complete
+    warpgroup_wait<0>();
+
+    for (int count = 0; count < prologue_mma_count; ++count) {
+      pipeline.consumer_release(smem_pipe_release);                 // UNLOCK smem_pipe_release, done _computing_ on it
+      ++smem_pipe_release;
+    }
+  }
+
+private:
+
+  template <class MMA_Atom,
+            class AtomLayoutMNK,
+            class PermutationMNK,
+            class ETensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  thrfrg_E(TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK> const& mma, ETensor&& etensor)
+  {  // Re-tiles an (M,K) E tensor/layout into ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK))) for the given TiledMMA
+    using TiledMma = TiledMMA<MMA_Atom, AtomLayoutMNK, PermutationMNK>;  // local alias for the type of the `mma` argument
+
+    CUTE_STATIC_ASSERT_V(rank(etensor) >= Int<2>{});
+
+    // Reorder the tensor for the TiledAtom (only the M and K permutations apply to E)
+    auto t_tile = make_tile(get<0>(PermutationMNK{}),
+                            get<2>(PermutationMNK{}));
+    auto t_tensor = logical_divide(etensor, t_tile);                 // (PermM,PermK)
+
+    // Tile the tensor for the Atom (M and K extents of AtomShape_MNK)
+    auto e_tile = make_tile(make_layout(size<0>(typename TiledMma::AtomShape_MNK{})),
+                            make_layout(size<2>(typename TiledMma::AtomShape_MNK{})));
+    auto e_tensor = zipped_divide(t_tensor, e_tile);                 // ((AtomM,AtomK),(RestM,RestK))
+
+    // Transform the Atom mode from (M,K) to (Thr,Val) using the atom's ELayout
+    using AtomLayoutE_TV = typename TiledMma::Atom::Traits::ELayout;
+    auto tv_tensor = e_tensor.compose(AtomLayoutE_TV{},_);           // ((ThrV,FrgV),(RestM,RestK))
+
+    // Tile the tensor for the Thread (M and K modes of thr_layout_vmnk_)
+    auto thr_tile = make_tile(_,
+                              make_tile(make_layout(size<1>(mma.thr_layout_vmnk_)),
+                                        make_layout(size<3>(mma.thr_layout_vmnk_))));
+    auto thr_tensor = zipped_divide(tv_tensor, thr_tile);            // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK)))
+
+    return thr_tensor;
+  }
+
+  template<class... MArgs>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  get_layoutE_TV(TiledMMA<MArgs...> const& mma)
+  {  // Builds the (thr_idx,val) -> (M,K) layout that make_tiled_copy_E passes to make_tiled_copy_impl
+    // (M,K) -> (M,K)
+    auto ref_E = make_layout(make_shape(tile_size<0>(mma), tile_size<2>(mma)));
+    // (ethrid,val) -> (M,K)
+    auto layoutE_TV = thrfrg_E(mma, ref_E);
+
+    // (ThrV,(ThrM,ThrK)) -> (ThrV,(ThrM,ThrN,ThrK))
+    auto etile = make_tile(_,
+                            make_tile(make_layout(make_shape (size<1>(mma.thr_layout_vmnk_), size<2>(mma.thr_layout_vmnk_)),
+                                                  make_stride(               Int<1>{} ,                Int<0>{} )),  // stride 0 on the ThrN mode
+                                      _));
+
+    // thr_idx -> (ThrV,ThrM,ThrN,ThrK)
+    auto thridx_2_thrid = right_inverse(mma.thr_layout_vmnk_);
+
+    // (thr_idx,val) -> (M,K)
+    return layoutE_TV.compose(etile, _).compose(thridx_2_thrid, _);
+  }
+
+  template <class... MArgs, class ETensor>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  partition_E(ThrMMA<MArgs...> const& thr_mma, ETensor&& etensor)
+  {  // Returns the slice of etensor that thrfrg_E assigns to the thread described by thr_mma
+    auto thr_tensor = make_tensor(static_cast<ETensor&&>(etensor).data(), thrfrg_E(thr_mma, etensor.layout()));
+
+    auto thr_vmk = make_coord(get<0>(thr_mma.thr_vmnk_), make_coord(get<1>(thr_mma.thr_vmnk_), get<3>(thr_mma.thr_vmnk_)));  // V, M and K coordinates only; index 2 (N) is not used
+    return thr_tensor(thr_vmk, make_coord(_, repeat<rank<1,1>(thr_tensor)>(_)));  // fix the thread mode, keep every value/rest mode
+  }
+
+  template <class... CArgs, class... MArgs>
+  CUTE_HOST_DEVICE static constexpr
+  auto
+  make_tiled_copy_E(Copy_Atom<CArgs...> const& copy_atom,
+                    TiledMMA<MArgs...>  const& mma)
+  {  // Tiled copy for E: thread-value layout from get_layoutE_TV, tiler shape (tile_size<0>(mma), tile_size<2>(mma))
+    return make_tiled_copy_impl(copy_atom, get_layoutE_TV(mma), make_shape(tile_size<0>(mma),tile_size<2>(mma)));
+  }
+
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass::gemm::collective
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/include/cutlass/gemm/device/base_grouped.h b/include/cutlass/gemm/device/base_grouped.h
index eec61981..fc59d7ff 100644
--- a/include/cutlass/gemm/device/base_grouped.h
+++ b/include/cutlass/gemm/device/base_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h
index e7ed2da9..75edf2fc 100644
--- a/include/cutlass/gemm/device/default_gemm_configuration.h
+++ b/include/cutlass/gemm/device/default_gemm_configuration.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/ell_gemm.h b/include/cutlass/gemm/device/ell_gemm.h
index 54ddab40..4261496b 100644
--- a/include/cutlass/gemm/device/ell_gemm.h
+++ b/include/cutlass/gemm/device/ell_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h
index c6f488b1..7c36f6a7 100644
--- a/include/cutlass/gemm/device/gemm.h
+++ b/include/cutlass/gemm/device/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_array.h b/include/cutlass/gemm/device/gemm_array.h
index 1ae2db46..c59ea0d5 100644
--- a/include/cutlass/gemm/device/gemm_array.h
+++ b/include/cutlass/gemm/device/gemm_array.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_batched.h b/include/cutlass/gemm/device/gemm_batched.h
index 5981457c..45a471ce 100644
--- a/include/cutlass/gemm/device/gemm_batched.h
+++ b/include/cutlass/gemm/device/gemm_batched.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_complex.h b/include/cutlass/gemm/device/gemm_complex.h
index e36c69ce..35965012 100644
--- a/include/cutlass/gemm/device/gemm_complex.h
+++ b/include/cutlass/gemm/device/gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_grouped.h b/include/cutlass/gemm/device/gemm_grouped.h
index 877375e9..3c1c9bc7 100644
--- a/include/cutlass/gemm/device/gemm_grouped.h
+++ b/include/cutlass/gemm/device/gemm_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h b/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
index 3de3cecb..bdc2e5f3 100644
--- a/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
+++ b/include/cutlass/gemm/device/gemm_layernorm_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_sparse.h b/include/cutlass/gemm/device/gemm_sparse.h
index ac453c63..57f345f4 100644
--- a/include/cutlass/gemm/device/gemm_sparse.h
+++ b/include/cutlass/gemm/device/gemm_sparse.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_sparse_universal.h b/include/cutlass/gemm/device/gemm_sparse_universal.h
index b7d8cecf..2c92030c 100644
--- a/include/cutlass/gemm/device/gemm_sparse_universal.h
+++ b/include/cutlass/gemm/device/gemm_sparse_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h b/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
index a313ddc9..c42c82b4 100644
--- a/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
+++ b/include/cutlass/gemm/device/gemm_sparse_universal_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_sparse_with_absmax.h b/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
index e599217a..5b86f123 100644
--- a/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
+++ b/include/cutlass/gemm/device/gemm_sparse_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_sparse_with_visitor.h b/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
index 73edfa35..c7007335 100644
--- a/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
+++ b/include/cutlass/gemm/device/gemm_sparse_with_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h
index f78c5a21..e0599810 100644
--- a/include/cutlass/gemm/device/gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h
index 55413b77..5da6a367 100644
--- a/include/cutlass/gemm/device/gemm_universal.h
+++ b/include/cutlass/gemm/device/gemm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal_adapter.h b/include/cutlass/gemm/device/gemm_universal_adapter.h
index 5c6c2a0f..51ec28c7 100644
--- a/include/cutlass/gemm/device/gemm_universal_adapter.h
+++ b/include/cutlass/gemm/device/gemm_universal_adapter.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h
index e23191ea..6f010c1b 100644
--- a/include/cutlass/gemm/device/gemm_universal_base.h
+++ b/include/cutlass/gemm/device/gemm_universal_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h b/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
index 7ef581ac..7de048bb 100644
--- a/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
+++ b/include/cutlass/gemm/device/gemm_universal_streamk_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal_with_absmax.h b/include/cutlass/gemm/device/gemm_universal_with_absmax.h
index 35f7b541..2459d3a1 100644
--- a/include/cutlass/gemm/device/gemm_universal_with_absmax.h
+++ b/include/cutlass/gemm/device/gemm_universal_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
index 809a504a..70b18347 100644
--- a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
+++ b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemm_with_k_reduction.h b/include/cutlass/gemm/device/gemm_with_k_reduction.h
index b25ae6a3..2f64d04b 100644
--- a/include/cutlass/gemm/device/gemm_with_k_reduction.h
+++ b/include/cutlass/gemm/device/gemm_with_k_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/gemv.h b/include/cutlass/gemm/device/gemv.h
index 5e181743..763f18e8 100644
--- a/include/cutlass/gemm/device/gemv.h
+++ b/include/cutlass/gemm/device/gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/rank_2k.h b/include/cutlass/gemm/device/rank_2k.h
index 296f38ca..8e7f436d 100644
--- a/include/cutlass/gemm/device/rank_2k.h
+++ b/include/cutlass/gemm/device/rank_2k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/rank_2k_grouped.h b/include/cutlass/gemm/device/rank_2k_grouped.h
index 6cbebc5d..0c59744b 100644
--- a/include/cutlass/gemm/device/rank_2k_grouped.h
+++ b/include/cutlass/gemm/device/rank_2k_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/rank_k.h b/include/cutlass/gemm/device/rank_k.h
index ae18a11b..665c4e3e 100644
--- a/include/cutlass/gemm/device/rank_k.h
+++ b/include/cutlass/gemm/device/rank_k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/symm.h b/include/cutlass/gemm/device/symm.h
index c36ef959..69a76996 100755
--- a/include/cutlass/gemm/device/symm.h
+++ b/include/cutlass/gemm/device/symm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/device/trmm.h b/include/cutlass/gemm/device/trmm.h
index 09b9152c..2a9ed8e1 100644
--- a/include/cutlass/gemm/device/trmm.h
+++ b/include/cutlass/gemm/device/trmm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp
index 83d6eb77..6c986243 100644
--- a/include/cutlass/gemm/dispatch_policy.hpp
+++ b/include/cutlass/gemm/dispatch_policy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -105,12 +105,22 @@ struct KernelCpAsyncWarpSpecializedPingpong { };
 struct KernelCpAsyncWarpSpecializedCooperative { };
 struct KernelTma { };
 struct KernelTmaWarpSpecialized { };
-struct KernelTmaWarpSpecializedPingpong { };
-struct KernelTmaWarpSpecializedCooperative { };
+struct KernelTmaWarpSpecializedPingpong { 
+};
+struct KernelTmaWarpSpecializedCooperative { 
+};
 
 struct KernelPtrArrayTmaWarpSpecializedCooperative { };
 struct KernelPtrArrayTmaWarpSpecializedPingpong { };
 
+// FP8 related policies (including Blocked Scaled Accumulation)
+struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum: KernelTmaWarpSpecializedCooperative { };
+
+// Policies to opt into mixed type GEMMs
+struct KernelTmaWarpSpecializedMixedInput : KernelTmaWarpSpecialized { };
+struct KernelTmaWarpSpecializedPingpongMixedInput : KernelTmaWarpSpecializedPingpong { };
+struct KernelTmaWarpSpecializedCooperativeMixedInput: KernelTmaWarpSpecializedCooperative { };
+
 //////////////////////////////////////////////////////////////////////////////
 
 //
@@ -125,14 +135,6 @@ struct KernelTmaWarpSpecializedCooperativeFP8FastAccum: KernelTmaWarpSpecialized
 struct KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum : KernelPtrArrayTmaWarpSpecializedCooperative { };
 struct KernelPtrArrayTmaWarpSpecializedPingpongFP8FastAccum : KernelPtrArrayTmaWarpSpecializedPingpong { };
 
-// FP8 related policies (including Blocked Scaled Accumulation)
-struct KernelTmaWarpSpecializedCooperativeFP8BlockScaledAccum: KernelTmaWarpSpecializedCooperative { };
-
-// Policies to opt into mixed type GEMMs
-struct KernelTmaWarpSpecializedMixedInput : KernelTmaWarpSpecialized { };
-struct KernelTmaWarpSpecializedPingpongMixedInput : KernelTmaWarpSpecializedPingpong { };
-struct KernelTmaWarpSpecializedCooperativeMixedInput: KernelTmaWarpSpecializedCooperative { };
-
 //////////////////////////////////////////////////////////////////////////////
 
 // Policies for dispatch of epilogue
@@ -288,6 +290,7 @@ struct MainloopSm90TmaGmmaWarpSpecializedFP8
     "KernelSchedule must be one of the warp specialized policies");
 };
 
+
 // n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule
 // For FP8 kernels with Block Scaling
 template<
@@ -302,7 +305,6 @@ struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingFP8
     "KernelSchedule must be one of the warp specialized policies");
 };
 
-
 // n-buffer in smem (Hopper TMA), pipelined with Hopper GMMA and TMA, Warp specialized dynamic schedule for Ptr-Array and Grouped Gemm
 template<
   int Stages_,
@@ -333,6 +335,16 @@ struct MainloopSm90TmaGmmaWarpSpecializedSparse {
   using Schedule = KernelSchedule;
 };
 
+// For slow-accumulation sparse FP8 kernels
+template<
+  int Stages,
+  class ClusterShape = Shape<_1,_1,_1>,
+  class KernelSchedule = KernelTmaWarpSpecializedCooperative
+>
+struct MainloopSm90TmaGmmaWarpSpecializedSparseFP8 
+  : MainloopSm90TmaGmmaWarpSpecializedSparse<Stages, ClusterShape, KernelSchedule> {
+};
+
 
 //////////////////////////////////////////////////////////////////////////////
 
diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h
index cfe29321..5137bfad 100644
--- a/include/cutlass/gemm/gemm.h
+++ b/include/cutlass/gemm/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/gemm_enumerated_types.h b/include/cutlass/gemm/gemm_enumerated_types.h
index 66aae898..8961735b 100644
--- a/include/cutlass/gemm/gemm_enumerated_types.h
+++ b/include/cutlass/gemm/gemm_enumerated_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/group_array_problem_shape.hpp b/include/cutlass/gemm/group_array_problem_shape.hpp
index fbc0fdd7..73bdce50 100644
--- a/include/cutlass/gemm/group_array_problem_shape.hpp
+++ b/include/cutlass/gemm/group_array_problem_shape.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_ell_gemm.h b/include/cutlass/gemm/kernel/default_ell_gemm.h
index 49f9eef3..561508c7 100644
--- a/include/cutlass/gemm/kernel/default_ell_gemm.h
+++ b/include/cutlass/gemm/kernel/default_ell_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h
index 4678df4a..da41c3e0 100644
--- a/include/cutlass/gemm/kernel/default_gemm.h
+++ b/include/cutlass/gemm/kernel/default_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h
index 7ef46c6c..438769f3 100644
--- a/include/cutlass/gemm/kernel/default_gemm_complex.h
+++ b/include/cutlass/gemm/kernel/default_gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_grouped.h b/include/cutlass/gemm/kernel/default_gemm_grouped.h
index f9163874..1481465b 100644
--- a/include/cutlass/gemm/kernel/default_gemm_grouped.h
+++ b/include/cutlass/gemm/kernel/default_gemm_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_grouped_per_group_scale.h b/include/cutlass/gemm/kernel/default_gemm_grouped_per_group_scale.h
index 3b7b126a..2ace2127 100644
--- a/include/cutlass/gemm/kernel/default_gemm_grouped_per_group_scale.h
+++ b/include/cutlass/gemm/kernel/default_gemm_grouped_per_group_scale.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h b/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
index a031c1a9..7ad2f90f 100644
--- a/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
+++ b/include/cutlass/gemm/kernel/default_gemm_grouped_softmax_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h b/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
index 68d739e3..d06a2a21 100644
--- a/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
+++ b/include/cutlass/gemm/kernel/default_gemm_layernorm_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
index df74a074..5c50d003 100644
--- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
+++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse.h b/include/cutlass/gemm/kernel/default_gemm_sparse.h
index f1841a37..8bc5ca03 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h b/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
index 250a0e7b..60965524 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h b/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
index 01939092..15d9d790 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse_universal_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h b/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
index 30d06323..2f8a2f28 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h b/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
index 9d7f2c6f..eb2167fd 100644
--- a/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
+++ b/include/cutlass/gemm/kernel/default_gemm_sparse_with_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
index 061bb749..c4aed55c 100644
--- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h b/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
index c19fdb5e..683fc511 100644
--- a/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
+++ b/include/cutlass/gemm/kernel/default_gemm_streamk_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h
index ed7951be..29ff219d 100644
--- a/include/cutlass/gemm/kernel/default_gemm_universal.h
+++ b/include/cutlass/gemm/kernel/default_gemm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h b/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
index a3c69f2d..0ec473e4 100644
--- a/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
+++ b/include/cutlass/gemm/kernel/default_gemm_universal_with_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_absmax.h b/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
index 3fd643e7..b27a078c 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
index e95c2561..e53f31fc 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
index ca4c2cba..01019cf2 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_k_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
index 1a578f09..e24dd923 100644
--- a/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
+++ b/include/cutlass/gemm/kernel/default_gemm_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h
index db630640..a574dabb 100755
--- a/include/cutlass/gemm/kernel/default_gemv.h
+++ b/include/cutlass/gemm/kernel/default_gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_2k.h b/include/cutlass/gemm/kernel/default_rank_2k.h
index 63400ef4..f52e5d7f 100644
--- a/include/cutlass/gemm/kernel/default_rank_2k.h
+++ b/include/cutlass/gemm/kernel/default_rank_2k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_2k_complex.h b/include/cutlass/gemm/kernel/default_rank_2k_complex.h
index 1a685286..7b6e3290 100644
--- a/include/cutlass/gemm/kernel/default_rank_2k_complex.h
+++ b/include/cutlass/gemm/kernel/default_rank_2k_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_2k_grouped.h b/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
index 7c79dd61..7f5efe32 100644
--- a/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
+++ b/include/cutlass/gemm/kernel/default_rank_2k_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_2k_universal.h b/include/cutlass/gemm/kernel/default_rank_2k_universal.h
index 41e9cc45..a27be8d1 100644
--- a/include/cutlass/gemm/kernel/default_rank_2k_universal.h
+++ b/include/cutlass/gemm/kernel/default_rank_2k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_k.h b/include/cutlass/gemm/kernel/default_rank_k.h
index 780b205a..5001b338 100644
--- a/include/cutlass/gemm/kernel/default_rank_k.h
+++ b/include/cutlass/gemm/kernel/default_rank_k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_k_complex.h b/include/cutlass/gemm/kernel/default_rank_k_complex.h
index 56d2fcc9..21ccc331 100644
--- a/include/cutlass/gemm/kernel/default_rank_k_complex.h
+++ b/include/cutlass/gemm/kernel/default_rank_k_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_rank_k_universal.h b/include/cutlass/gemm/kernel/default_rank_k_universal.h
index 309ea464..503040a7 100644
--- a/include/cutlass/gemm/kernel/default_rank_k_universal.h
+++ b/include/cutlass/gemm/kernel/default_rank_k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_symm.h b/include/cutlass/gemm/kernel/default_symm.h
index 8f0ff425..435e46b3 100755
--- a/include/cutlass/gemm/kernel/default_symm.h
+++ b/include/cutlass/gemm/kernel/default_symm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_symm_complex.h b/include/cutlass/gemm/kernel/default_symm_complex.h
index c2f80310..028296c0 100755
--- a/include/cutlass/gemm/kernel/default_symm_complex.h
+++ b/include/cutlass/gemm/kernel/default_symm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_symm_universal.h b/include/cutlass/gemm/kernel/default_symm_universal.h
index ac0da25d..8915df67 100755
--- a/include/cutlass/gemm/kernel/default_symm_universal.h
+++ b/include/cutlass/gemm/kernel/default_symm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_trmm.h b/include/cutlass/gemm/kernel/default_trmm.h
index 3380eee3..8e004d07 100644
--- a/include/cutlass/gemm/kernel/default_trmm.h
+++ b/include/cutlass/gemm/kernel/default_trmm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_trmm_complex.h b/include/cutlass/gemm/kernel/default_trmm_complex.h
index c5cba8fb..d8eeee10 100644
--- a/include/cutlass/gemm/kernel/default_trmm_complex.h
+++ b/include/cutlass/gemm/kernel/default_trmm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/default_trmm_universal.h b/include/cutlass/gemm/kernel/default_trmm_universal.h
index e06e15ca..fef1fcde 100644
--- a/include/cutlass/gemm/kernel/default_trmm_universal.h
+++ b/include/cutlass/gemm/kernel/default_trmm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/ell_gemm.h b/include/cutlass/gemm/kernel/ell_gemm.h
index aad32959..16010fd6 100644
--- a/include/cutlass/gemm/kernel/ell_gemm.h
+++ b/include/cutlass/gemm/kernel/ell_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h
index 354f5ea8..22b5f48d 100644
--- a/include/cutlass/gemm/kernel/gemm.h
+++ b/include/cutlass/gemm/kernel/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h
index bafa5fa8..88128062 100644
--- a/include/cutlass/gemm/kernel/gemm_array.h
+++ b/include/cutlass/gemm/kernel/gemm_array.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h
index 0c11e997..efd5b846 100644
--- a/include/cutlass/gemm/kernel/gemm_batched.h
+++ b/include/cutlass/gemm/kernel/gemm_batched.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_grouped.h b/include/cutlass/gemm/kernel/gemm_grouped.h
index daa6cbd7..3a4098cc 100644
--- a/include/cutlass/gemm/kernel/gemm_grouped.h
+++ b/include/cutlass/gemm/kernel/gemm_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_grouped_per_group_scale.h b/include/cutlass/gemm/kernel/gemm_grouped_per_group_scale.h
index 972681ab..65325e50 100644
--- a/include/cutlass/gemm/kernel/gemm_grouped_per_group_scale.h
+++ b/include/cutlass/gemm/kernel/gemm_grouped_per_group_scale.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h b/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
index 1c4411bd..dc37d560 100644
--- a/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
+++ b/include/cutlass/gemm/kernel/gemm_grouped_problem_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h b/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
index 3d889469..f6fc2223 100644
--- a/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
+++ b/include/cutlass/gemm/kernel/gemm_grouped_softmax_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h b/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
index f324d7b3..c862cc00 100644
--- a/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
+++ b/include/cutlass/gemm/kernel/gemm_layernorm_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_params.h b/include/cutlass/gemm/kernel/gemm_params.h
index 5a7f29d8..a3b0eb89 100755
--- a/include/cutlass/gemm/kernel/gemm_params.h
+++ b/include/cutlass/gemm/kernel/gemm_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h
index 019f93c8..4d199825 100644
--- a/include/cutlass/gemm/kernel/gemm_pipelined.h
+++ b/include/cutlass/gemm/kernel/gemm_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h
index 09228ca0..0f8cd338 100644
--- a/include/cutlass/gemm/kernel/gemm_planar_complex.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
index 0c21fb8d..1685f23f 100644
--- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
+++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_sparse_universal.h b/include/cutlass/gemm/kernel/gemm_sparse_universal.h
index c5420c72..035caf7b 100644
--- a/include/cutlass/gemm/kernel/gemm_sparse_universal.h
+++ b/include/cutlass/gemm/kernel/gemm_sparse_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h b/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
index 47b76a17..6251c389 100644
--- a/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
+++ b/include/cutlass/gemm/kernel/gemm_sparse_universal_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
index 8ab98ff0..a21f0813 100644
--- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
+++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h b/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
index 013fb773..473819af 100644
--- a/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
+++ b/include/cutlass/gemm/kernel/gemm_streamk_with_fused_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_transpose_operands.h b/include/cutlass/gemm/kernel/gemm_transpose_operands.h
index 4a2258c4..98bc2271 100644
--- a/include/cutlass/gemm/kernel/gemm_transpose_operands.h
+++ b/include/cutlass/gemm/kernel/gemm_transpose_operands.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h
index 08b30c74..be1e1d86 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.h
+++ b/include/cutlass/gemm/kernel/gemm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal.hpp b/include/cutlass/gemm/kernel/gemm_universal.hpp
index 6c7b89a2..2b54758d 100644
--- a/include/cutlass/gemm/kernel/gemm_universal.hpp
+++ b/include/cutlass/gemm/kernel/gemm_universal.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal_decl.h b/include/cutlass/gemm/kernel/gemm_universal_decl.h
index 73426db5..94652342 100644
--- a/include/cutlass/gemm/kernel/gemm_universal_decl.h
+++ b/include/cutlass/gemm/kernel/gemm_universal_decl.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal_streamk.h b/include/cutlass/gemm/kernel/gemm_universal_streamk.h
index 39a9bfb5..96a09569 100644
--- a/include/cutlass/gemm/kernel/gemm_universal_streamk.h
+++ b/include/cutlass/gemm/kernel/gemm_universal_streamk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h b/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
index 5ce123a1..e8fdea73 100644
--- a/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
+++ b/include/cutlass/gemm/kernel/gemm_universal_with_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h b/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
index 5d8ce789..3fd9d605 100644
--- a/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
+++ b/include/cutlass/gemm/kernel/gemm_universal_with_visitor_streamk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_with_absmax.h b/include/cutlass/gemm/kernel/gemm_with_absmax.h
index 470eaef5..f1a3ec86 100644
--- a/include/cutlass/gemm/kernel/gemm_with_absmax.h
+++ b/include/cutlass/gemm/kernel/gemm_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
index 363d109c..b27c1678 100644
--- a/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
+++ b/include/cutlass/gemm/kernel/gemm_with_fused_epilogue.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
index 49c4b0a1..c8b24ee4 100644
--- a/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
+++ b/include/cutlass/gemm/kernel/gemm_with_k_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemv.h b/include/cutlass/gemm/kernel/gemv.h
index 9ec55e13..eb5da1a7 100644
--- a/include/cutlass/gemm/kernel/gemv.h
+++ b/include/cutlass/gemm/kernel/gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h
index 673f1995..3b22c110 100755
--- a/include/cutlass/gemm/kernel/gemv_batched_strided.h
+++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/grouped_problem_visitor.h b/include/cutlass/gemm/kernel/grouped_problem_visitor.h
index 4df76ec0..7aaaa094 100644
--- a/include/cutlass/gemm/kernel/grouped_problem_visitor.h
+++ b/include/cutlass/gemm/kernel/grouped_problem_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/params_sparse_base.h b/include/cutlass/gemm/kernel/params_sparse_base.h
index 6080e799..3b1d2c95 100644
--- a/include/cutlass/gemm/kernel/params_sparse_base.h
+++ b/include/cutlass/gemm/kernel/params_sparse_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/params_universal_base.h b/include/cutlass/gemm/kernel/params_universal_base.h
index 172855ed..46933d90 100644
--- a/include/cutlass/gemm/kernel/params_universal_base.h
+++ b/include/cutlass/gemm/kernel/params_universal_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/rank_2k_grouped.h b/include/cutlass/gemm/kernel/rank_2k_grouped.h
index e8383faf..84d70212 100644
--- a/include/cutlass/gemm/kernel/rank_2k_grouped.h
+++ b/include/cutlass/gemm/kernel/rank_2k_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h b/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
index 054d2a73..c9fcf0c0 100644
--- a/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
+++ b/include/cutlass/gemm/kernel/rank_2k_grouped_problem_visitor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h b/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
index 11b2a915..349cd25d 100644
--- a/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
+++ b/include/cutlass/gemm/kernel/rank_2k_transpose_operands.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/rank_2k_universal.h b/include/cutlass/gemm/kernel/rank_2k_universal.h
index bd7ffb0e..f304d060 100644
--- a/include/cutlass/gemm/kernel/rank_2k_universal.h
+++ b/include/cutlass/gemm/kernel/rank_2k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/rank_k_universal.h b/include/cutlass/gemm/kernel/rank_k_universal.h
index ad418286..96091432 100644
--- a/include/cutlass/gemm/kernel/rank_k_universal.h
+++ b/include/cutlass/gemm/kernel/rank_k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm70_gemm.hpp b/include/cutlass/gemm/kernel/sm70_gemm.hpp
index b6ad7613..5475b53b 100644
--- a/include/cutlass/gemm/kernel/sm70_gemm.hpp
+++ b/include/cutlass/gemm/kernel/sm70_gemm.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
index c0c10b97..c19f33fb 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -209,10 +209,20 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
 
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
 
     // Calculate workspace pointers
     uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
index 1b7c0cb4..62096e82 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -217,10 +217,20 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
 
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
 
     // Calculate workspace pointers
     uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
index c7245457..2292d7e4 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
index b278f96e..cafab8b9 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
index 4d14644b..4482e25d 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -197,10 +197,20 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
 
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
 
     // Calculate workspace pointers
     uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
index c19a8e9f..cba20b5c 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -210,9 +210,20 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
 
     // Calculate workspace pointers
     uint8_t* workspace_ptr = reinterpret_cast<uint8_t*>(workspace);
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
index c2a888ae..e7cafde5 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
index 04174520..1d35ff2d 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_cooperative.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -179,10 +179,21 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
 
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
+
     TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
       problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
 
diff --git a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
index 142fabd2..be086f0c 100644
--- a/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
+++ b/include/cutlass/gemm/kernel/sm90_gemm_warpspecialized_pingpong.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -191,10 +191,21 @@ public:
           "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
       sm_count = KernelHardwareInfo::query_device_multiprocessor_count(args.hw_info.device_id);
     }
-
     CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid SM count to " << sm_count);
 
-    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    // Get maximum number of clusters that could co-exist on the target device
+    int max_active_clusters = args.hw_info.max_active_clusters;
+    if (max_active_clusters <= 0) {
+      max_active_clusters = 0;
+      CUTLASS_TRACE_HOST("  WARNING: Arguments do not include a valid max cluster count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the max_active_clusters.");
+    }
+    else {
+      CUTLASS_TRACE_HOST("to_underlying_arguments(): Setting persistent grid cluster count to " << max_active_clusters);
+    }
+
+    KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count, max_active_clusters};
+
     TileSchedulerParams scheduler = TileScheduler::to_underlying_arguments(
       problem_shape_MNKL, TileShape{}, ClusterShape{}, hw_info, args.scheduler, workspace);
 
diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
index 08437c70..7a5835c1 100644
--- a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
+++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
index a30d9ce0..ea52a1b8 100644
--- a/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
+++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler_group.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
index b5e62164..b3413c85 100644
--- a/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
+++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -412,7 +412,8 @@ public:
     FrgTensorC& accumulators,
     uint32_t num_barriers,
     uint32_t barrier_idx,
-    uint32_t num_accumulator_mtxs = 1) {
+    uint32_t num_accumulator_mtxs = 1,
+    uint32_t idx_accumulator_mtxs = 0) {
 
     using ElementAccumulator = typename FrgTensorC::value_type;
 
@@ -443,7 +444,8 @@ public:
     // Reductions use BlockStripedReduce with a width of BarrierManager::ThreadCount under the hood.
     // Thus, the start of the reduction space is the same across all threads in a warp group.
     uint64_t reduction_offset_base = (static_cast<uint64_t>(cute::size<0>(TileShape{})) * static_cast<uint64_t>(cute::size<1>(TileShape{})) * reduction_tile_idx * num_accumulator_mtxs) +
-      (static_cast<uint64_t>(size(accumulators)) * barrier_idx * BarrierManager::ThreadCount);
+      (static_cast<uint64_t>(size(accumulators)) * barrier_idx * BarrierManager::ThreadCount * num_accumulator_mtxs)
+      + static_cast<uint64_t>(size(accumulators)) * BarrierManager::ThreadCount * idx_accumulator_mtxs;
     uint64_t reduction_offset = reduction_offset_base + reduction_peer_offset;
 
     ElementAccumulator* group_reduction_workspace = reinterpret_cast<ElementAccumulator*>(params.reduction_workspace_) + reduction_offset;
@@ -497,8 +499,18 @@ public:
         BlockStripedReduceT::store(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
       }
       else {
-        // Wait until the preceding split added its accumulators
-        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
+        if (params.reduction_mode_ == ReductionMode::Deterministic) {
+          // Wait until the preceding split added its accumulators
+          BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
+        }
+        else {
+          // Wait until the first split has stored its accumulators. Note that the first split will have
+          // accumulated a value into the lock potentially greater than one (since the locked value is
+          // incremented by work_tile_info.k_tile_count below in both the deterministic and non-deterministic
+          // cases). For non-deterministic reductions, all that splits other than the first or last care about
+          // is whether the first split has been written, so we only wait while the locked value is less than 1.
+          BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1);
+        }
 
         // Perform reduction in workspace
         BlockStripedReduceT::reduce(reduction_workspace_array, *accumulator_array, barrier_group_thread_idx);
@@ -509,21 +521,13 @@ public:
       uint32_t increment = params.requires_separate_reduction() ? 1 : work_tile_info.k_tile_count;
 
       // Signal our arrival
-      BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment);
+      if (idx_accumulator_mtxs == (num_accumulator_mtxs - 1)) {
+        BarrierManager::arrive_inc(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, increment);
+      }
     }
     else {
-      if (
-        params.reduction_mode_ == ReductionMode::Deterministic
-      ) {
-
-        // Wait until the preceding split added its accumulators
-        BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
-
-      }
-      else {
-        // Wait until the first split has stored its accumulators
-        BarrierManager::wait_lt(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, 1);
-      }
+      // Wait until the preceding split added its accumulators
+      BarrierManager::wait_eq(barrier_idx, lock_workspace, barrier_group_thread_idx, lock_idx, work_tile_info.K_idx);
 
       // The block computing the final split for the tile adds previously-reduced partials
       // to its accumulators and computes the epilogue.
@@ -554,10 +558,7 @@ public:
       AccumulatorArrayT addend_fragment;
       auto peer_reduction_workspace = reinterpret_cast<AccumulatorArrayT*>(reduction_workspace + (i * peer_offset));
 
-      BlockStripedReduceT::load(addend_fragment, peer_reduction_workspace, thread_idx);
-
-      // Add peer fragment
-      *accumulator_array = add_fragments(*accumulator_array, addend_fragment);
+      BlockStripedReduceT::load_add(*accumulator_array, peer_reduction_workspace, thread_idx);
     }
   }
 
diff --git a/include/cutlass/gemm/kernel/sparse_gemm.h b/include/cutlass/gemm/kernel/sparse_gemm.h
index af274ee0..84102a6c 100644
--- a/include/cutlass/gemm/kernel/sparse_gemm.h
+++ b/include/cutlass/gemm/kernel/sparse_gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h b/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
index f464e29c..0574c218 100644
--- a/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
+++ b/include/cutlass/gemm/kernel/sparse_gemm_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h b/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
index 36480408..a8ec1c3d 100644
--- a/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
+++ b/include/cutlass/gemm/kernel/sparse_gemm_with_visitor.h
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/static_tile_scheduler.hpp b/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
index 67d346e3..8401fb0c 100644
--- a/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
+++ b/include/cutlass/gemm/kernel/static_tile_scheduler.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/symm_universal.h b/include/cutlass/gemm/kernel/symm_universal.h
index b51cc6ed..29cf977c 100755
--- a/include/cutlass/gemm/kernel/symm_universal.h
+++ b/include/cutlass/gemm/kernel/symm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/tile_scheduler.hpp b/include/cutlass/gemm/kernel/tile_scheduler.hpp
index ba6b4243..a5246309 100644
--- a/include/cutlass/gemm/kernel/tile_scheduler.hpp
+++ b/include/cutlass/gemm/kernel/tile_scheduler.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/kernel/tile_scheduler_params.h b/include/cutlass/gemm/kernel/tile_scheduler_params.h
index da8794bb..9ac78311 100644
--- a/include/cutlass/gemm/kernel/tile_scheduler_params.h
+++ b/include/cutlass/gemm/kernel/tile_scheduler_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -217,6 +217,7 @@ struct PersistentTileSchedulerSm90Params {
     ) {
 
     int const sm_count = hw_info.sm_count;
+    int const max_active_clusters = hw_info.max_active_clusters;
 
     // Round up to nearest multiple of swizzle_size along each mode
     auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
@@ -259,6 +260,18 @@ struct PersistentTileSchedulerSm90Params {
         launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
       }
     }
+    // In case the maximum number of clusters that could co-exist on the target device is
+    // already calculated using cudaOccupancyMaxActiveClusters
+    else if (max_active_clusters != 0) {
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = max_active_clusters * cluster_shape.n();
+      }
+      else {
+        launch_grid.x = max_active_clusters * cluster_shape.m();
+      }
+      CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = "
+          "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
+    }
     else {
       int cta_per_device = sm_count;
       /*
@@ -278,6 +291,8 @@ struct PersistentTileSchedulerSm90Params {
             cta_per_device       / cluster_shape.n(),
             problem_blocks_total / cluster_shape.n());
       }
+      CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using heuristics = "
+          "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
     }
     return launch_grid;
   }
@@ -635,6 +650,7 @@ struct PersistentTileSchedulerSm90StreamKParams {
     // number of K tiles per stream-K unit remains above min_iters_per_sk_unit_
 
     uint32_t groups = platform::min(max_groups_problem, uint32_t(max_sk_groups_));
+
     // Grouping is disabled when separate reduction is used because grouping is primarily an attempt
     // to improve L2 locality, and L2-locality optimizations are unnecessary when the the kernel
     // is a single wave (which is the case for separate reduction).
@@ -755,7 +771,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
       cluster_shape,
       splits,
       epilogue_subtile,
-      reduction_mode);
+      reduction_mode
+      );
   }
 
   // Return the optimal decomposition result by heuristic.
@@ -906,7 +923,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
     GemmCoord cluster_shape,
     uint32_t splits,
     uint32_t epilogue_subtile,
-    ReductionMode reduction_mode) {
+    ReductionMode reduction_mode
+    ) {
     // The highest priority when customers set as splitk mode, may set
     // with a adpated splits value rather than the original splits
     // even it does not make sense
@@ -1666,6 +1684,7 @@ struct PersistentTileSchedulerSm90GroupParams {
     bool truncate_by_problem_size=true) {
 
     int const sm_count = hw_info.sm_count;
+    int const max_active_clusters = hw_info.max_active_clusters;
 
     // Round up to nearest multiple of swizzle_size along each mode
     auto log_swizzle_size = get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle_size);
@@ -1708,6 +1727,18 @@ struct PersistentTileSchedulerSm90GroupParams {
         launch_grid.x = possibly_truncate(sm_count, problem_blocks_total);
       }
     }
+    // In case the maximum number of clusters that could co-exist on the target device is
+    // already calculated using cudaOccupancyMaxActiveClusters
+    else if (max_active_clusters != 0) {
+      if (raster_order == RasterOrder::AlongN) {
+        launch_grid.y = max_active_clusters * cluster_shape.n();
+      }
+      else {
+        launch_grid.x = max_active_clusters * cluster_shape.m();
+      }
+      CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = "
+          "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
+    }
     else {
       // Optimal grid size calculation is based on
       // GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
@@ -1725,6 +1756,8 @@ struct PersistentTileSchedulerSm90GroupParams {
             cta_per_device       / cluster_shape.n(),
             problem_blocks_total / cluster_shape.n());
       }
+      CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using heuristics = "
+          "(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
     }
     return launch_grid;
   }
diff --git a/include/cutlass/gemm/kernel/trmm_universal.h b/include/cutlass/gemm/kernel/trmm_universal.h
index 50b33eab..992aa484 100644
--- a/include/cutlass/gemm/kernel/trmm_universal.h
+++ b/include/cutlass/gemm/kernel/trmm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h
index 2e3798b1..018963b2 100644
--- a/include/cutlass/gemm/thread/mma.h
+++ b/include/cutlass/gemm/thread/mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h
index 4c70bcf3..e05c56e3 100644
--- a/include/cutlass/gemm/thread/mma_sm50.h
+++ b/include/cutlass/gemm/thread/mma_sm50.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h
index 5e217898..64c8e033 100644
--- a/include/cutlass/gemm/thread/mma_sm60.h
+++ b/include/cutlass/gemm/thread/mma_sm60.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h
index a1abb05f..f7127ed8 100644
--- a/include/cutlass/gemm/thread/mma_sm61.h
+++ b/include/cutlass/gemm/thread/mma_sm61.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_ell_mma.h b/include/cutlass/gemm/threadblock/default_ell_mma.h
index fba28126..e27c582e 100644
--- a/include/cutlass/gemm/threadblock/default_ell_mma.h
+++ b/include/cutlass/gemm/threadblock/default_ell_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h
index 404e1891..214f451c 100755
--- a/include/cutlass/gemm/threadblock/default_gemv_core.h
+++ b/include/cutlass/gemm/threadblock/default_gemv_core.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h
index 8885d1ff..c77de40e 100644
--- a/include/cutlass/gemm/threadblock/default_mma.h
+++ b/include/cutlass/gemm/threadblock/default_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h
index da83982f..16860880 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h
index 91f4710e..9c9f3e6f 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
index 41000dc1..fafc45c0 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
index 0162ef0d..39422ec8 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
index ae21ee8b..a839a776 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
index 985693ce..4abf7235 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_sparse_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h b/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
index 66501074..b260c911 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_with_access_size.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h b/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
index 9f45601a..72015956 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
index 5f8e3e33..7b3bbcf7 100644
--- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
+++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h b/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
index 5dd3dbc3..4281d5af 100644
--- a/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
+++ b/include/cutlass/gemm/threadblock/default_mma_layernorm_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
index 1895962a..cab385af 100644
--- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
+++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
index e800ba44..51327c1a 100644
--- a/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
+++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h b/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
index f50d36a4..f429b525 100644
--- a/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
+++ b/include/cutlass/gemm/threadblock/default_mma_softmax_mainloop_fusion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_mma_with_reduction.h b/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
index 677c1144..c1e0af76 100644
--- a/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
+++ b/include/cutlass/gemm/threadblock/default_mma_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
index 7f249780..62d0c49b 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
index cab2a96a..8751495a 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
index 33150314..f9716f32 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h b/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
index abcb063e..4045dd2e 100644
--- a/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
+++ b/include/cutlass/gemm/threadblock/default_multistage_trmm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_sparse_mma.h b/include/cutlass/gemm/threadblock/default_sparse_mma.h
index 388b9c47..ca982121 100644
--- a/include/cutlass/gemm/threadblock/default_sparse_mma.h
+++ b/include/cutlass/gemm/threadblock/default_sparse_mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/default_trmm.h b/include/cutlass/gemm/threadblock/default_trmm.h
index 5e90f25c..2500d51a 100644
--- a/include/cutlass/gemm/threadblock/default_trmm.h
+++ b/include/cutlass/gemm/threadblock/default_trmm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/ell_mma_multistage.h b/include/cutlass/gemm/threadblock/ell_mma_multistage.h
index 17cc9dae..83723619 100644
--- a/include/cutlass/gemm/threadblock/ell_mma_multistage.h
+++ b/include/cutlass/gemm/threadblock/ell_mma_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/ell_mma_pipelined.h b/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
index 55a951e1..adcff38d 100644
--- a/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
+++ b/include/cutlass/gemm/threadblock/ell_mma_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h
index e246ddce..314f58b7 100755
--- a/include/cutlass/gemm/threadblock/gemv.h
+++ b/include/cutlass/gemm/threadblock/gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/index_remat.h b/include/cutlass/gemm/threadblock/index_remat.h
index 8370f614..89e4b1af 100644
--- a/include/cutlass/gemm/threadblock/index_remat.h
+++ b/include/cutlass/gemm/threadblock/index_remat.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h
index 16ec6568..2eaa40b7 100644
--- a/include/cutlass/gemm/threadblock/mma_base.h
+++ b/include/cutlass/gemm/threadblock/mma_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_blas3_multistage.h b/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
index 11eb20ad..e94c1de2 100644
--- a/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_blas3_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h b/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
index 11ad5444..1f533dde 100644
--- a/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_layernorm_mainloop_fusion_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h
index ef551317..ed278806 100644
--- a/include/cutlass/gemm/threadblock/mma_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h
index 89681ebc..87ccc0a6 100644
--- a/include/cutlass/gemm/threadblock/mma_pipelined.h
+++ b/include/cutlass/gemm/threadblock/mma_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
index e8616cc9..b0ba5094 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
index b9deb632..22989815 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
index 0e36a6dc..44585961 100644
--- a/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
+++ b/include/cutlass/gemm/threadblock/mma_planar_complex_pipelined.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h
index 31156286..d3b84d63 100644
--- a/include/cutlass/gemm/threadblock/mma_singlestage.h
+++ b/include/cutlass/gemm/threadblock/mma_singlestage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h b/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
index bd793fc8..5174be4b 100644
--- a/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_softmax_mainloop_fusion_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_sparse_base.h b/include/cutlass/gemm/threadblock/mma_sparse_base.h
index bb10c0a8..9e94b0ff 100644
--- a/include/cutlass/gemm/threadblock/mma_sparse_base.h
+++ b/include/cutlass/gemm/threadblock/mma_sparse_base.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
index 8113583d..8bc23c3f 100644
--- a/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_sparse_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h b/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
index fa95dd7d..2fd49a5b 100644
--- a/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
+++ b/include/cutlass/gemm/threadblock/mma_with_reduction_multistage.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
index 1a4948d0..9495d785 100644
--- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h
+++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
index b79e587d..7141a6c5 100644
--- a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
+++ b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
index 92e698f8..067da30b 100644
--- a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
index 22342654..e2cb3f22 100644
--- a/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_sparse_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h
index 3a8cacd3..44d7fe11 100644
--- a/include/cutlass/gemm/warp/default_mma_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
index 67fcde77..8c9abb82 100644
--- a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
+++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h b/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
index db6713cb..7bd8c0fd 100644
--- a/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_with_reduction_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
index 145e4be7..6a90a780 100644
--- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
+++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h b/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
index bbf0090b..f032f26f 100644
--- a/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
+++ b/include/cutlass/gemm/warp/layernorm_scale_bias_transform.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h
index dc210b02..cd677433 100644
--- a/include/cutlass/gemm/warp/mma.h
+++ b/include/cutlass/gemm/warp/mma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h
index 2ef8bb42..baaced7c 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
index d52c5e24..e84ae06c 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_fast_f32.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
index bc51bca0..e14450d3 100644
--- a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
index 5a02417a..6728ac20 100644
--- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
index fe785f8d..ec99c77f 100644
--- a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h b/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
index f553fbde..4e16ff89 100644
--- a/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_mixed_input_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_planar_complex.h b/include/cutlass/gemm/warp/mma_planar_complex.h
index c5dcfb7c..af1031ad 100644
--- a/include/cutlass/gemm/warp/mma_planar_complex.h
+++ b/include/cutlass/gemm/warp/mma_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h
index f5f2f063..c4152da3 100644
--- a/include/cutlass/gemm/warp/mma_simt.h
+++ b/include/cutlass/gemm/warp/mma_simt.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h
index 8da3b9f8..9bca2348 100644
--- a/include/cutlass/gemm/warp/mma_simt_policy.h
+++ b/include/cutlass/gemm/warp/mma_simt_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
index 6b0647ff..c522eafa 100644
--- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
+++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
index 1ce1051c..81668b44 100644
--- a/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_sparse_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h
index d4aaf5be..9c08dd64 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h b/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
index 148e7122..570298bc 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_fast_f32.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
index 32460b62..1489694e 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h
index 0a768cae..febd0e48 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
index c40790fa..e7a4d87f 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
index 4588efb9..6446b7bd 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
index e6e6d70f..dd15097d 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
index bcac335f..f6cc735a 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
index 4ccf0b58..d53d6dfd 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
index c4ed8bc9..97f7e14f 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sparse.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
index 0da043e6..92e065f2 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
index 971ad3b8..ec445443 100644
--- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
+++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
index 67231d35..d97c8f44 100644
--- a/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
+++ b/include/cutlass/gemm/warp/mma_with_reduction_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/scale_bias_tile_iterator.h b/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
index 7d74ac8c..2d79dcf7 100644
--- a/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
+++ b/include/cutlass/gemm/warp/scale_bias_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/softmax_scale_bias_transform.h b/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
index d8d99d67..7e3af9bf 100644
--- a/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
+++ b/include/cutlass/gemm/warp/softmax_scale_bias_transform.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
index 42c6728b..0406db0d 100644
--- a/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
+++ b/include/cutlass/gemm/warp/tile_iterator_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm_coord.h b/include/cutlass/gemm_coord.h
index 61b97a1e..dd826de2 100644
--- a/include/cutlass/gemm_coord.h
+++ b/include/cutlass/gemm_coord.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/gemm_coord.hpp b/include/cutlass/gemm_coord.hpp
index a979241e..a22b8031 100644
--- a/include/cutlass/gemm_coord.hpp
+++ b/include/cutlass/gemm_coord.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/half.h b/include/cutlass/half.h
index a0f39828..f5fb90d2 100644
--- a/include/cutlass/half.h
+++ b/include/cutlass/half.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h
index 27a50fd2..80ddefa1 100644
--- a/include/cutlass/integer_subbyte.h
+++ b/include/cutlass/integer_subbyte.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/kernel_hardware_info.h b/include/cutlass/kernel_hardware_info.h
index 62dcb8b4..1d61904a 100644
--- a/include/cutlass/kernel_hardware_info.h
+++ b/include/cutlass/kernel_hardware_info.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -30,11 +30,13 @@
  **************************************************************************************************/
 #pragma once
 
+#include "cutlass/device_kernel.h"
 #if !defined(__CUDACC_RTC__)
 #include "cuda_runtime.h"
-
+#include "cutlass/cluster_launch.hpp"
 #include "cutlass/trace.h"
 #endif
+#include <cute/int_tuple.hpp>
 
 namespace cutlass {
 
@@ -42,9 +44,13 @@ struct KernelHardwareInfo {
   //
   // Data members
   //
+
+  // Hardware properties
   int device_id = 0;
   int sm_count  = 0;
 
+  // Kernel properties
+  int max_active_clusters = 0;              // Maximum number of clusters that could co-exist on the target device.
   //
   // Methods
   //
@@ -70,6 +76,58 @@ struct KernelHardwareInfo {
     }
     return multiprocessor_count;
   }
+
+  // Query the maximum number of clusters that could co-exist on the target device for a kernel
+  // launched with the given cluster dims and threads per block. Returns 0 if the occupancy query
+  // fails or if CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED is not defined.
+  static inline int
+  query_device_max_active_clusters(
+      dim3 cluster_dims,
+      uint32_t threads_per_block,
+      void const* kernel_ptr) {
+    int max_active_clusters = 0;
+#if defined(CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED)
+    ClusterLauncher::LaunchConfig cluster_launch_config = ClusterLauncher::make_cluster_launch_config(
+                                                            cluster_dims /* minimum grid dim */, cluster_dims, {threads_per_block, 1, 1});
+    // Given the kernel function and launch configuration, return the maximum number of clusters that could co-exist on the target device.
+    cudaError_t result = cudaOccupancyMaxActiveClusters(&max_active_clusters, kernel_ptr, &cluster_launch_config.launch_config);
+    if (result != cudaSuccess) {
+      CUTLASS_TRACE_HOST("  cudaOccupancyMaxActiveClusters() returned error "
+        << cudaGetErrorString(result));
+      return 0;
+    }
+    CUTLASS_TRACE_HOST("cudaOccupancyMaxActiveClusters: maximum number of clusters that could co-exist on the target device = "
+        << max_active_clusters << "\n");
+    return max_active_clusters;
+#else
+    CUTLASS_TRACE_HOST("ClusterLauncher: CUTLASS_SM90_CLUSTER_LAUNCH_ENABLED not defined! Aborting cluster occupancy query.");
+    return max_active_clusters;
+#endif
+  }
+
+  // Simpler version of the above query function that fetches relevant information from the Kernel type
+  template <typename Kernel>
+  static inline int
+  query_device_max_active_clusters() {
+    dim3 cluster_dims(cute::size<0>(typename Kernel::ClusterShape{}),  // cluster extents come from Kernel::ClusterShape
+                      cute::size<1>(typename Kernel::ClusterShape{}),
+                      cute::size<2>(typename Kernel::ClusterShape{}));
+    uint32_t threads_per_block = Kernel::MaxThreadsPerBlock;
+    void const* kernel_ptr = (void*)(device_kernel<Kernel>);  // device_kernel<Kernel> is the function passed to the occupancy query
+    return query_device_max_active_clusters(cluster_dims, threads_per_block, kernel_ptr);
+  }
+
+  template <typename Kernel>
+  static inline KernelHardwareInfo
+  make_kernel_hardware_info(int const device_id = 0, int sm_count = 0, int max_active_clusters = 0) {
+    if (sm_count == 0) {  // 0 is treated as "not provided": query the multiprocessor count of device_id
+      sm_count = query_device_multiprocessor_count(device_id);
+    }
+    if (max_active_clusters == 0) {  // 0 is treated as "not provided": run the cluster occupancy query for Kernel
+      max_active_clusters = query_device_max_active_clusters<Kernel>();
+    }
+    return {device_id, sm_count, max_active_clusters};  // initializer order matches the data members: device_id, sm_count, max_active_clusters
+  }
 #endif
 };
 
diff --git a/include/cutlass/kernel_hardware_info.hpp b/include/cutlass/kernel_hardware_info.hpp
index 876aacc6..e1758eac 100644
--- a/include/cutlass/kernel_hardware_info.hpp
+++ b/include/cutlass/kernel_hardware_info.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h
index 4cd087a3..e92e6c13 100644
--- a/include/cutlass/kernel_launch.h
+++ b/include/cutlass/kernel_launch.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h
index 1089add3..b2e377c2 100644
--- a/include/cutlass/layout/layout.h
+++ b/include/cutlass/layout/layout.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h
index 32aa17a5..281b668b 100644
--- a/include/cutlass/layout/matrix.h
+++ b/include/cutlass/layout/matrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/permute.h b/include/cutlass/layout/permute.h
index 13e5ef22..32a6ee0d 100644
--- a/include/cutlass/layout/permute.h
+++ b/include/cutlass/layout/permute.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h
index 8c9540f4..7052de14 100644
--- a/include/cutlass/layout/pitch_linear.h
+++ b/include/cutlass/layout/pitch_linear.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h
index d296f1d0..91e4a9ef 100644
--- a/include/cutlass/layout/tensor.h
+++ b/include/cutlass/layout/tensor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h
index b260942a..e4d25a51 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h
index 1cda4428..6ca60055 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h
index 15d52839..e3104906 100644
--- a/include/cutlass/layout/tensor_op_multiplicand_sm80.h
+++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h
index 56506fea..6cb74f35 100644
--- a/include/cutlass/layout/vector.h
+++ b/include/cutlass/layout/vector.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/matrix.h b/include/cutlass/matrix.h
index 5d8ccb3c..b46cbfec 100644
--- a/include/cutlass/matrix.h
+++ b/include/cutlass/matrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h
index 719575d5..85d447b1 100644
--- a/include/cutlass/matrix_coord.h
+++ b/include/cutlass/matrix_coord.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h
index 66623a43..20d668b2 100644
--- a/include/cutlass/matrix_shape.h
+++ b/include/cutlass/matrix_shape.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h
index 298163d8..d708fd7a 100644
--- a/include/cutlass/numeric_conversion.h
+++ b/include/cutlass/numeric_conversion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/numeric_size.h b/include/cutlass/numeric_size.h
index 98fd77c3..4f267e51 100644
--- a/include/cutlass/numeric_size.h
+++ b/include/cutlass/numeric_size.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h
index ca37896b..e5fa5f9c 100644
--- a/include/cutlass/numeric_types.h
+++ b/include/cutlass/numeric_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/pipeline/pipeline.hpp b/include/cutlass/pipeline/pipeline.hpp
index 0b561797..040ecee3 100644
--- a/include/cutlass/pipeline/pipeline.hpp
+++ b/include/cutlass/pipeline/pipeline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp
index 381834ec..58f49c36 100644
--- a/include/cutlass/pipeline/sm90_pipeline.hpp
+++ b/include/cutlass/pipeline/sm90_pipeline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/pitch_linear_coord.h b/include/cutlass/pitch_linear_coord.h
index 475229a2..1b782ece 100644
--- a/include/cutlass/pitch_linear_coord.h
+++ b/include/cutlass/pitch_linear_coord.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h
index 9ee0a7a0..939451a2 100644
--- a/include/cutlass/platform/platform.h
+++ b/include/cutlass/platform/platform.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h
index e8781562..0241a6fd 100644
--- a/include/cutlass/predicate_vector.h
+++ b/include/cutlass/predicate_vector.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/quaternion.h b/include/cutlass/quaternion.h
index b31df455..48ca3628 100644
--- a/include/cutlass/quaternion.h
+++ b/include/cutlass/quaternion.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/real.h b/include/cutlass/real.h
index 95a22444..cfca3866 100644
--- a/include/cutlass/real.h
+++ b/include/cutlass/real.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h
index 0b8ac7a5..92b57aae 100644
--- a/include/cutlass/reduction/device/reduce_split_k.h
+++ b/include/cutlass/reduction/device/reduce_split_k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/device/tensor_reduce.h b/include/cutlass/reduction/device/tensor_reduce.h
index f36c72c9..26a0249e 100644
--- a/include/cutlass/reduction/device/tensor_reduce.h
+++ b/include/cutlass/reduction/device/tensor_reduce.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h b/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
index 8d71aa9d..c00c3681 100644
--- a/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
+++ b/include/cutlass/reduction/device/tensor_reduce_affine_contiguous.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/device/tensor_reduce_affine_strided.h b/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
index 5ec7e654..c85d6dcb 100644
--- a/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
+++ b/include/cutlass/reduction/device/tensor_reduce_affine_strided.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/kernel/reduce_softmax_final.h b/include/cutlass/reduction/kernel/reduce_softmax_final.h
index 9752b9b7..3d39dc75 100644
--- a/include/cutlass/reduction/kernel/reduce_softmax_final.h
+++ b/include/cutlass/reduction/kernel/reduce_softmax_final.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h
index d9c70139..f6d26666 100644
--- a/include/cutlass/reduction/kernel/reduce_split_k.h
+++ b/include/cutlass/reduction/kernel/reduce_split_k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
index bffc956f..914bbddd 100644
--- a/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
+++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_contiguous.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
index 0d449e68..0538184f 100644
--- a/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
+++ b/include/cutlass/reduction/kernel/tensor_reduce_affine_strided.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h
index d2551f97..cc354df5 100644
--- a/include/cutlass/reduction/thread/reduce.h
+++ b/include/cutlass/reduction/thread/reduce.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h
index 8423c2d9..3792d332 100644
--- a/include/cutlass/reduction/thread/reduction_operators.h
+++ b/include/cutlass/reduction/thread/reduction_operators.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h
index ffb35dad..bbabaed2 100644
--- a/include/cutlass/reduction/threadblock_swizzle.h
+++ b/include/cutlass/reduction/threadblock_swizzle.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h
index 26b7c66b..779c1552 100644
--- a/include/cutlass/relatively_equal.h
+++ b/include/cutlass/relatively_equal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h
index efcd9211..09a0a1a4 100644
--- a/include/cutlass/semaphore.h
+++ b/include/cutlass/semaphore.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h
index 8d43f503..6e98cdc3 100644
--- a/include/cutlass/subbyte_reference.h
+++ b/include/cutlass/subbyte_reference.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h
index 982ec4e0..a124d395 100644
--- a/include/cutlass/tensor_coord.h
+++ b/include/cutlass/tensor_coord.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h
index 1191f651..fc467499 100644
--- a/include/cutlass/tensor_ref.h
+++ b/include/cutlass/tensor_ref.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tensor_ref_planar_complex.h b/include/cutlass/tensor_ref_planar_complex.h
index ab354bba..9ba3a230 100644
--- a/include/cutlass/tensor_ref_planar_complex.h
+++ b/include/cutlass/tensor_ref_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h
index 7defcc24..d669443a 100644
--- a/include/cutlass/tensor_view.h
+++ b/include/cutlass/tensor_view.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tensor_view_planar_complex.h b/include/cutlass/tensor_view_planar_complex.h
index af63f80c..6b8f7b47 100644
--- a/include/cutlass/tensor_view_planar_complex.h
+++ b/include/cutlass/tensor_view_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h
index d6d265a4..7bc13e17 100644
--- a/include/cutlass/tfloat32.h
+++ b/include/cutlass/tfloat32.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h
index f6b4b2b7..c3383061 100644
--- a/include/cutlass/thread/matrix.h
+++ b/include/cutlass/thread/matrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/trace.h b/include/cutlass/trace.h
index 1b0c5112..803c72ec 100644
--- a/include/cutlass/trace.h
+++ b/include/cutlass/trace.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp b/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
index 430545e6..99c5bf70 100644
--- a/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
+++ b/include/cutlass/transform/collective/sm90_wgmma_transpose.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/device/transform_universal_adapter.hpp b/include/cutlass/transform/device/transform_universal_adapter.hpp
index a5033d80..265d2fe4 100644
--- a/include/cutlass/transform/device/transform_universal_adapter.hpp
+++ b/include/cutlass/transform/device/transform_universal_adapter.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/kernel/filter_format_transformer.hpp b/include/cutlass/transform/kernel/filter_format_transformer.hpp
index 9f54c93f..9c9d7589 100644
--- a/include/cutlass/transform/kernel/filter_format_transformer.hpp
+++ b/include/cutlass/transform/kernel/filter_format_transformer.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp b/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
index dd4fa0c1..38a39740 100644
--- a/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
+++ b/include/cutlass/transform/kernel/sm90_sparse_gemm_compressor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp b/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
index 51f42e9f..d2679b06 100644
--- a/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
+++ b/include/cutlass/transform/kernel/sparse_gemm_compressor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h
index 0fcb48e5..6a8970e8 100644
--- a/include/cutlass/transform/pitch_linear_thread_map.h
+++ b/include/cutlass/transform/pitch_linear_thread_map.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/thread/transpose.h b/include/cutlass/transform/thread/transpose.h
index 4d0b3907..508cad84 100644
--- a/include/cutlass/transform/thread/transpose.h
+++ b/include/cutlass/transform/thread/transpose.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/thread/unary_op.h b/include/cutlass/transform/thread/unary_op.h
index ce7cbbe8..3977af52 100644
--- a/include/cutlass/transform/thread/unary_op.h
+++ b/include/cutlass/transform/thread/unary_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/ell_iterator.h b/include/cutlass/transform/threadblock/ell_iterator.h
index 026e4ced..bd717d67 100644
--- a/include/cutlass/transform/threadblock/ell_iterator.h
+++ b/include/cutlass/transform/threadblock/ell_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
index 2e9e3716..3676c233 100644
--- a/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
+++ b/include/cutlass/transform/threadblock/ell_predicated_tile_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h b/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
index 7c1b27b3..48fb983f 100644
--- a/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
+++ b/include/cutlass/transform/threadblock/ell_predicated_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h b/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
index 366897c6..dab597c8 100644
--- a/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_scale_bias_vector_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h b/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
index 54b0ecf5..e5d9e70d 100644
--- a/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_scale_bias_vector_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
index a99dae95..e5c2a5f0 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
index 4379bb0a..93eac72e 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_2dthreadtile.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
index c67af387..5e509a34 100755
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_params.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
index 24498843..f657fe25 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator_triangular_matrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_iterator.h
index bdfb33fe..01bfd70a 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
index 422ac45c..4a130b92 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
index 8fea9ae0..3acc31ff 100644
--- a/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
+++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_triangular_matrix.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h b/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
index 391f94b9..df551c13 100644
--- a/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
+++ b/include/cutlass/transform/threadblock/predicated_vector_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h b/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
index f5906d82..1aae4698 100644
--- a/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
+++ b/include/cutlass/transform/threadblock/regular_scale_bias_vector_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
index d0992d44..cfb491b5 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
index fa02b008..adda9339 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
index a7b57bbe..71c89686 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear_direct_conv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
index 96e3ee84..e172447f 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
index b424af44..b55f841e 100644
--- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
+++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h
index d09c2389..be07e43f 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
index 1e04c426..6c186ce3 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
index 7fd49598..5ed2e7fd 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
index 1308f45e..723f328d 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
index 81b774cf..53121c61 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/threadblock/vector_iterator.h b/include/cutlass/transform/threadblock/vector_iterator.h
index f78e5e86..8e5d181c 100644
--- a/include/cutlass/transform/threadblock/vector_iterator.h
+++ b/include/cutlass/transform/threadblock/vector_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/transform/warp/vector_fragment_iterator.h b/include/cutlass/transform/warp/vector_fragment_iterator.h
index b8bfa57f..707cbcc8 100644
--- a/include/cutlass/transform/warp/vector_fragment_iterator.h
+++ b/include/cutlass/transform/warp/vector_fragment_iterator.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/uint128.h b/include/cutlass/uint128.h
index 6de3ba14..295eaa68 100644
--- a/include/cutlass/uint128.h
+++ b/include/cutlass/uint128.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/version.h b/include/cutlass/version.h
index ff9aa115..984d39d1 100644
--- a/include/cutlass/version.h
+++ b/include/cutlass/version.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -35,7 +35,7 @@
 #include <string>
 
 #define CUTLASS_MAJOR 3
-#define CUTLASS_MINOR 6
+#define CUTLASS_MINOR 7
 #define CUTLASS_PATCH 0
 
 #ifdef CUTLASS_VERSIONS_GENERATED
diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h
index 0f9b2b51..77929f60 100644
--- a/include/cutlass/wmma_array.h
+++ b/include/cutlass/wmma_array.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/include/cutlass/workspace.h b/include/cutlass/workspace.h
index 6f1c3254..11a21a39 100644
--- a/include/cutlass/workspace.h
+++ b/include/cutlass/workspace.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md
index b446091e..fff1ce9c 100644
--- a/media/docs/code_organization.md
+++ b/media/docs/code_organization.md
@@ -238,7 +238,7 @@ of tests run may vary over time as more are added.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/cutlass_3x_backwards_compatibility.md b/media/docs/cutlass_3x_backwards_compatibility.md
index 024e66f4..85eca7d6 100644
--- a/media/docs/cutlass_3x_backwards_compatibility.md
+++ b/media/docs/cutlass_3x_backwards_compatibility.md
@@ -442,7 +442,7 @@ as the mappings are not always bijective.
 
 # Copyright
 
-Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md
index 17e4b159..1ff521ac 100644
--- a/media/docs/doxygen_mainpage.md
+++ b/media/docs/doxygen_mainpage.md
@@ -38,7 +38,7 @@ has a variety of examples.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md
index 33df8608..4defa6d8 100644
--- a/media/docs/efficient_gemm.md
+++ b/media/docs/efficient_gemm.md
@@ -257,7 +257,7 @@ targeting NVIDIA GPUs.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/functionality.md b/media/docs/functionality.md
index a761284e..274bba62 100644
--- a/media/docs/functionality.md
+++ b/media/docs/functionality.md
@@ -281,7 +281,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md
index 8bef0702..311a80e6 100644
--- a/media/docs/fundamental_types.md
+++ b/media/docs/fundamental_types.md
@@ -345,7 +345,7 @@ support on current and future NVIDIA GPUs.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md
index f942933a..e2aaaccb 100644
--- a/media/docs/gemm_api.md
+++ b/media/docs/gemm_api.md
@@ -542,7 +542,7 @@ to inline PTX.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/gemm_api_3x.md b/media/docs/gemm_api_3x.md
index 5956e6e1..ab6e6e09 100644
--- a/media/docs/gemm_api_3x.md
+++ b/media/docs/gemm_api_3x.md
@@ -128,6 +128,7 @@ using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder
 
 // Step 2: Specify the collective layer epilogue type
 using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+    ElementC,
     cutlass::gemm::TagToStrideC_t<LayoutC>,
     cutlass::gemm::TagToStrideC_t<LayoutC>,
     cutlass::epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>>;
@@ -673,7 +674,7 @@ please refer to CuTe's tutorial, e.g., the sections on
 
 # Copyright
 
-Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/implicit_gemm_convolution.md b/media/docs/implicit_gemm_convolution.md
index 9b4df009..9b00cfc2 100644
--- a/media/docs/implicit_gemm_convolution.md
+++ b/media/docs/implicit_gemm_convolution.md
@@ -761,7 +761,7 @@ Convolution can also be run by the CUTLASS Profiler.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/layout.md b/media/docs/layout.md
index 74c1909e..bd544c0a 100644
--- a/media/docs/layout.md
+++ b/media/docs/layout.md
@@ -272,7 +272,7 @@ Permuted Shared Memory Layouts:
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/pipeline.md b/media/docs/pipeline.md
index 4b9bfd47..1a8b551a 100644
--- a/media/docs/pipeline.md
+++ b/media/docs/pipeline.md
@@ -179,7 +179,7 @@ for more details.
 
 # Copyright
 
-Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/profiler.md b/media/docs/profiler.md
index 80c855e8..846cfb54 100644
--- a/media/docs/profiler.md
+++ b/media/docs/profiler.md
@@ -159,9 +159,17 @@ Profiling:
                                                  capacity of the last-level cache.
 
   --profiling-iterations=<iterations>              Number of iterations to profile each kernel. If zero, kernels
-                                                   are launched up to the profiling duration.
+                                                   are launched up to the profiling duration. If non-zero, this
+                                                   overrides `profiling-duration` and `min-iterations`.
 
-  --warmup-iterations=<iterations>                 Number of iterations to execute each kernel prior to profiling.
+  --profiling-duration=<duration>                  Time to spend profiling each kernel (ms). Overridden by
+                                                   `profiling-iterations` when `profiling-iterations` != 0.
+                                                   Note that `min-iterations` must also be satisfied.
+
+  --min-iterations=<iterations>                    Minimum number of iterations to spend profiling each kernel, even if
+                                                   `profiling-duration` has been met.
+
+  --warmup-iterations=<iterations>                 Number of iterations to execute each kernel prior to profiling (default: 10).
 
   --sleep-duration=<duration>                      Number of ms to sleep between profiling periods (ms).
 
@@ -624,7 +632,7 @@ reference_device: Passed
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md
index d6d5e16b..d7d601a2 100644
--- a/media/docs/programming_guidelines.md
+++ b/media/docs/programming_guidelines.md
@@ -1161,7 +1161,7 @@ However, A is certainly M major if interpreted as a matrix.
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md
index 29e5a0f6..a217e0e7 100644
--- a/media/docs/quickstart.md
+++ b/media/docs/quickstart.md
@@ -655,7 +655,7 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS='70;75;80' -DCUTLASS_LIBRARY_KERNELS=tensorop*s*
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/terminology.md b/media/docs/terminology.md
index d58884b9..f4e3a9d7 100644
--- a/media/docs/terminology.md
+++ b/media/docs/terminology.md
@@ -82,7 +82,7 @@ replaced by [MMA and Copy atoms from CuTe](/media/docs/cute/0t_mma_atom.md).
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md
index 897db653..f8db020d 100644
--- a/media/docs/tile_iterator_concept.md
+++ b/media/docs/tile_iterator_concept.md
@@ -473,7 +473,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept {
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/media/docs/utilities.md b/media/docs/utilities.md
index b179f2fa..e8e1b98e 100644
--- a/media/docs/utilities.md
+++ b/media/docs/utilities.md
@@ -434,7 +434,7 @@ Please note that `synclog` is an experimental feature, and its functionality is
 
 # Copyright
 
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/pyproject.toml b/pyproject.toml
index ef8f1db2..ffb66b27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "nvidia-cutlass"
-version = "3.6.0.0"
+version = "3.7.0.0"
 description = "CUTLASS"
 readme = "README.md"
 requires-python = ">=3.8"
diff --git a/python/LICENSE.txt b/python/LICENSE.txt
index 52550084..47016fa7 100644
--- a/python/LICENSE.txt
+++ b/python/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 Redistribution and use in source and binary forms, with or without
diff --git a/python/README.md b/python/README.md
index 4c89ec89..3fb36f03 100644
--- a/python/README.md
+++ b/python/README.md
@@ -181,7 +181,7 @@ You can also use the [generator.py](/python/cutlass_library/generator.py) script
 
 # Copyright
 
-Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 
 ```
diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py
index fad27837..81bb8cfb 100644
--- a/python/cutlass/__init__.py
+++ b/python/cutlass/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -134,7 +134,7 @@ def get_option_registry():
         this._option_registry = OptionRegistry(device_cc())
     return this._option_registry
 
-this.__version__ = '3.6.0'
+this.__version__ = '3.7.0'
 
 from cutlass.backend import create_memory_pool
 from cutlass.emit.pytorch import pytorch
diff --git a/python/cutlass/backend/__init__.py b/python/cutlass/backend/__init__.py
index 9a4e2f67..1011cd22 100644
--- a/python/cutlass/backend/__init__.py
+++ b/python/cutlass/backend/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/arguments.py b/python/cutlass/backend/arguments.py
index b91cdf1f..eb31b762 100644
--- a/python/cutlass/backend/arguments.py
+++ b/python/cutlass/backend/arguments.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/c_types.py b/python/cutlass/backend/c_types.py
index 95e264cd..83a80e81 100644
--- a/python/cutlass/backend/c_types.py
+++ b/python/cutlass/backend/c_types.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py
index 2c38397d..43750d45 100644
--- a/python/cutlass/backend/compiler.py
+++ b/python/cutlass/backend/compiler.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/conv2d_operation.py b/python/cutlass/backend/conv2d_operation.py
index faefd135..bf6e5754 100644
--- a/python/cutlass/backend/conv2d_operation.py
+++ b/python/cutlass/backend/conv2d_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/epilogue.py b/python/cutlass/backend/epilogue.py
index 48366a76..a9cf6be7 100644
--- a/python/cutlass/backend/epilogue.py
+++ b/python/cutlass/backend/epilogue.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/__init__.py b/python/cutlass/backend/evt/__init__.py
index a7cad2ea..35ce4aa3 100644
--- a/python/cutlass/backend/evt/__init__.py
+++ b/python/cutlass/backend/evt/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/__init__.py b/python/cutlass/backend/evt/backend/__init__.py
index 06c1545c..bb7c0834 100644
--- a/python/cutlass/backend/evt/backend/__init__.py
+++ b/python/cutlass/backend/evt/backend/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/emitter_base.py b/python/cutlass/backend/evt/backend/emitter_base.py
index dd98af63..738dcf46 100644
--- a/python/cutlass/backend/evt/backend/emitter_base.py
+++ b/python/cutlass/backend/evt/backend/emitter_base.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/sm80_emitter.py b/python/cutlass/backend/evt/backend/sm80_emitter.py
index f8d8000c..a22e3379 100644
--- a/python/cutlass/backend/evt/backend/sm80_emitter.py
+++ b/python/cutlass/backend/evt/backend/sm80_emitter.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/sm80_nodes.py b/python/cutlass/backend/evt/backend/sm80_nodes.py
index ec915fd3..aafc38e2 100644
--- a/python/cutlass/backend/evt/backend/sm80_nodes.py
+++ b/python/cutlass/backend/evt/backend/sm80_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/sm90_emitter.py b/python/cutlass/backend/evt/backend/sm90_emitter.py
index f9d9afc6..3d5b5046 100644
--- a/python/cutlass/backend/evt/backend/sm90_emitter.py
+++ b/python/cutlass/backend/evt/backend/sm90_emitter.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/backend/sm90_nodes.py b/python/cutlass/backend/evt/backend/sm90_nodes.py
index acdc4f47..62ad5004 100644
--- a/python/cutlass/backend/evt/backend/sm90_nodes.py
+++ b/python/cutlass/backend/evt/backend/sm90_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/epilogue.py b/python/cutlass/backend/evt/epilogue.py
index c0c780be..85c11bea 100644
--- a/python/cutlass/backend/evt/epilogue.py
+++ b/python/cutlass/backend/evt/epilogue.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/frontend/__init__.py b/python/cutlass/backend/evt/frontend/__init__.py
index 5abb473e..f2cd3c97 100644
--- a/python/cutlass/backend/evt/frontend/__init__.py
+++ b/python/cutlass/backend/evt/frontend/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/frontend/frontend_base.py b/python/cutlass/backend/evt/frontend/frontend_base.py
index 5c63c141..4cc1edf0 100644
--- a/python/cutlass/backend/evt/frontend/frontend_base.py
+++ b/python/cutlass/backend/evt/frontend/frontend_base.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/frontend/python_ast.py b/python/cutlass/backend/evt/frontend/python_ast.py
index 3f334854..0af934a6 100644
--- a/python/cutlass/backend/evt/frontend/python_ast.py
+++ b/python/cutlass/backend/evt/frontend/python_ast.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/__init__.py b/python/cutlass/backend/evt/ir/__init__.py
index 83412fd1..5d55adea 100644
--- a/python/cutlass/backend/evt/ir/__init__.py
+++ b/python/cutlass/backend/evt/ir/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/compute_nodes.py b/python/cutlass/backend/evt/ir/compute_nodes.py
index 783d7cf1..6c9f51b2 100644
--- a/python/cutlass/backend/evt/ir/compute_nodes.py
+++ b/python/cutlass/backend/evt/ir/compute_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/dag_ir.py b/python/cutlass/backend/evt/ir/dag_ir.py
index b837a552..ce8c3d64 100644
--- a/python/cutlass/backend/evt/ir/dag_ir.py
+++ b/python/cutlass/backend/evt/ir/dag_ir.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/layout_algorithm.py b/python/cutlass/backend/evt/ir/layout_algorithm.py
index dd990303..9d453b1f 100644
--- a/python/cutlass/backend/evt/ir/layout_algorithm.py
+++ b/python/cutlass/backend/evt/ir/layout_algorithm.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/layout_nodes.py b/python/cutlass/backend/evt/ir/layout_nodes.py
index 961189fc..81ddf094 100644
--- a/python/cutlass/backend/evt/ir/layout_nodes.py
+++ b/python/cutlass/backend/evt/ir/layout_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/load_nodes.py b/python/cutlass/backend/evt/ir/load_nodes.py
index 4c6c4b8b..73bf9825 100644
--- a/python/cutlass/backend/evt/ir/load_nodes.py
+++ b/python/cutlass/backend/evt/ir/load_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/node.py b/python/cutlass/backend/evt/ir/node.py
index 87cca2f2..b5d4fdd1 100644
--- a/python/cutlass/backend/evt/ir/node.py
+++ b/python/cutlass/backend/evt/ir/node.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/store_nodes.py b/python/cutlass/backend/evt/ir/store_nodes.py
index 4990e87a..a3f06645 100644
--- a/python/cutlass/backend/evt/ir/store_nodes.py
+++ b/python/cutlass/backend/evt/ir/store_nodes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/ir/tensor.py b/python/cutlass/backend/evt/ir/tensor.py
index 18641e0e..b8d1bbe0 100644
--- a/python/cutlass/backend/evt/ir/tensor.py
+++ b/python/cutlass/backend/evt/ir/tensor.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/__init__.py b/python/cutlass/backend/evt/passes/__init__.py
index 4bd8180c..c2998397 100644
--- a/python/cutlass/backend/evt/passes/__init__.py
+++ b/python/cutlass/backend/evt/passes/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/graph_drawer.py b/python/cutlass/backend/evt/passes/graph_drawer.py
index a2a73640..4e1e094e 100644
--- a/python/cutlass/backend/evt/passes/graph_drawer.py
+++ b/python/cutlass/backend/evt/passes/graph_drawer.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_argument_type.py b/python/cutlass/backend/evt/passes/pass_argument_type.py
index 11cf1fe2..0c5cc1d2 100644
--- a/python/cutlass/backend/evt/passes/pass_argument_type.py
+++ b/python/cutlass/backend/evt/passes/pass_argument_type.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_dag_2_tree.py b/python/cutlass/backend/evt/passes/pass_dag_2_tree.py
index 9a1cb851..5783e9b0 100644
--- a/python/cutlass/backend/evt/passes/pass_dag_2_tree.py
+++ b/python/cutlass/backend/evt/passes/pass_dag_2_tree.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_fix_element_d.py b/python/cutlass/backend/evt/passes/pass_fix_element_d.py
index 7cc5f70f..3ef697ca 100644
--- a/python/cutlass/backend/evt/passes/pass_fix_element_d.py
+++ b/python/cutlass/backend/evt/passes/pass_fix_element_d.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_get_impl.py b/python/cutlass/backend/evt/passes/pass_get_impl.py
index 47f39035..a883e9ff 100644
--- a/python/cutlass/backend/evt/passes/pass_get_impl.py
+++ b/python/cutlass/backend/evt/passes/pass_get_impl.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_layout_elimination.py b/python/cutlass/backend/evt/passes/pass_layout_elimination.py
index be57deb2..48c5d295 100644
--- a/python/cutlass/backend/evt/passes/pass_layout_elimination.py
+++ b/python/cutlass/backend/evt/passes/pass_layout_elimination.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_manager.py b/python/cutlass/backend/evt/passes/pass_manager.py
index 0b0b6bb9..e5b94048 100644
--- a/python/cutlass/backend/evt/passes/pass_manager.py
+++ b/python/cutlass/backend/evt/passes/pass_manager.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_no_op_elimination.py b/python/cutlass/backend/evt/passes/pass_no_op_elimination.py
index 11884262..148f87f8 100644
--- a/python/cutlass/backend/evt/passes/pass_no_op_elimination.py
+++ b/python/cutlass/backend/evt/passes/pass_no_op_elimination.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_preprocess_red.py b/python/cutlass/backend/evt/passes/pass_preprocess_red.py
index 3c8b7a72..9a342636 100644
--- a/python/cutlass/backend/evt/passes/pass_preprocess_red.py
+++ b/python/cutlass/backend/evt/passes/pass_preprocess_red.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/pass_shape_type_propagation.py b/python/cutlass/backend/evt/passes/pass_shape_type_propagation.py
index b0e93219..fc493626 100644
--- a/python/cutlass/backend/evt/passes/pass_shape_type_propagation.py
+++ b/python/cutlass/backend/evt/passes/pass_shape_type_propagation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/smem_size_calculator.py b/python/cutlass/backend/evt/passes/smem_size_calculator.py
index bcc94884..d28bf3a0 100644
--- a/python/cutlass/backend/evt/passes/smem_size_calculator.py
+++ b/python/cutlass/backend/evt/passes/smem_size_calculator.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/evt/passes/util.py b/python/cutlass/backend/evt/passes/util.py
index 59b4ebcd..ad014bf5 100644
--- a/python/cutlass/backend/evt/passes/util.py
+++ b/python/cutlass/backend/evt/passes/util.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/frontend.py b/python/cutlass/backend/frontend.py
index 2b907cc7..fe05582d 100644
--- a/python/cutlass/backend/frontend.py
+++ b/python/cutlass/backend/frontend.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/gemm_operation.py b/python/cutlass/backend/gemm_operation.py
index 62ac6c27..f9d64148 100644
--- a/python/cutlass/backend/gemm_operation.py
+++ b/python/cutlass/backend/gemm_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/library.py b/python/cutlass/backend/library.py
index 49cb537a..4e0812c4 100644
--- a/python/cutlass/backend/library.py
+++ b/python/cutlass/backend/library.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/memory_manager.py b/python/cutlass/backend/memory_manager.py
index 89e69083..414af64d 100644
--- a/python/cutlass/backend/memory_manager.py
+++ b/python/cutlass/backend/memory_manager.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/operation.py b/python/cutlass/backend/operation.py
index a73cef68..7694941c 100644
--- a/python/cutlass/backend/operation.py
+++ b/python/cutlass/backend/operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/reduction_operation.py b/python/cutlass/backend/reduction_operation.py
index 7b65a876..3aec9765 100644
--- a/python/cutlass/backend/reduction_operation.py
+++ b/python/cutlass/backend/reduction_operation.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/type_hint.py b/python/cutlass/backend/type_hint.py
index 9a3caabf..fffa0336 100644
--- a/python/cutlass/backend/type_hint.py
+++ b/python/cutlass/backend/type_hint.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/utils/__init__.py b/python/cutlass/backend/utils/__init__.py
index 43ebe696..638a97b1 100644
--- a/python/cutlass/backend/utils/__init__.py
+++ b/python/cutlass/backend/utils/__init__.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/backend/utils/device.py b/python/cutlass/backend/utils/device.py
index 7ccf6ee9..16c865b4 100644
--- a/python/cutlass/backend/utils/device.py
+++ b/python/cutlass/backend/utils/device.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/emit/__init__.py b/python/cutlass/emit/__init__.py
index 145960d0..e1026558 100644
--- a/python/cutlass/emit/__init__.py
+++ b/python/cutlass/emit/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/emit/common.py b/python/cutlass/emit/common.py
index 87025eea..4d9b8763 100644
--- a/python/cutlass/emit/common.py
+++ b/python/cutlass/emit/common.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/emit/pytorch.py b/python/cutlass/emit/pytorch.py
index 8c10f87a..e759596c 100644
--- a/python/cutlass/emit/pytorch.py
+++ b/python/cutlass/emit/pytorch.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/epilogue/__init__.py b/python/cutlass/epilogue/__init__.py
index 423decce..3646d9b1 100644
--- a/python/cutlass/epilogue/__init__.py
+++ b/python/cutlass/epilogue/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/epilogue/epilogue.py b/python/cutlass/epilogue/epilogue.py
index 3f512d24..b1dcfa4f 100644
--- a/python/cutlass/epilogue/epilogue.py
+++ b/python/cutlass/epilogue/epilogue.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/epilogue/evt_ops.py b/python/cutlass/epilogue/evt_ops.py
index 153b937e..aa4ec292 100644
--- a/python/cutlass/epilogue/evt_ops.py
+++ b/python/cutlass/epilogue/evt_ops.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/library_defaults.py b/python/cutlass/library_defaults.py
index dc67c1bd..f2c18c86 100644
--- a/python/cutlass/library_defaults.py
+++ b/python/cutlass/library_defaults.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/op/__init__.py b/python/cutlass/op/__init__.py
index c0f3c827..5332556c 100644
--- a/python/cutlass/op/__init__.py
+++ b/python/cutlass/op/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py
index 6de93551..c9fd8f9a 100644
--- a/python/cutlass/op/conv.py
+++ b/python/cutlass/op/conv.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/op/gemm.py b/python/cutlass/op/gemm.py
index e74c4078..9d4518e7 100644
--- a/python/cutlass/op/gemm.py
+++ b/python/cutlass/op/gemm.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/op/gemm_grouped.py b/python/cutlass/op/gemm_grouped.py
index dbbb21f6..c68747bc 100644
--- a/python/cutlass/op/gemm_grouped.py
+++ b/python/cutlass/op/gemm_grouped.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/op/op.py b/python/cutlass/op/op.py
index 69eb71c6..444df8b9 100644
--- a/python/cutlass/op/op.py
+++ b/python/cutlass/op/op.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/shape.py b/python/cutlass/shape.py
index 37341463..0987899a 100644
--- a/python/cutlass/shape.py
+++ b/python/cutlass/shape.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/swizzle.py b/python/cutlass/swizzle.py
index b678910f..ffd94834 100644
--- a/python/cutlass/swizzle.py
+++ b/python/cutlass/swizzle.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/utils/__init__.py b/python/cutlass/utils/__init__.py
index 367fd67b..21658035 100644
--- a/python/cutlass/utils/__init__.py
+++ b/python/cutlass/utils/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/utils/check.py b/python/cutlass/utils/check.py
index 2a37b72c..7cc004ec 100644
--- a/python/cutlass/utils/check.py
+++ b/python/cutlass/utils/check.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/utils/datatypes.py b/python/cutlass/utils/datatypes.py
index 8ef50ad8..75beda65 100644
--- a/python/cutlass/utils/datatypes.py
+++ b/python/cutlass/utils/datatypes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass/utils/profiler.py b/python/cutlass/utils/profiler.py
index 82e414c0..87369670 100644
--- a/python/cutlass/utils/profiler.py
+++ b/python/cutlass/utils/profiler.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/__init__.py b/python/cutlass_library/__init__.py
index 57885061..d164768c 100644
--- a/python/cutlass_library/__init__.py
+++ b/python/cutlass_library/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/conv2d_operation.py b/python/cutlass_library/conv2d_operation.py
index 1cfe7f6e..b674463a 100644
--- a/python/cutlass_library/conv2d_operation.py
+++ b/python/cutlass_library/conv2d_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/conv3d_operation.py b/python/cutlass_library/conv3d_operation.py
index bb9f99e4..b96b6db7 100644
--- a/python/cutlass_library/conv3d_operation.py
+++ b/python/cutlass_library/conv3d_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/conv3x_emitter.py b/python/cutlass_library/conv3x_emitter.py
index 29bc4a8f..46cb56d0 100644
--- a/python/cutlass_library/conv3x_emitter.py
+++ b/python/cutlass_library/conv3x_emitter.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py
index 62a5474a..6ae493b9 100644
--- a/python/cutlass_library/gemm_operation.py
+++ b/python/cutlass_library/gemm_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py
index bd06a801..3fa49eae 100644
--- a/python/cutlass_library/generator.py
+++ b/python/cutlass_library/generator.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -5203,7 +5203,7 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments.
@@ -5268,7 +5268,7 @@ def GenerateSM90_TensorOp_16b_WGMMA_alignx_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9992)
   is_aligned = False
 
   # layouts for ABC and their alignments.
@@ -5329,7 +5329,7 @@ def GenerateSM90_SparseTensorOp_16b_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=131, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments.
@@ -5394,7 +5394,7 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
@@ -5452,7 +5452,7 @@ def GenerateSM90_TensorOp_tf32_WGMMA_alignx_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=101, exhaustive_level=9992)
   is_aligned = False
 
   # layouts for ABC and their alignments.
@@ -5509,7 +5509,7 @@ def GenerateSM90_SparseTensorOp_tf32_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=120, default_level=121, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
@@ -5564,7 +5564,7 @@ def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
@@ -5616,7 +5616,7 @@ def GenerateSM90_TensorOp_int8_WGMMA_alignx_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
   is_aligned = False
 
   # layouts for ABC and their alignments
@@ -5668,7 +5668,7 @@ def GenerateSM90_SparseTensorOp_int8_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=100, default_level=111, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
@@ -5723,7 +5723,7 @@ def GenerateSM90_TensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
@@ -5789,7 +5789,7 @@ def GenerateSM90_TensorOp_fp8_WGMMA_alignx_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 0):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=0, default_level=101, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=0, default_level=101, exhaustive_level=9992)
   is_aligned = False
 
   # layouts for ABC and their alignments
@@ -5847,7 +5847,7 @@ def GenerateSM90_SparseTensorOp_fp8_WGMMA_gemm(manifest, cuda_version):
   if not CudaToolkitVersionSatisfies(cuda_version, 12, 2):
     return
 
-  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9999)
+  instantiation_level = manifest.get_sm90_instantiation_level(pruned_level=20, default_level=121, exhaustive_level=9992)
   is_aligned = True
 
   # layouts for ABC and their alignments
diff --git a/python/cutlass_library/library.py b/python/cutlass_library/library.py
index 3ccfb403..c00992f2 100644
--- a/python/cutlass_library/library.py
+++ b/python/cutlass_library/library.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py
index 3e82e640..78f6b887 100644
--- a/python/cutlass_library/manifest.py
+++ b/python/cutlass_library/manifest.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -563,17 +563,31 @@ class Manifest:
     self.operations_by_name = {}
     self.disable_full_archs_compilation = args.disable_full_archs_compilation
     self.is_kernel_filter_set_to_all = args.instantiation_level == "max" and args.kernels != ''
+    self.instantiation_level = 0
+    try:
+        self.instantiation_level = int(args.instantiation_level)
+    except ValueError:
+        self.instantiation_level = 0
 
-  def get_sm90_instantiation_level(self, pruned_level=0, default_level=111, exhaustive_level=9999):
+  def get_sm90_instantiation_level(self, pruned_level=0, default_level=111, exhaustive_level=9992):
     # Non-negative integer which determines how many kernels are instantiated.
     # 0 = 0000 generates the fewest kernels, 9999 generates all possible combinations.
     # increasing first digit reduces schedule / mixed type pruning,
     # increasing second digit generates more cluster sizes,
-    # increasing third digit generates more MMA shapes,
+    # increasing third digit generates more MMA multipliers,
     # increasing fourth digit generates more instruction shapes.
-    return exhaustive_level if self.is_kernel_filter_set_to_all else (
-      pruned_level if self.kernel_filter == '' else default_level
-    )
+
+    if self.instantiation_level > 0:
+        return self.instantiation_level
+
+    elif self.is_kernel_filter_set_to_all:
+        return exhaustive_level
+
+    elif self.kernel_filter == '':
+        return pruned_level
+
+    else:
+        return default_level
 
 
   def get_kernel_filters (self, kernelListFile):
diff --git a/python/cutlass_library/rank_2k_operation.py b/python/cutlass_library/rank_2k_operation.py
index 0cfb9222..29ef056f 100644
--- a/python/cutlass_library/rank_2k_operation.py
+++ b/python/cutlass_library/rank_2k_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/rank_k_operation.py b/python/cutlass_library/rank_k_operation.py
index a15b6815..98419523 100644
--- a/python/cutlass_library/rank_k_operation.py
+++ b/python/cutlass_library/rank_k_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/sm90_shapes.py b/python/cutlass_library/sm90_shapes.py
index 034e7524..e14761aa 100644
--- a/python/cutlass_library/sm90_shapes.py
+++ b/python/cutlass_library/sm90_shapes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/sm90_utils.py b/python/cutlass_library/sm90_utils.py
index 021406d7..53285400 100644
--- a/python/cutlass_library/sm90_utils.py
+++ b/python/cutlass_library/sm90_utils.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -261,21 +261,30 @@ def is_tile_desc_valid(tile_description):
         tile_description.math_instruction.element_accumulator
     )
 
-    cluster_shape, cta_shape, inst_shape = (
+    cluster_size, cta_shape = (
         tile_description.cluster_shape,
         tile_description.threadblock_shape,
-        tile_description.math_instruction.instruction_shape
     )
     grid_size = (
-        cta_shape[0] * cluster_shape[0] +
-        cta_shape[1] * cluster_shape[1] +
-        cta_shape[2] * cluster_shape[2]
+        cta_shape[0] * cluster_size[0] +
+        cta_shape[1] * cluster_size[1] +
+        cta_shape[2] * cluster_size[2]
     )
-    cluster_size = cluster_shape[0] * cluster_shape[1] * cluster_shape[2]
+    num_ctas_in_cluster = cluster_size[0] * cluster_size[1] * cluster_size[2]
+    cluster_shape = (
+        cluster_size[0] * cta_shape[0],
+        cluster_size[1] * cta_shape[1],
+        cluster_size[2] * cta_shape[2]
+    )
+
+    FP32_TYPES = [DataType.f32, DataType.tf32]
+    FP16_TYPES = [DataType.f16, DataType.bf16]
+    is_fp32 = element_a in FP32_TYPES and element_b in FP32_TYPES
+    is_fp16 = element_a in FP16_TYPES and element_b in FP16_TYPES
 
     # Maximum number of CTAs per cluster is 8 for Hopper, but up to 16 is
     # allowed for non portable clusters.
-    if cluster_size > 16 or cluster_size < 1:
+    if num_ctas_in_cluster > 16 or num_ctas_in_cluster < 1:
         return False
 
     if grid_size < 1:
@@ -299,8 +308,17 @@ def is_tile_desc_valid(tile_description):
     if cta_shape[2] < 16 or cta_shape[2] % 8 != 0:
         return False
 
-    # Minimum of 2 stages
-    if cta_shape[2] < inst_shape[2] or cta_shape[2] % inst_shape[2] != 0 or cta_shape[2] / inst_shape[2] < 2:
+    # Minimum of 2 stages (very rough heuristic that may filter out valid kernel configs)
+    if (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 256:
+        return False
+
+    if is_fp32 and (cluster_shape[0] >= 128 or cluster_shape[1] >= 128) and cluster_shape[2] >= 128:
+        return False
+
+    if is_fp32 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 64:
+        return False
+
+    if is_fp16 and cluster_shape[0] >= 256 and cluster_shape[1] >= 256 and cluster_shape[2] >= 128:
         return False
 
     # CTA shape upper bound: <256, 256, 256>
@@ -329,6 +347,7 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level:
     tile_descriptions = set()
     mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned)
     for math_inst, mma_mul, cluster_size in product(math_instructions, mma_multipliers, cluster_sizes):
+
         tile_desc = TileDescription(
             threadblock_shape=[
                 math_inst.instruction_shape[0] * mma_mul[0],
@@ -426,6 +445,25 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
     d_type = data_types["d_type"]
     is_void_c = c_type == DataType.void
 
+    # Filter out invalid kernels
+    is_nt = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.RowMajor
+    is_tn = layout[0][0] == LayoutType.RowMajor and layout[1][0] == LayoutType.ColumnMajor
+    is_nn = layout[0][0] == LayoutType.ColumnMajor and layout[1][0] == LayoutType.ColumnMajor
+
+    # static_assert(size<0>(SmemLayoutB{}) % WarpgroupTileSize == 0,
+    #   "Copy size must evenly divide SMEM tile.");
+    if is_fp32 and is_nt and (cta_n % cta_k != 0):
+        return [], []
+
+    # static_assert(!TransposeB || (cutlass::bits_to_bytes((size<1>(SmemLayoutB{}) * sizeof_bits<InternalElementB>::value))) == 128,
+    # "SmemLayoutB K must be 128bytes to be transposed.")
+    if is_fp32 and is_nt and cta_k != 32:
+        return [], []
+
+    # Static assert failure when instantiating SmemLayoutB
+    if is_fp32 and (is_tn or is_nn) and (cta_n % cta_k != 0):
+        return [], []
+
     # Early pruning
     if level < 1:
         # Don't stamp out FP16/BF16 kernels smaller than or equal to 64x128x64
@@ -441,7 +479,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
             if CudaToolkitVersionSatisfies(cuda_version, 12, 1) and can_do_cooperative and can_do_tma_epilogue:
                 return [
                     [
-                        KernelScheduleType.TmaWarpSpecializedCooperative if not is_sparse else KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
+                        KernelScheduleType.TmaWarpSpecializedCooperative,
                         EpilogueScheduleType.TmaWarpSpecializedCooperative
                     ],
                     [
@@ -489,8 +527,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
         # Pruning: don't stamp out fp8 kernels with auto schedule
         if not is_fp8:
             schedules.append([KernelScheduleType.ScheduleAuto, auto_epilogue])
-        if not (is_fp8 and is_sparse):
-            schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
+        schedules.append([KernelScheduleType.TmaWarpSpecialized, default_epilogue])
     stream_k_schedules = []
     
     if CudaToolkitVersionSatisfies(cuda_version, 12, 0):
@@ -518,16 +555,14 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
             schedules.append([KernelScheduleType.TmaWarpSpecializedPingpongFP8FastAccum, default_epilogue])
 
         if can_do_cooperative:
-            # Sparse kernels only support FastAccum FP8 mainloop
-            if not (is_fp8 and is_sparse):
-                schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedCooperative,
-                    default_epilogue
-                ])
-                stream_k_schedules.append([
-                    KernelScheduleType.TmaWarpSpecializedCooperative,
-                    default_epilogue
-                ])
+            schedules.append([
+                KernelScheduleType.TmaWarpSpecializedCooperative,
+                default_epilogue
+            ])
+            stream_k_schedules.append([
+                KernelScheduleType.TmaWarpSpecializedCooperative,
+                default_epilogue
+            ])
             if can_do_fp8_fast_accum:
                 schedules.append([
                     KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
@@ -542,16 +577,14 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types,
         if can_do_tma_epilogue:
             assert not requires_transposed_epilogue
             if can_do_cooperative:
-                # Sparse kernels only support FastAccum FP8 mainloop
-                if not (is_fp8 and is_sparse):
-                    schedules.append([
-                        KernelScheduleType.TmaWarpSpecializedCooperative,
-                        EpilogueScheduleType.TmaWarpSpecializedCooperative
-                    ])
-                    stream_k_schedules.append([
-                        KernelScheduleType.TmaWarpSpecializedCooperative,
-                        EpilogueScheduleType.TmaWarpSpecializedCooperative
-                    ])
+                schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    EpilogueScheduleType.TmaWarpSpecializedCooperative
+                ])
+                stream_k_schedules.append([
+                    KernelScheduleType.TmaWarpSpecializedCooperative,
+                    EpilogueScheduleType.TmaWarpSpecializedCooperative
+                ])
                 if can_do_fp8_fast_accum:
                     schedules.append([
                         KernelScheduleType.TmaWarpSpecializedCooperativeFP8FastAccum,
diff --git a/python/cutlass_library/symm_operation.py b/python/cutlass_library/symm_operation.py
index 93c94832..8661ff79 100644
--- a/python/cutlass_library/symm_operation.py
+++ b/python/cutlass_library/symm_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/cutlass_library/trmm_operation.py b/python/cutlass_library/trmm_operation.py
index 5fc991a3..46ba360c 100644
--- a/python/cutlass_library/trmm_operation.py
+++ b/python/cutlass_library/trmm_operation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/docs_src/source/conf.py b/python/docs_src/source/conf.py
index 762dd037..c396d75a 100644
--- a/python/docs_src/source/conf.py
+++ b/python/docs_src/source/conf.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/pycute/__init__.py b/python/pycute/__init__.py
index 2fbfb6c6..308a5676 100644
--- a/python/pycute/__init__.py
+++ b/python/pycute/__init__.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/pycute/int_tuple.py b/python/pycute/int_tuple.py
index da8e2a6c..36abf557 100644
--- a/python/pycute/int_tuple.py
+++ b/python/pycute/int_tuple.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/pycute/layout.py b/python/pycute/layout.py
index 881b8095..389f0037 100644
--- a/python/pycute/layout.py
+++ b/python/pycute/layout.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/pycute/swizzle.py b/python/pycute/swizzle.py
index 336df1c3..3f73d1bc 100644
--- a/python/pycute/swizzle.py
+++ b/python/pycute/swizzle.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/pycute/typing.py b/python/pycute/typing.py
index 5e39f72a..834f7e54 100644
--- a/python/pycute/typing.py
+++ b/python/pycute/typing.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/setup_cutlass.py b/python/setup_cutlass.py
index f43020d4..bdef2e32 100644
--- a/python/setup_cutlass.py
+++ b/python/setup_cutlass.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/python/setup_library.py b/python/setup_library.py
index bbe97c06..40f92836 100644
--- a/python/setup_library.py
+++ b/python/setup_library.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
     setup(
         name='cutlass_library',
-        version='3.6.0',
+        version='3.7.0',
         description='CUTLASS library generation scripts',
         packages=['cutlass_library']
     )
diff --git a/python/setup_pycute.py b/python/setup_pycute.py
index a57e37a4..2b9cd02e 100644
--- a/python/setup_pycute.py
+++ b/python/setup_pycute.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -36,7 +36,7 @@ from setuptools import setup
 def perform_setup():
     setup(
         name='pycute',
-        version='3.6.0',
+        version='3.7.0',
         description='Python implementation of CuTe',
         packages=['pycute'],
     )
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index eb802d80..39d30fad 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/conv2d/conv2d_problem_sizes.py b/test/python/cutlass/conv2d/conv2d_problem_sizes.py
index b80a2692..d16338d9 100644
--- a/test/python/cutlass/conv2d/conv2d_problem_sizes.py
+++ b/test/python/cutlass/conv2d/conv2d_problem_sizes.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/conv2d/conv2d_sm80.py b/test/python/cutlass/conv2d/conv2d_sm80.py
index 5503cb2a..fd59cbdd 100644
--- a/test/python/cutlass/conv2d/conv2d_sm80.py
+++ b/test/python/cutlass/conv2d/conv2d_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/conv2d/conv2d_test_utils.py b/test/python/cutlass/conv2d/conv2d_test_utils.py
index 090bee2f..454eb22b 100644
--- a/test/python/cutlass/conv2d/conv2d_test_utils.py
+++ b/test/python/cutlass/conv2d/conv2d_test_utils.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/conv2d/run_all_tests.py b/test/python/cutlass/conv2d/run_all_tests.py
index 3e554897..d892b5df 100644
--- a/test/python/cutlass/conv2d/run_all_tests.py
+++ b/test/python/cutlass/conv2d/run_all_tests.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/emit/pytorch.py b/test/python/cutlass/emit/pytorch.py
index 18388a76..5fe5b1a5 100644
--- a/test/python/cutlass/emit/pytorch.py
+++ b/test/python/cutlass/emit/pytorch.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/evt_compute_sm80_90.py b/test/python/cutlass/evt/evt_compute_sm80_90.py
index da6c1dec..dd58eb0a 100644
--- a/test/python/cutlass/evt/evt_compute_sm80_90.py
+++ b/test/python/cutlass/evt/evt_compute_sm80_90.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/evt_layout_sm80_90.py b/test/python/cutlass/evt/evt_layout_sm80_90.py
index d5cfa796..71e09973 100644
--- a/test/python/cutlass/evt/evt_layout_sm80_90.py
+++ b/test/python/cutlass/evt/evt_layout_sm80_90.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/evt_load_sm80_90.py b/test/python/cutlass/evt/evt_load_sm80_90.py
index 885e9ada..1b1a4fa6 100644
--- a/test/python/cutlass/evt/evt_load_sm80_90.py
+++ b/test/python/cutlass/evt/evt_load_sm80_90.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/evt_mixed_sm80_90.py b/test/python/cutlass/evt/evt_mixed_sm80_90.py
index 3efd13d9..448e4b70 100644
--- a/test/python/cutlass/evt/evt_mixed_sm80_90.py
+++ b/test/python/cutlass/evt/evt_mixed_sm80_90.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/evt_store_sm80_90.py b/test/python/cutlass/evt/evt_store_sm80_90.py
index 4719edbe..772e06b8 100644
--- a/test/python/cutlass/evt/evt_store_sm80_90.py
+++ b/test/python/cutlass/evt/evt_store_sm80_90.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/run_all_tests.py b/test/python/cutlass/evt/run_all_tests.py
index e36fe8bc..5bb84e2e 100644
--- a/test/python/cutlass/evt/run_all_tests.py
+++ b/test/python/cutlass/evt/run_all_tests.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/evt/utils/evt_testbed.py b/test/python/cutlass/evt/utils/evt_testbed.py
index 66c20df0..f5ee2a33 100644
--- a/test/python/cutlass/evt/utils/evt_testbed.py
+++ b/test/python/cutlass/evt/utils/evt_testbed.py
@@ -1,6 +1,6 @@
 ################################################################################
 #
-# Copyright (c) 20123 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_batched.py b/test/python/cutlass/gemm/gemm_batched.py
index 823a40d0..a4303970 100644
--- a/test/python/cutlass/gemm/gemm_batched.py
+++ b/test/python/cutlass/gemm/gemm_batched.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f16_sm80.py b/test/python/cutlass/gemm/gemm_f16_sm80.py
index 7460794c..4c8ed29e 100644
--- a/test/python/cutlass/gemm/gemm_f16_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f16_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f16_sm90.py b/test/python/cutlass/gemm/gemm_f16_sm90.py
index cfac8a6c..445a096f 100644
--- a/test/python/cutlass/gemm/gemm_f16_sm90.py
+++ b/test/python/cutlass/gemm/gemm_f16_sm90.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f32_sm80.py b/test/python/cutlass/gemm/gemm_f32_sm80.py
index 32bb2d81..c5b85170 100644
--- a/test/python/cutlass/gemm/gemm_f32_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f32_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f64_sm80.py b/test/python/cutlass/gemm/gemm_f64_sm80.py
index 8f8ce3e1..f238890e 100644
--- a/test/python/cutlass/gemm/gemm_f64_sm80.py
+++ b/test/python/cutlass/gemm/gemm_f64_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f64_sm90.py b/test/python/cutlass/gemm/gemm_f64_sm90.py
index 784477e0..d0d0238d 100644
--- a/test/python/cutlass/gemm/gemm_f64_sm90.py
+++ b/test/python/cutlass/gemm/gemm_f64_sm90.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_f8_sm90.py b/test/python/cutlass/gemm/gemm_f8_sm90.py
index 1e5f9927..5735e36c 100644
--- a/test/python/cutlass/gemm/gemm_f8_sm90.py
+++ b/test/python/cutlass/gemm/gemm_f8_sm90.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_mixed_sm80.py b/test/python/cutlass/gemm/gemm_mixed_sm80.py
index 80d1919e..857acd83 100644
--- a/test/python/cutlass/gemm/gemm_mixed_sm80.py
+++ b/test/python/cutlass/gemm/gemm_mixed_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_s8_sm80.py b/test/python/cutlass/gemm/gemm_s8_sm80.py
index d38bc098..38dd307d 100644
--- a/test/python/cutlass/gemm/gemm_s8_sm80.py
+++ b/test/python/cutlass/gemm/gemm_s8_sm80.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_s8_sm90.py b/test/python/cutlass/gemm/gemm_s8_sm90.py
index 1035482c..2cfeadb8 100644
--- a/test/python/cutlass/gemm/gemm_s8_sm90.py
+++ b/test/python/cutlass/gemm/gemm_s8_sm90.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/gemm_testbed.py b/test/python/cutlass/gemm/gemm_testbed.py
index 731c9680..d4631650 100644
--- a/test/python/cutlass/gemm/gemm_testbed.py
+++ b/test/python/cutlass/gemm/gemm_testbed.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/run_all_tests.py b/test/python/cutlass/gemm/run_all_tests.py
index 0ad3922c..bc5e7467 100644
--- a/test/python/cutlass/gemm/run_all_tests.py
+++ b/test/python/cutlass/gemm/run_all_tests.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/gemm/utils.py b/test/python/cutlass/gemm/utils.py
index 2d8c30f1..6ec92fec 100644
--- a/test/python/cutlass/gemm/utils.py
+++ b/test/python/cutlass/gemm/utils.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/installation.py b/test/python/cutlass/installation.py
index 7cc70075..6f05e5ac 100644
--- a/test/python/cutlass/installation.py
+++ b/test/python/cutlass/installation.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/interface/conv2d_interface.py b/test/python/cutlass/interface/conv2d_interface.py
index 425f2a9d..d0f13e04 100644
--- a/test/python/cutlass/interface/conv2d_interface.py
+++ b/test/python/cutlass/interface/conv2d_interface.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/interface/evt_interface.py b/test/python/cutlass/interface/evt_interface.py
index 8d94889c..f2668d70 100644
--- a/test/python/cutlass/interface/evt_interface.py
+++ b/test/python/cutlass/interface/evt_interface.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/interface/gemm_interface.py b/test/python/cutlass/interface/gemm_interface.py
index e494fe03..85ef228d 100644
--- a/test/python/cutlass/interface/gemm_interface.py
+++ b/test/python/cutlass/interface/gemm_interface.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/cutlass/interface/utils.py b/test/python/cutlass/interface/utils.py
index 706981a1..9f93ca26 100644
--- a/test/python/cutlass/interface/utils.py
+++ b/test/python/cutlass/interface/utils.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/run_all_tests.py b/test/python/pycute/run_all_tests.py
index 2b8ee287..b7cdc421 100644
--- a/test/python/pycute/run_all_tests.py
+++ b/test/python/pycute/run_all_tests.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_coalesce.py b/test/python/pycute/test_coalesce.py
index dd897cff..d4330377 100644
--- a/test/python/pycute/test_coalesce.py
+++ b/test/python/pycute/test_coalesce.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_complement.py b/test/python/pycute/test_complement.py
index 1cb2425d..5a8684a5 100644
--- a/test/python/pycute/test_complement.py
+++ b/test/python/pycute/test_complement.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_composition.py b/test/python/pycute/test_composition.py
index 8b569cd5..1918972d 100644
--- a/test/python/pycute/test_composition.py
+++ b/test/python/pycute/test_composition.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_int_tuple.py b/test/python/pycute/test_int_tuple.py
index 8e56c37a..0dbf443c 100644
--- a/test/python/pycute/test_int_tuple.py
+++ b/test/python/pycute/test_int_tuple.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_left_inverse.py b/test/python/pycute/test_left_inverse.py
index 23d088a1..a6501fd6 100644
--- a/test/python/pycute/test_left_inverse.py
+++ b/test/python/pycute/test_left_inverse.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_right_inverse.py b/test/python/pycute/test_right_inverse.py
index 6fdf08f7..2ed9759d 100644
--- a/test/python/pycute/test_right_inverse.py
+++ b/test/python/pycute/test_right_inverse.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/python/pycute/test_typing.py b/test/python/pycute/test_typing.py
index 447873ac..9eb99a48 100644
--- a/test/python/pycute/test_typing.py
+++ b/test/python/pycute/test_typing.py
@@ -1,6 +1,6 @@
 #################################################################################################
 #
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/self_contained_includes/CMakeLists.txt b/test/self_contained_includes/CMakeLists.txt
index a576868b..6e8eeb7f 100644
--- a/test/self_contained_includes/CMakeLists.txt
+++ b/test/self_contained_includes/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index b02ec65a..0abda31d 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cluster_launch/CMakeLists.txt b/test/unit/cluster_launch/CMakeLists.txt
index 65cf7d84..0af3f30d 100644
--- a/test/unit/cluster_launch/CMakeLists.txt
+++ b/test/unit/cluster_launch/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cluster_launch/cluster_launch.cu b/test/unit/cluster_launch/cluster_launch.cu
index 43141923..96dd2701 100644
--- a/test/unit/cluster_launch/cluster_launch.cu
+++ b/test/unit/cluster_launch/cluster_launch.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h
index c1521e65..86b78237 100644
--- a/test/unit/common/cutlass_unit_test.h
+++ b/test/unit/common/cutlass_unit_test.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp
index 32acad1e..875924ed 100644
--- a/test/unit/common/filter_architecture.cpp
+++ b/test/unit/common/filter_architecture.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/CMakeLists.txt b/test/unit/conv/CMakeLists.txt
index 4762a0c0..890f9516 100644
--- a/test/unit/conv/CMakeLists.txt
+++ b/test/unit/conv/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/cache_testbed_output.h b/test/unit/conv/cache_testbed_output.h
index 57b0a253..3035e986 100644
--- a/test/unit/conv/cache_testbed_output.h
+++ b/test/unit/conv/cache_testbed_output.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/CMakeLists.txt b/test/unit/conv/device/CMakeLists.txt
index d3a6782f..8ea7dde8 100644
--- a/test/unit/conv/device/CMakeLists.txt
+++ b/test/unit/conv/device/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index dd6fbfcf..d2dc4e43 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index 88f89c79..d221c5c1 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index 5559fe6b..3bcbaea5 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index 923b0ea1..024c27f2 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index 66ff6fee..8c602955 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 847d68de..1ff096a4 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 61644552..db8d1756 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index e96d3ea9..843c4b9f 100644
--- a/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
index d71832ea..14550e93 100644
--- a/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
index 36259550..af57ddd2 100644
--- a/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index 95e18422..79603811 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index e7022b09..c37ec6e9 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
index 1c780f50..52525dda 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index 64db7ebb..a8d4ce85 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
index c977d144..b47101de 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index e4e6f66e..008aa73b 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index b3eb74e1..2afa3489 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index c75ebbee..684976c8 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
index c41e561b..1424ea80 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 842fd88b..4e64c6dc 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32_sm89.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32_sm89.cu
index 34435514..fe82e9ec 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32_sm89.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_f8nhwc_f8nhwc_f8nhwc_tensor_op_f32_sm89.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
index ae16437e..f7ca7947 100755
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
index e443531c..6fcee6d1 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
index 69070d9b..19e1d2e8 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4ncxhwx_s4cxrskx_s4ncxhwx_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
index a9f14ada..4317515f 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
index cc6502a7..08ef0b55 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s4nhwc_s4nhwc_s32nhwc_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
index 0af60771..661d8531 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
index 91de9236..eb9f751d 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8ncxhwx_s8cxrskx_s8ncxhwx_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
index a2967cc1..c048c569 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
index eeb146ed..b6d1eddf 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_s8nhwc_s8nhwc_s32nhwc_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 8ce7b4be..f6501e42 100644
--- a/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_with_broadcast_simt_sm80.cu b/test/unit/conv/device/conv2d_fprop_with_broadcast_simt_sm80.cu
index c3015bfd..b869c526 100644
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_simt_sm80.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_simt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
index 6e235c79..2a13e6a4 100644
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
index f2d3e584..cd6c193c 100644
--- a/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_broadcast_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu b/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
index a1d658ea..7e85ba53 100644
--- a/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
+++ b/test/unit/conv/device/conv2d_fprop_with_reduction_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_problems.h b/test/unit/conv/device/conv2d_problems.h
index 07ca3ed9..a14134b2 100644
--- a/test/unit/conv/device/conv2d_problems.h
+++ b/test/unit/conv/device/conv2d_problems.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index b1fe52d3..24c74f8f 100644
--- a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_swizzling4_sm80.cu b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_swizzling4_sm80.cu
index ec5548c6..0e89d03e 100644
--- a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_swizzling4_sm80.cu
+++ b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_swizzling4_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index 8bc1d58b..8bcd4137 100644
--- a/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_testbed.h b/test/unit/conv/device/conv2d_testbed.h
index 2acf1cf6..34588ecb 100644
--- a/test/unit/conv/device/conv2d_testbed.h
+++ b/test/unit/conv/device/conv2d_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_testbed_interleaved.h b/test/unit/conv/device/conv2d_testbed_interleaved.h
index cc1e8ab7..cf075674 100644
--- a/test/unit/conv/device/conv2d_testbed_interleaved.h
+++ b/test/unit/conv/device/conv2d_testbed_interleaved.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
index 49bb3bcf..6431dc7c 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
index fac3708a..7d3e9d35 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_cf32nhwc_cf32nhwc_cf32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
index f6c0104c..1b4aa1b7 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
index f8d02e59..d3858b81 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
index 65bdd867..a87cdf0e 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
index b7fc11a0..4be8c701 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 01c0f151..38728635 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
index cf093472..ad4e9b59 100644
--- a/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_with_absmax_testbed.h b/test/unit/conv/device/conv2d_with_absmax_testbed.h
index ba54d208..ad7b2ce6 100644
--- a/test/unit/conv/device/conv2d_with_absmax_testbed.h
+++ b/test/unit/conv/device/conv2d_with_absmax_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_with_broadcast_testbed.h b/test/unit/conv/device/conv2d_with_broadcast_testbed.h
index 278d447f..f768f5b2 100644
--- a/test/unit/conv/device/conv2d_with_broadcast_testbed.h
+++ b/test/unit/conv/device/conv2d_with_broadcast_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv2d_with_reduction_testbed.h b/test/unit/conv/device/conv2d_with_reduction_testbed.h
index 255a6aab..a8ec16ca 100644
--- a/test/unit/conv/device/conv2d_with_reduction_testbed.h
+++ b/test/unit/conv/device/conv2d_with_reduction_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index eefc8c9f..a02191cd 100644
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
index e2716ad0..67106366 100644
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index fafe81f3..2cc9cd7d 100644
--- a/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_dgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
index 6da86b72..c7a5bb4f 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 27ae274c..87e4ac04 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
index d6134fa2..d211cacd 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 90af0d76..4dbfb67f 100644
--- a/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_fprop_with_broadcast_simt_sm80.cu b/test/unit/conv/device/conv3d_fprop_with_broadcast_simt_sm80.cu
index a3461f8e..b14332b7 100644
--- a/test/unit/conv/device/conv3d_fprop_with_broadcast_simt_sm80.cu
+++ b/test/unit/conv/device/conv3d_fprop_with_broadcast_simt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_problems.h b/test/unit/conv/device/conv3d_problems.h
index 4082cce8..fae7d619 100644
--- a/test/unit/conv/device/conv3d_problems.h
+++ b/test/unit/conv/device/conv3d_problems.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_testbed.h b/test/unit/conv/device/conv3d_testbed.h
index 54c8143f..029f5eff 100644
--- a/test/unit/conv/device/conv3d_testbed.h
+++ b/test/unit/conv/device/conv3d_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
index b644c3f3..6e8b64d0 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index 601e4c6f..8183ae84 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
index 14269a32..fe0d2030 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
index fdcdf482..47fa1d49 100644
--- a/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/conv3d_wgrad_implicit_gemm_tf32ndhwc_tf32ndhwc_f32ndhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/conv3d_with_broadcast_testbed.h b/test/unit/conv/device/conv3d_with_broadcast_testbed.h
index 437dbd30..f8ba785c 100644
--- a/test/unit/conv/device/conv3d_with_broadcast_testbed.h
+++ b/test/unit/conv/device/conv3d_with_broadcast_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/deconv2d_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu b/test/unit/conv/device/deconv2d_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
index 73a78d33..fdfa56a5 100644
--- a/test/unit/conv/device/deconv2d_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/deconv2d_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/deconv2d_with_broadcast_simt_sm80.cu b/test/unit/conv/device/deconv2d_with_broadcast_simt_sm80.cu
index bfb85d51..5e40ef0e 100644
--- a/test/unit/conv/device/deconv2d_with_broadcast_simt_sm80.cu
+++ b/test/unit/conv/device/deconv2d_with_broadcast_simt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/deconv3d_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu b/test/unit/conv/device/deconv3d_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
index 929a5151..a740b0a0 100644
--- a/test/unit/conv/device/deconv3d_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
+++ b/test/unit/conv/device/deconv3d_implicit_gemm_f32ndhwc_f32ndhwc_f32ndhwc_simt_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/deconv3d_with_broadcast_simt_sm80.cu b/test/unit/conv/device/deconv3d_with_broadcast_simt_sm80.cu
index 09817d71..7f022818 100644
--- a/test/unit/conv/device/deconv3d_with_broadcast_simt_sm80.cu
+++ b/test/unit/conv/device/deconv3d_with_broadcast_simt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h b/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h
index cff483d9..cef5f981 100644
--- a/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h
+++ b/test/unit/conv/device/depthwise_conv2d_direct_conv_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu b/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
index f23292ec..2dc92c1b 100644
--- a/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu b/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
index 6fdd5de7..8e44346f 100644
--- a/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu b/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
index a4d8ea8b..199437f9 100644
--- a/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
+++ b/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu b/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
index 2072fd22..d5eb264d 100644
--- a/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
+++ b/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/CMakeLists.txt b/test/unit/conv/device_3x/CMakeLists.txt
index dddeba6f..f7152779 100644
--- a/test/unit/conv/device_3x/CMakeLists.txt
+++ b/test/unit/conv/device_3x/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/conv_problem_sizes.hpp b/test/unit/conv/device_3x/conv_problem_sizes.hpp
index d66de64a..54c11281 100644
--- a/test/unit/conv/device_3x/conv_problem_sizes.hpp
+++ b/test/unit/conv/device_3x/conv_problem_sizes.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -821,6 +821,78 @@ get_conv_problem_vector<3, cutlass::conv::Operator::kWgrad>() {
   return problem_shapes;
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Grouped Wgrad
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Declaration only: returns the grouped conv problem shapes to test for the given GroupsPerTile; bodies come from explicit specializations.
+template<int SpatialDim, cutlass::conv::Operator ConvOp>
+std::vector<cutlass::conv::ConvProblemShape<ConvOp, SpatialDim>>
+inline
+get_grouped_conv_problem_vector(int GroupsPerTile);
+
+// Specialization for 3D wgrad: GroupsPerTile of 1, 2, 4 or 8 yields one 32-group problem each; any other value returns an empty vector.
+template<>
+std::vector<cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>> inline
+get_grouped_conv_problem_vector<3, cutlass::conv::Operator::kWgrad>(int GroupsPerTile) {
+  using ProblemShape = cutlass::conv::ConvProblemShape<cutlass::conv::Operator::kWgrad, 3>;
+  std::vector<ProblemShape> problem_shapes;
+
+  if (GroupsPerTile == 1) {
+    // channel_per_group == 64 (C = 2048 split across 32 groups)
+    problem_shapes.push_back({
+      cutlass::conv::Mode::kCrossCorrelation,
+      {1, 1, 16, 16, 2048}, // ndhwc
+      {2048, 1, 3, 3, 64},  // ktrsc
+      {0, 1, 1},            // padding lower (pad_d, pad_h, pad_w)
+      {0, 1, 1},            // padding upper (pad_d, pad_h, pad_w)
+      {1, 1, 1},            // stride (stride_d, stride_h, stride_w)
+      {1, 1, 1},            // dilation (dilation_d, dilation_h, dilation_w)
+      32                    // groups
+    });
+  }
+  else if (GroupsPerTile == 2) {
+    // channel_per_group == 32 (C = 1024 split across 32 groups)
+    problem_shapes.push_back({
+      cutlass::conv::Mode::kCrossCorrelation,
+      {1, 1, 16, 16, 1024}, // ndhwc
+      {1024, 1, 3, 3, 32},  // ktrsc
+      {0, 1, 1},            // padding lower (pad_d, pad_h, pad_w)
+      {0, 1, 1},            // padding upper (pad_d, pad_h, pad_w)
+      {1, 1, 1},            // stride (stride_d, stride_h, stride_w)
+      {1, 1, 1},            // dilation (dilation_d, dilation_h, dilation_w)
+      32                    // groups
+    });
+  }
+  else if (GroupsPerTile == 4) {
+    // channel_per_group == 16 (C = 512 split across 32 groups)
+    problem_shapes.push_back({
+      cutlass::conv::Mode::kCrossCorrelation,
+      {1, 1, 16, 16, 512}, // ndhwc
+      {512, 1, 3, 3, 16},  // ktrsc
+      {0, 1, 1},           // padding lower (pad_d, pad_h, pad_w)
+      {0, 1, 1},           // padding upper (pad_d, pad_h, pad_w)
+      {1, 1, 1},           // stride (stride_d, stride_h, stride_w)
+      {1, 1, 1},           // dilation (dilation_d, dilation_h, dilation_w)
+      32                   // groups
+    });
+  }
+  else if (GroupsPerTile == 8) {
+    // channel_per_group == 8 (C = 256 split across 32 groups)
+    problem_shapes.push_back({
+      cutlass::conv::Mode::kCrossCorrelation,
+      {1, 1, 16, 16, 256},  // ndhwc
+      {256, 1, 3, 3, 8},    // ktrsc
+      {0, 1, 1},            // padding lower (pad_d, pad_h, pad_w)
+      {0, 1, 1},            // padding upper (pad_d, pad_h, pad_w)
+      {1, 1, 1},            // stride (stride_d, stride_h, stride_w)
+      {1, 1, 1},            // dilation (dilation_d, dilation_h, dilation_w)
+      32                    // groups
+    });
+  }
+  return problem_shapes;
+}
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Unit Stride Dgrad
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/conv/device_3x/dgrad/CMakeLists.txt b/test/unit/conv/device_3x/dgrad/CMakeLists.txt
index a37bdbdb..ede0ad9a 100644
--- a/test/unit/conv/device_3x/dgrad/CMakeLists.txt
+++ b/test/unit/conv/device_3x/dgrad/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 5b8aaf6d..25bb4d75 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 61add31e..95ee882f 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv1d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 1928f5d7..756e038b 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 95365a20..c232a58a 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 17d41b5a..9bf671d5 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index b7423649..a6f97cd5 100644
--- a/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/dgrad/sm90_conv3d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/CMakeLists.txt b/test/unit/conv/device_3x/fprop/CMakeLists.txt
index 480d0e45..90b2cef2 100644
--- a/test/unit/conv/device_3x/fprop/CMakeLists.txt
+++ b/test/unit/conv/device_3x/fprop/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2013 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2013 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index b501bdfb..e1a85b81 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 4e2ef6bf..12990858 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
index 98f7f001..8c22b15f 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
index 4b35683a..22b21980 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv1d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 62378de7..9bf27888 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 7058d460..4daa9f8a 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
index dbddc698..ffb1ecc2 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
index 7fab79d3..f2cdf383 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv2d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index c2a30d85..4186498b 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index af2a4a9c..66efc1fe 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
index 417ed2e1..a3f6d6f5 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
index 60a81a67..c0184b8a 100644
--- a/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_tf32_tf32_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/testbed_conv.hpp b/test/unit/conv/device_3x/testbed_conv.hpp
index b392165c..ddf8ea40 100644
--- a/test/unit/conv/device_3x/testbed_conv.hpp
+++ b/test/unit/conv/device_3x/testbed_conv.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -113,14 +113,14 @@ struct DenseConvParams {
   // Default Kernel data types
   using ElementA = typename Conv::ConvKernel::ElementA;
   using ElementB = typename Conv::ConvKernel::ElementB;
-  
+
   static constexpr cutlass::conv::Operator ConvOp = Conv::DispatchPolicy::ConvOp;
   static constexpr int NumSpatialDimensions = Conv::NumSpatialDimensions;
   using ProblemShape = cutlass::conv::ConvProblemShape<ConvOp, NumSpatialDimensions>;
 
   // get the default arguments without sparse data
   auto get_mainloop_arguments(
-    [[maybe_unused]] ProblemShape const& problem_shape,  
+    [[maybe_unused]] ProblemShape const& problem_shape,
     thrust::universal_vector<ElementA>& tensor_A,
     thrust::universal_vector<ElementB>& tensor_B
   ) {
@@ -242,6 +242,74 @@ struct ConvTestbed {
     return max_smem_size >= Conv::ConvKernel::SharedStorageSize;
   }
 
+  auto transform_shape_and_stride_with_groups(ProblemShape const& problem_shape) {
+    using TensorExtent = cute::array<int32_t, NumSpatialDimensions + 3>;
+    using TensorStride = cute::array<int64_t, NumSpatialDimensions + 3>;
+    // Builds grouped extents/strides for A, B and C: one mode is divided by G and a trailing group mode g is appended.
+    TensorExtent shape_a_g{};
+    TensorExtent shape_b_g{};
+    TensorExtent shape_c_g{};
+    TensorStride stride_a_g{};
+    TensorStride stride_b_g{};
+    TensorStride stride_c_g{};
+    // Reverse the mode order of every extent/stride tuple before splitting (see the per-branch layouts below).
+    auto shape_a = cute::reverse(problem_shape.shape_A);
+    auto shape_b = cute::reverse(problem_shape.shape_B);
+    auto shape_c = cute::reverse(problem_shape.shape_C);
+    auto stride_a = cute::reverse(problem_shape.stride_A);
+    auto stride_b = cute::reverse(problem_shape.stride_B);
+    auto stride_c = cute::reverse(problem_shape.stride_C);
+
+    int32_t G = problem_shape.groups;
+
+    if constexpr (ConvOp == cutlass::conv::Operator::kFprop ||
+                  ConvOp == cutlass::conv::Operator::kDgrad) {
+      // shape_a_g = (c/G,w,h,d,n,g) or (k/G,q,p,z,n,g)
+      // shape_b_g = (c,s,r,t,k/G,g): the last mode (k) is the one split by G
+      // shape_c_g = (k/G,q,p,z,n,g) or (c/G,w,h,d,n,g)
+      shape_a_g = cute::to_array<int32_t>(tuple_cat(
+        cute::make_shape(cute::size<0>(shape_a) / G),
+        cute::take<1,NumSpatialDimensions + 2>(shape_a),
+        cute::make_shape(G)));
+      shape_b_g = cute::to_array<int32_t>(tuple_cat(
+        cute::take<0,NumSpatialDimensions + 1>(shape_b),
+        cute::make_shape(cute::size<NumSpatialDimensions + 1>(shape_b) / G, G)));
+      shape_c_g = cute::to_array<int32_t>(tuple_cat(
+        cute::make_shape(cute::size<0>(shape_c) / G),
+        cute::take<1,NumSpatialDimensions + 2>(shape_c),
+        cute::make_shape(G)));
+      // Group-mode stride: (extent of mode 0) / G for A and C; (stride * extent) / G of the last mode for B.
+      stride_a_g = cute::to_array<int64_t>(append(stride_a, cute::size<0>(shape_a) / G));
+      stride_b_g = cute::to_array<int64_t>(append(stride_b,
+        cute::size<NumSpatialDimensions + 1>(stride_b) * cute::size<NumSpatialDimensions + 1>(shape_b) / G));
+      stride_c_g = cute::to_array<int64_t>(append(stride_c, cute::size<0>(shape_c) / G));
+    }
+    else if constexpr (ConvOp == cutlass::conv::Operator::kWgrad) {
+      // shape_a_g = (k/G,q,p,z,n,g)
+      // shape_b_g = (c/G,w,h,d,n,g)
+      // shape_c_g = (c,s,r,t,k/G,g): the last mode (k) is the one split by G
+      shape_a_g = cute::to_array<int32_t>(tuple_cat(
+        cute::make_shape(cute::size<0>(shape_a) / G),
+        cute::take<1,NumSpatialDimensions + 2>(shape_a),
+        cute::make_shape(G)));
+      shape_b_g = cute::to_array<int32_t>(tuple_cat(
+        cute::make_shape(cute::size<0>(shape_b) / G),
+        cute::take<1,NumSpatialDimensions + 2>(shape_b),
+        cute::make_shape(G)));
+      shape_c_g = cute::to_array<int32_t>(tuple_cat(
+        cute::take<0,NumSpatialDimensions + 1>(shape_c),
+        cute::make_shape(cute::size<NumSpatialDimensions + 1>(shape_c) / G, G)));
+      // Group-mode stride: (extent of mode 0) / G for A and B; (stride * extent) / G of the last mode for C.
+      stride_a_g = cute::to_array<int64_t>(append(stride_a, cute::size<0>(shape_a) / G));
+      stride_b_g = cute::to_array<int64_t>(append(stride_b, cute::size<0>(shape_b) / G));
+      stride_c_g = cute::to_array<int64_t>(append(stride_c,
+        cute::size<NumSpatialDimensions + 1>(stride_c) * cute::size<NumSpatialDimensions + 1>(shape_c) / G));
+    }
+    // If ConvOp matches neither branch above, the arrays are returned as initialized with `{}`.
+    return make_tuple(shape_a_g, shape_b_g, shape_c_g,
+                      stride_a_g, stride_b_g, stride_c_g);
+  }
+
   // Executes one test
   bool run(
     ProblemShape const& problem_shape,
@@ -263,7 +331,7 @@ struct ConvTestbed {
     }
 
     bool ret = initialize(problem_shape);
-    
+
     if (!ret) {
       std::cerr << "initialize failed for the given problem_shape: \n";
       return false;
@@ -382,15 +450,10 @@ struct ConvTestbed {
                                    << cudaGetErrorString(result);
 
     // Create cute::Tensors using the logical rank-3 MNK multi-mode shapes the mainloop gives us
-    auto shape_mA = cute::reverse(problem_shape.shape_A);
-    auto shape_mB = cute::reverse(problem_shape.shape_B);
-    auto shape_mC = cute::reverse(problem_shape.shape_C);
+    auto [shape_mA, shape_mB, shape_mC, stride_mA, stride_mB, stride_mC] =
+      transform_shape_and_stride_with_groups(problem_shape);
     auto shape_mBias = cute::make_shape(cute::size(cute::get<0>(problem_shape.get_shape_B())));
 
-    auto stride_mA = cute::reverse(problem_shape.stride_A);
-    auto stride_mB = cute::reverse(problem_shape.stride_B);
-    auto stride_mC = cute::reverse(problem_shape.stride_C);
-
     auto mA = make_tensor(tensor_A.data().get(), make_layout(shape_mA, stride_mA));
     auto mB = make_tensor(tensor_B.data().get(), make_layout(shape_mB, stride_mB));
     auto mC = make_tensor(tensor_C.data().get(), make_layout(shape_mC, stride_mC));
diff --git a/test/unit/conv/device_3x/wgrad/CMakeLists.txt b/test/unit/conv/device_3x/wgrad/CMakeLists.txt
index 82521d09..7d7d310c 100644
--- a/test/unit/conv/device_3x/wgrad/CMakeLists.txt
+++ b/test/unit/conv/device_3x/wgrad/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2013 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2013 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 1ef19fc8..955853cc 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 9385fc2f..14540228 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 8c8177fd..dc21758e 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index a1a28ec5..46bca7fb 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv2d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu b/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
index 34c9ccd0..353e7613 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu b/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
index 55489335..6389f0d5 100644
--- a/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
+++ b/test/unit/conv/device_3x/wgrad/sm90_conv3d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt
index 9c68d4af..b52263e9 100644
--- a/test/unit/core/CMakeLists.txt
+++ b/test/unit/core/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu
index 9fc04564..05605942 100644
--- a/test/unit/core/array.cu
+++ b/test/unit/core/array.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu
index b586068f..7af972dc 100644
--- a/test/unit/core/bfloat16.cu
+++ b/test/unit/core/bfloat16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu
index 880af586..c065c494 100644
--- a/test/unit/core/complex.cu
+++ b/test/unit/core/complex.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/fast_numeric_conversion.cu b/test/unit/core/fast_numeric_conversion.cu
index 9f7a886f..7cf29e19 100644
--- a/test/unit/core/fast_numeric_conversion.cu
+++ b/test/unit/core/fast_numeric_conversion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/float8.cu b/test/unit/core/float8.cu
index 14d9d22b..2d3973d5 100644
--- a/test/unit/core/float8.cu
+++ b/test/unit/core/float8.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu
index 4d765617..11b9c7c2 100644
--- a/test/unit/core/functional.cu
+++ b/test/unit/core/functional.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu
index 9fc24ed0..d52bc674 100644
--- a/test/unit/core/half.cu
+++ b/test/unit/core/half.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/matrix.cu b/test/unit/core/matrix.cu
index 5d1d0787..41a3092b 100644
--- a/test/unit/core/matrix.cu
+++ b/test/unit/core/matrix.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu
index 4b76b8c8..e54fd008 100644
--- a/test/unit/core/matrix_coord.cu
+++ b/test/unit/core/matrix_coord.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu
index a4b9e723..9bd727f6 100644
--- a/test/unit/core/numeric_conversion.cu
+++ b/test/unit/core/numeric_conversion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/numeric_conversion_subbyte.cu b/test/unit/core/numeric_conversion_subbyte.cu
index a670afce..7b70b88e 100644
--- a/test/unit/core/numeric_conversion_subbyte.cu
+++ b/test/unit/core/numeric_conversion_subbyte.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu
index ab4ef200..5acba851 100644
--- a/test/unit/core/predicate_vector.cu
+++ b/test/unit/core/predicate_vector.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/quaternion.cu b/test/unit/core/quaternion.cu
index 9d8a596d..adbfc46f 100644
--- a/test/unit/core/quaternion.cu
+++ b/test/unit/core/quaternion.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu
index 2182dc10..9af8b3ed 100644
--- a/test/unit/core/tensor_ref.cu
+++ b/test/unit/core/tensor_ref.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu
index 1fee5e3e..02f6062c 100644
--- a/test/unit/core/tensor_view.cu
+++ b/test/unit/core/tensor_view.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp
index f25d8fa9..b145d36a 100644
--- a/test/unit/core/test_unit_core.cpp
+++ b/test/unit/core/test_unit_core.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu
index 3e9a8a2d..b0af3671 100644
--- a/test/unit/core/tfloat32.cu
+++ b/test/unit/core/tfloat32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/core/uint128.cu b/test/unit/core/uint128.cu
index 3e9f0426..a6872328 100644
--- a/test/unit/core/uint128.cu
+++ b/test/unit/core/uint128.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/CMakeLists.txt b/test/unit/cute/CMakeLists.txt
index 601c0c0d..ce40e74d 100644
--- a/test/unit/cute/CMakeLists.txt
+++ b/test/unit/cute/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/CMakeLists.txt b/test/unit/cute/ampere/CMakeLists.txt
index c1a654e8..cc578ecf 100644
--- a/test/unit/cute/ampere/CMakeLists.txt
+++ b/test/unit/cute/ampere/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/cooperative_copy.cu b/test/unit/cute/ampere/cooperative_copy.cu
index fef61aa2..b66d93eb 100644
--- a/test/unit/cute/ampere/cooperative_copy.cu
+++ b/test/unit/cute/ampere/cooperative_copy.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/cooperative_gemm.cu b/test/unit/cute/ampere/cooperative_gemm.cu
index 5bb6ecd2..b192ec72 100644
--- a/test/unit/cute/ampere/cooperative_gemm.cu
+++ b/test/unit/cute/ampere/cooperative_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/cp_sync.cu b/test/unit/cute/ampere/cp_sync.cu
index f5045410..165d9d16 100644
--- a/test/unit/cute/ampere/cp_sync.cu
+++ b/test/unit/cute/ampere/cp_sync.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/ldsm.cu b/test/unit/cute/ampere/ldsm.cu
index 07bce57a..c5afcb3c 100644
--- a/test/unit/cute/ampere/ldsm.cu
+++ b/test/unit/cute/ampere/ldsm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/tiled_cp_async.cu b/test/unit/cute/ampere/tiled_cp_async.cu
index 8f8e74d5..7b31b5cd 100644
--- a/test/unit/cute/ampere/tiled_cp_async.cu
+++ b/test/unit/cute/ampere/tiled_cp_async.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/ampere/tiled_cp_async_testbed.hpp b/test/unit/cute/ampere/tiled_cp_async_testbed.hpp
index 8edd5b9f..ff170be1 100644
--- a/test/unit/cute/ampere/tiled_cp_async_testbed.hpp
+++ b/test/unit/cute/ampere/tiled_cp_async_testbed.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/cooperative_gemm_common.hpp b/test/unit/cute/cooperative_gemm_common.hpp
index dbb85e6b..e524dc28 100644
--- a/test/unit/cute/cooperative_gemm_common.hpp
+++ b/test/unit/cute/cooperative_gemm_common.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/CMakeLists.txt b/test/unit/cute/core/CMakeLists.txt
index 77037ac4..d74ed3a7 100644
--- a/test/unit/cute/core/CMakeLists.txt
+++ b/test/unit/cute/core/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/array_subbyte.cpp b/test/unit/cute/core/array_subbyte.cpp
index f3b94a8f..ccf0f662 100644
--- a/test/unit/cute/core/array_subbyte.cpp
+++ b/test/unit/cute/core/array_subbyte.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/bitfield.cpp b/test/unit/cute/core/bitfield.cpp
index 04213869..897e8486 100644
--- a/test/unit/cute/core/bitfield.cpp
+++ b/test/unit/cute/core/bitfield.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/coalesce.cpp b/test/unit/cute/core/coalesce.cpp
index d45945c7..3f980f54 100644
--- a/test/unit/cute/core/coalesce.cpp
+++ b/test/unit/cute/core/coalesce.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/compact_xmajor.cpp b/test/unit/cute/core/compact_xmajor.cpp
index 9e2fbc60..a7e3c5f5 100644
--- a/test/unit/cute/core/compact_xmajor.cpp
+++ b/test/unit/cute/core/compact_xmajor.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/compare.cpp b/test/unit/cute/core/compare.cpp
index 1659a3ec..fdbc5c29 100644
--- a/test/unit/cute/core/compare.cpp
+++ b/test/unit/cute/core/compare.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/complement.cpp b/test/unit/cute/core/complement.cpp
index cba486f6..94e77f24 100644
--- a/test/unit/cute/core/complement.cpp
+++ b/test/unit/cute/core/complement.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/composition.cpp b/test/unit/cute/core/composition.cpp
index 679e7a00..08d4d5b7 100644
--- a/test/unit/cute/core/composition.cpp
+++ b/test/unit/cute/core/composition.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/constants.cpp b/test/unit/cute/core/constants.cpp
index 562d8600..4e660c47 100644
--- a/test/unit/cute/core/constants.cpp
+++ b/test/unit/cute/core/constants.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/core_unit.cpp b/test/unit/cute/core/core_unit.cpp
index ea3e72a0..f4229b83 100644
--- a/test/unit/cute/core/core_unit.cpp
+++ b/test/unit/cute/core/core_unit.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/domain_distribute.cpp b/test/unit/cute/core/domain_distribute.cpp
index f12b9177..728e1ace 100644
--- a/test/unit/cute/core/domain_distribute.cpp
+++ b/test/unit/cute/core/domain_distribute.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/int_tuple.cpp b/test/unit/cute/core/int_tuple.cpp
index 0ef68f7a..c97b812e 100644
--- a/test/unit/cute/core/int_tuple.cpp
+++ b/test/unit/cute/core/int_tuple.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/inverse_left.cpp b/test/unit/cute/core/inverse_left.cpp
index 142d80fb..e06c3ead 100644
--- a/test/unit/cute/core/inverse_left.cpp
+++ b/test/unit/cute/core/inverse_left.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/inverse_right.cpp b/test/unit/cute/core/inverse_right.cpp
index 69c8ccdd..8edb0ca4 100644
--- a/test/unit/cute/core/inverse_right.cpp
+++ b/test/unit/cute/core/inverse_right.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/logical_divide.cpp b/test/unit/cute/core/logical_divide.cpp
index 061fd548..bfeba1d5 100644
--- a/test/unit/cute/core/logical_divide.cpp
+++ b/test/unit/cute/core/logical_divide.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/logical_product.cpp b/test/unit/cute/core/logical_product.cpp
index 4d9aa0cd..8289cf1d 100644
--- a/test/unit/cute/core/logical_product.cpp
+++ b/test/unit/cute/core/logical_product.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/math.cpp b/test/unit/cute/core/math.cpp
index b022df56..1478a867 100644
--- a/test/unit/cute/core/math.cpp
+++ b/test/unit/cute/core/math.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/mixedbits.cpp b/test/unit/cute/core/mixedbits.cpp
index 10c82883..c0d03afd 100644
--- a/test/unit/cute/core/mixedbits.cpp
+++ b/test/unit/cute/core/mixedbits.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/nullspace.cpp b/test/unit/cute/core/nullspace.cpp
index 3f4d11a5..f4eff603 100644
--- a/test/unit/cute/core/nullspace.cpp
+++ b/test/unit/cute/core/nullspace.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/packed_tuple.cpp b/test/unit/cute/core/packed_tuple.cpp
index fbbcab05..77584e88 100644
--- a/test/unit/cute/core/packed_tuple.cpp
+++ b/test/unit/cute/core/packed_tuple.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/pointer.cpp b/test/unit/cute/core/pointer.cpp
index 0f248697..efeced74 100644
--- a/test/unit/cute/core/pointer.cpp
+++ b/test/unit/cute/core/pointer.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/reverse.cpp b/test/unit/cute/core/reverse.cpp
index 8fd4044f..c6273ed7 100644
--- a/test/unit/cute/core/reverse.cpp
+++ b/test/unit/cute/core/reverse.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/swizzle_layout.cpp b/test/unit/cute/core/swizzle_layout.cpp
index 211a40fa..57511683 100644
--- a/test/unit/cute/core/swizzle_layout.cpp
+++ b/test/unit/cute/core/swizzle_layout.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/transform.cpp b/test/unit/cute/core/transform.cpp
index 2fda83fe..81801d30 100644
--- a/test/unit/cute/core/transform.cpp
+++ b/test/unit/cute/core/transform.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/tuple.cpp b/test/unit/cute/core/tuple.cpp
index fa2a0ccf..f1efb36e 100644
--- a/test/unit/cute/core/tuple.cpp
+++ b/test/unit/cute/core/tuple.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/core/tuple_find.cpp b/test/unit/cute/core/tuple_find.cpp
index 0eeeb166..7ce1e6a5 100644
--- a/test/unit/cute/core/tuple_find.cpp
+++ b/test/unit/cute/core/tuple_find.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/CMakeLists.txt b/test/unit/cute/hopper/CMakeLists.txt
index f77aad93..79813000 100644
--- a/test/unit/cute/hopper/CMakeLists.txt
+++ b/test/unit/cute/hopper/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/bulk_load.cu b/test/unit/cute/hopper/bulk_load.cu
index 06281e2a..678f6cc7 100644
--- a/test/unit/cute/hopper/bulk_load.cu
+++ b/test/unit/cute/hopper/bulk_load.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/bulk_store.cu b/test/unit/cute/hopper/bulk_store.cu
index 2ff4d135..b4ee1a25 100644
--- a/test/unit/cute/hopper/bulk_store.cu
+++ b/test/unit/cute/hopper/bulk_store.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/cooperative_gemm.cu b/test/unit/cute/hopper/cooperative_gemm.cu
index 7d992510..c6ed2eb2 100644
--- a/test/unit/cute/hopper/cooperative_gemm.cu
+++ b/test/unit/cute/hopper/cooperative_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/stsm.cu b/test/unit/cute/hopper/stsm.cu
index ae6b3b80..77ea5c7c 100644
--- a/test/unit/cute/hopper/stsm.cu
+++ b/test/unit/cute/hopper/stsm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_load.cu b/test/unit/cute/hopper/tma_load.cu
index 0105d351..c2719ff1 100644
--- a/test/unit/cute/hopper/tma_load.cu
+++ b/test/unit/cute/hopper/tma_load.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_load_testbed.hpp b/test/unit/cute/hopper/tma_load_testbed.hpp
index 58d19e4a..dfd9831f 100644
--- a/test/unit/cute/hopper/tma_load_testbed.hpp
+++ b/test/unit/cute/hopper/tma_load_testbed.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_mcast_load.cu b/test/unit/cute/hopper/tma_mcast_load.cu
index 9a330716..27b44d64 100644
--- a/test/unit/cute/hopper/tma_mcast_load.cu
+++ b/test/unit/cute/hopper/tma_mcast_load.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_mcast_load_testbed.hpp b/test/unit/cute/hopper/tma_mcast_load_testbed.hpp
index bca37879..3e0ec46d 100644
--- a/test/unit/cute/hopper/tma_mcast_load_testbed.hpp
+++ b/test/unit/cute/hopper/tma_mcast_load_testbed.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_store.cu b/test/unit/cute/hopper/tma_store.cu
index e347a0ac..c287b4f0 100644
--- a/test/unit/cute/hopper/tma_store.cu
+++ b/test/unit/cute/hopper/tma_store.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/hopper/tma_store_testbed.hpp b/test/unit/cute/hopper/tma_store_testbed.hpp
index ebdec55a..0429d243 100644
--- a/test/unit/cute/hopper/tma_store_testbed.hpp
+++ b/test/unit/cute/hopper/tma_store_testbed.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/layout/CMakeLists.txt b/test/unit/cute/layout/CMakeLists.txt
index f7e5c77a..6e9f9956 100644
--- a/test/unit/cute/layout/CMakeLists.txt
+++ b/test/unit/cute/layout/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/layout/layout_operator.cu b/test/unit/cute/layout/layout_operator.cu
index 06a823bd..df4a30f0 100644
--- a/test/unit/cute/layout/layout_operator.cu
+++ b/test/unit/cute/layout/layout_operator.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/msvc_compilation/CMakeLists.txt b/test/unit/cute/msvc_compilation/CMakeLists.txt
index a6ba58f9..c7ab7ae6 100644
--- a/test/unit/cute/msvc_compilation/CMakeLists.txt
+++ b/test/unit/cute/msvc_compilation/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/msvc_compilation/tuple.cpp b/test/unit/cute/msvc_compilation/tuple.cpp
index a8a31dd3..c4cf11f4 100644
--- a/test/unit/cute/msvc_compilation/tuple.cpp
+++ b/test/unit/cute/msvc_compilation/tuple.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/turing/CMakeLists.txt b/test/unit/cute/turing/CMakeLists.txt
index ac8a0487..f6c6f64b 100644
--- a/test/unit/cute/turing/CMakeLists.txt
+++ b/test/unit/cute/turing/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/turing/cooperative_gemm.cu b/test/unit/cute/turing/cooperative_gemm.cu
index 1bda5cf7..b05c179e 100644
--- a/test/unit/cute/turing/cooperative_gemm.cu
+++ b/test/unit/cute/turing/cooperative_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/volta/CMakeLists.txt b/test/unit/cute/volta/CMakeLists.txt
index 27ebcb29..dc62dc10 100644
--- a/test/unit/cute/volta/CMakeLists.txt
+++ b/test/unit/cute/volta/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/volta/cooperative_gemm.cu b/test/unit/cute/volta/cooperative_gemm.cu
index 54cf4f22..9302d268 100644
--- a/test/unit/cute/volta/cooperative_gemm.cu
+++ b/test/unit/cute/volta/cooperative_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/cute/volta/vectorization_auto.cu b/test/unit/cute/volta/vectorization_auto.cu
index 585abf0e..b15d5b8c 100644
--- a/test/unit/cute/volta/vectorization_auto.cu
+++ b/test/unit/cute/volta/vectorization_auto.cu
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt
index bb3c2ca3..86fb6002 100755
--- a/test/unit/epilogue/CMakeLists.txt
+++ b/test/unit/epilogue/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt
index 7410063b..ba4124c2 100644
--- a/test/unit/epilogue/thread/CMakeLists.txt
+++ b/test/unit/epilogue/thread/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/thread/activation.cu b/test/unit/epilogue/thread/activation.cu
index 76d8d903..65f79362 100644
--- a/test/unit/epilogue/thread/activation.cu
+++ b/test/unit/epilogue/thread/activation.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu
index 66e07a38..b896053f 100644
--- a/test/unit/epilogue/thread/linear_combination.cu
+++ b/test/unit/epilogue/thread/linear_combination.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu
index 6cbc9589..0af876aa 100644
--- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu
+++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt
index 90da0a5f..70348da4 100755
--- a/test/unit/epilogue/threadblock/CMakeLists.txt
+++ b/test/unit/epilogue/threadblock/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
index 70c3b523..257076ab 100644
--- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
+++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu
index 0b8d45a5..3aa30d4a 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
index 3e7e5a13..f10541eb 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
index 81b1d99f..12659a88 100644
--- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
+++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
index 8858ca17..ba4dd391 100644
--- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
index 9f8a58d6..3599d4ad 100644
--- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
index e4e179fb..3a0929f5 100644
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h b/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
index 5ed1431b..3163a0d0 100644
--- a/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
+++ b/test/unit/epilogue/threadblock/epilogue_with_reduction_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
index 23786aec..2f4a66e2 100644
--- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
+++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu
index f59f5dc9..b6b495d0 100644
--- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu
+++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
index ce8f9aee..f005f16e 100644
--- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
+++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h
index b773d27c..e2457fdb 100644
--- a/test/unit/epilogue/threadblock/testbed.h
+++ b/test/unit/epilogue/threadblock/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h
index 00d02aff..a76578f7 100644
--- a/test/unit/epilogue/threadblock/testbed_planar_complex.h
+++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt
index 044af86a..d2e67ad4 100644
--- a/test/unit/epilogue/warp/CMakeLists.txt
+++ b/test/unit/epilogue/warp/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
index 8be67942..94bf9536 100644
--- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
index 7c6c7418..93e0daba 100644
--- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
index 45776f71..b4b13cc4 100644
--- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
+++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt
index c73d930d..75fa9cbe 100644
--- a/test/unit/gemm/CMakeLists.txt
+++ b/test/unit/gemm/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt
index 87b6e53d..fedee385 100644
--- a/test/unit/gemm/device/CMakeLists.txt
+++ b/test/unit/gemm/device/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/default_gemm_configuration.hpp b/test/unit/gemm/device/default_gemm_configuration.hpp
index 35e7f8b3..0054a1b6 100644
--- a/test/unit/gemm/device/default_gemm_configuration.hpp
+++ b/test/unit/gemm/device/default_gemm_configuration.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -199,6 +199,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    float,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<float, 1, float, float>,
@@ -331,6 +332,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    float,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<float, 1, float, float>,
@@ -398,6 +400,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
   >;
 
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    int32_t,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<int32_t, 1, int32_t, int32_t>,
@@ -506,6 +509,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>,
@@ -582,6 +586,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, int32_t, int32_t>,
@@ -646,6 +651,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, int32_t, int32_t>,
@@ -708,6 +714,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, int32_t, int32_t>,
@@ -770,6 +777,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, int32_t, int32_t>,
@@ -834,6 +842,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>,
@@ -894,6 +903,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>,
@@ -955,6 +965,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>,
@@ -1016,6 +1027,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    ElementC,
     TagToStrideC_t<LayoutC>,
     TagToStrideC_t<LayoutC>,
     epilogue::thread::LinearCombination<ElementC, 1, ElementAccumulator, ElementAccumulator>,
@@ -1081,6 +1093,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    double,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     epilogue::thread::LinearCombination<double, 1, double, double>,
@@ -1160,6 +1173,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    double,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     epilogue::thread::LinearCombination<double, 1, double, double>,
@@ -1225,6 +1239,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    double,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     epilogue::thread::LinearCombination<double, 1, double, double>,
@@ -1290,6 +1305,7 @@ struct DefaultGemmConfigurationToCutlass3Types<
 
   // Epilogue
   using CollectiveEpilogue = epilogue::collective::DefaultEpilogue<
+    double,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     TagToStrideC_t<cutlass::layout::ColumnMajor>,
     epilogue::thread::LinearCombination<double, 1, double, double>,
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
index 7459edc9..1959e1e5 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
index 2f67a022..7d1332ea 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
index c283f75b..f384af94 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
index b3c6eb27..dcecca89 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
index 330ac512..522325c7 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
index 96bc99f5..235b9c58 100644
--- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
index fb59a04f..1004e7e3 100644
--- a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
index 8e107e95..53360dd1 100644
--- a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
index 02f34628..a34bb16f 100644
--- a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
index be739dba..e98f3c6d 100644
--- a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
index 4e51b1cf..0ec6e420 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
index c3972b9f..23b6c2e0 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
index 0508c1f7..d200329d 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
index 097db4a3..b132dbcd 100644
--- a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
index 2317b41f..2c10baeb 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu
index a332f767..dcc87e40 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
index aefa451d..7d947bfd 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
index e246d3b4..3454ec33 100644
--- a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
index 94b46326..35e9f2b5 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_direct_store_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
index b0f964bb..cdcffbad 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
index 3d110dc6..79a6a8d2 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
index bfcaf180..9fa27f42 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
index e4223d5a..6e13bb67 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
index 0764a012..5c9d727e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
index 6c34478a..c8fbc06a 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
index 587f2b23..2ceb6ab5 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
index 5a734b9e..763e5089 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
index 71b95d67..1807a2fa 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
index 28961576..9953aeae 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
index 1587320c..4d6e8d37 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
index 52c2aaed..860f06df 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
index 9cd589d4..4822bd09 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
index 8c652f38..453c9f25 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
index c64d7be6..871995e7 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
index 12606f27..d7fc7410 100644
--- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
index d4aa9a2c..d98082f4 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
index 77a46eef..3d721da0 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
index 9b4c1657..7d03e78d 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
index 3670d282..8877c8a7 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
index 2317ed96..77460f4b 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
index 6d499ad5..0a4a2de1 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
index b05e95d2..cd9a9c10 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
index 794ce6fc..c681a68f 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
index 5e54e3a9..51d694da 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
index 5db310a3..99e31424 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
index b8d3c243..d204741e 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
index 27734528..8f2db970 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
index 1d2d0f58..cfc717d0 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
index 29fda22c..b6abeb7d 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
index 81aa8016..5adaa160 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
index d638de05..5b4f79bb 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
index 14320374..a532a5f6 100644
--- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
index 8c5ee861..d9df461a 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
index 45ba6158..cda88b45 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
index ecfc8379..6776c7d4 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
index 92604db0..0ac9800b 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_broadcast_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_broadcast_sm80.cu
index ef8f9d4b..43ceead1 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_broadcast_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_broadcast_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
index 2968cb79..588bf345 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
index fb26e42b..8d157741 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
index c901facf..a2edea8a 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
index 77696489..9a0acfb5 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
index 18871125..c2bbf14f 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu
index 0fe636e5..ef520c3d 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu
index 6c80e839..d6b83446 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
index ef7b17bc..6b6c7bc1 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
index bc0c36ae..274e816a 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
index fc35e615..8e2819f3 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
index c503f904..db711c1a 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
index dd5f03d4..82a10dc8 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
index a5a613b8..aaab5973 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
index a59df7f7..3b7726ed 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
index 01e191ba..3e1217d3 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
index b3628631..220e861f 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
index 1e6cfc05..20d13f0f 100644
--- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
index f4e20c1b..3d24351d 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
index c9e993ba..e8a8718e 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
index 57a1fb37..86a77071 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
index 5e01ed21..2b96f728 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
index d9a03f55..c191c4e0 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
index e3593580..d72a6e13 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
index f1c3e34b..77550d09 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
index 4b7bad63..2f146243 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
index 87d71286..a9063b99 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
index 0da78ec4..eda50039 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
index 280f0e71..2408536d 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
index 11971be9..56bea685 100644
--- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
+++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
index 2778497f..c2135c66 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
index 2df79761..47cd22fd 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
index 3b05ae18..f359cb06 100644
--- a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
index 452250fb..a6a429fe 100644
--- a/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
index 5fd7d7d4..be3b5904 100644
--- a/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
index 3988e6e4..0e1a9cf2 100644
--- a/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_f32t_f32t_f32t_tensor_op_f32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
index fd787289..e001da46 100644
--- a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
index 0d446439..fcf7ff1d 100644
--- a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
index 03748bd4..40503fda 100644
--- a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
index 2b1906d5..fc1d5e88 100644
--- a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sm89.cu b/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sm89.cu
index 7208ea53..f82f7291 100644
--- a/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sm89.cu
+++ b/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sm89.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sparse_sm89.cu b/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sparse_sm89.cu
index d21020f2..d0b0d068 100644
--- a/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sparse_sm89.cu
+++ b/test/unit/gemm/device/gemm_f8t_f8n_f32t_tensor_op_f32_sparse_sm89.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sm89.cu b/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sm89.cu
index 0f7f4d95..1d0bfd95 100644
--- a/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sm89.cu
+++ b/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sm89.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sparse_sm89.cu b/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sparse_sm89.cu
index 0733bc70..8007ece8 100644
--- a/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sparse_sm89.cu
+++ b/test/unit/gemm/device/gemm_f8t_f8n_f8t_tensor_op_f32_sparse_sm89.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_grouped_scheduler_sm80.cu b/test/unit/gemm/device/gemm_grouped_scheduler_sm80.cu
index 45568419..5266ed81 100644
--- a/test/unit/gemm/device/gemm_grouped_scheduler_sm80.cu
+++ b/test/unit/gemm/device/gemm_grouped_scheduler_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_grouped_sm80.cu b/test/unit/gemm/device/gemm_grouped_sm80.cu
index 0f098038..b19c44e7 100644
--- a/test/unit/gemm/device/gemm_grouped_sm80.cu
+++ b/test/unit/gemm/device/gemm_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
index eb2d0478..f2f6b10e 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
index d6648a5f..21b2fcd8 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
index e0ad29bc..b47b1cb5 100644
--- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
index b3e790d6..99e005a8 100644
--- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
index 8032d098..445cb000 100644
--- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
index f5c80110..d027ad25 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
index d7f3cb90..dd0a6c5f 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
index 49dbd73a..30be420f 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
index 819a487e..63481c47 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
index 7eede816..e80aa322 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
index 73d45d54..ee0d09af 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
index 69ed4882..33add5e3 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
index f032b4cd..ad957ac6 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
index 80d42e31..293620c6 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
index 768201be..62eb53da 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
index f34e47e5..7be69819 100644
--- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
index f3efdb0a..491d3378 100644
--- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
index b20ec8fb..3a8d5cf3 100644
--- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_f16t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_f16t_tensor_op_s32_sm80.cu
index 45620463..dd5f1c4b 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_f16t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_f16t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
index a9a18c33..8ee1a7aa 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
index 9aeb5757..b6287dd9 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
index ba0ff864..0f541aab 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
index 80e0506a..849942fd 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
index a7ff179a..50749c43 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
index 96b56322..8e879f20 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
index 1d4c84f1..f315d4db 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
index c95d8423..8cafe653 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
index d48abf84..9f8e7eb1 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
index 5173ac09..36edee74 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
index 9e9da8f4..4307714f 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
index ad655fd8..bf79444d 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /**************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
index d3453925..0358789d 100644
--- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
index 4fc06e6a..311b50c1 100644
--- a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
index 643b595d..2cc37fb2 100644
--- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
+++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
index b30a4e2d..dead164b 100644
--- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
+++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
index 12155491..5c0834b2 100644
--- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
+++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp
index 3a6cf0b2..a9db8715 100644
--- a/test/unit/gemm/device/gemm_testbed_3x.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -208,7 +208,7 @@ struct IsLegacyEpiloguePolicy {
 };
 
 template <typename Epilogue>
-struct IsLegacyEpiloguePolicy<Epilogue, cute::void_t<typename Epilogue::DispatchPolicy>> {
+struct IsLegacyEpiloguePolicy<Epilogue, cute::void_t<decltype(Epilogue::DispatchPolicy::FragmentSize)>> {
   using EpiloguePolicy = typename Epilogue::DispatchPolicy;
   static constexpr bool value = cute::is_same_v<
                                       EpiloguePolicy,
@@ -830,11 +830,11 @@ template<
 >
 struct HostCollectiveMainloop<ScheduleType_, Gemm, ElementA_, ElementB_,
     cute::enable_if_t<
-      cute::is_same_v<
-        typename Gemm::CollectiveMainloop::DispatchPolicy, 
+      cute::is_base_of_v<
         cutlass::gemm::MainloopSm90TmaGmmaWarpSpecializedSparse<Gemm::CollectiveMainloop::DispatchPolicy::Stages,
                                                                 typename Gemm::CollectiveMainloop::DispatchPolicy::ClusterShape,
-                                                                ScheduleType_>>>>
+                                                                ScheduleType_>,
+        typename Gemm::CollectiveMainloop::DispatchPolicy>>>
   : HostCollectiveMainloopSparse<Gemm, ElementA_, ElementB_>
 {
   using HostCollectiveMainloopSparse<Gemm, ElementA_, ElementB_>::HostCollectiveMainloopSparse;
diff --git a/test/unit/gemm/device/gemm_testbed_3x_evt.hpp b/test/unit/gemm/device/gemm_testbed_3x_evt.hpp
index d6f0e0b1..f18a7b39 100644
--- a/test/unit/gemm/device/gemm_testbed_3x_evt.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x_evt.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp b/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp
index 479102b3..db1114ba 100644
--- a/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x_ptr_array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -28,7 +28,6 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  **************************************************************************************************/
-
 /*! \file
     \brief Testbed for Ptr-Array and Grouped GEMM interface
 */
@@ -1354,6 +1353,8 @@ struct HostCollectiveEpilogue {
       decltype(Valpha),
       decltype(Vbeta),
       ActivationFunctor
+      , cutlass::plus<ElementCompute>
+      , false
     > epilogue_params{};
 
     epilogue_params.C = C;
@@ -1602,14 +1603,14 @@ struct TestbedImpl {
     mainloop_args = collective_mma_inputs.to_args(problem_shapes);
 
     if constexpr (IsGroupGemm) {
-    arguments =
-    {
-      cutlass::gemm::GemmUniversalMode::kGrouped,
-      problem_shapes,
-      mainloop_args,
-      collective_epilogue.to_args(problem_shapes),
-      hw_info
-    };
+      arguments =
+      {
+        cutlass::gemm::GemmUniversalMode::kGrouped,
+        problem_shapes,
+        mainloop_args,
+        collective_epilogue.to_args(problem_shapes),
+        hw_info
+      };
     }
     else {
       arguments =
@@ -1797,6 +1798,127 @@ bool TestAll(double alpha = 1.0, double beta = 0.0, CheckEquality check_relative
   return passed;
 }
 
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/// Small problem-size sweep: runs Testbed3x for each (batch, wave, k) combination and returns false on the first failure.
+template <typename Gemm, bool force_legacy_epilogue = false, bool apply_alignment_offset = false>
+bool TestSmall(double alpha = 1.0, double beta = 1.0,
+  CheckEquality check_relative_equality = CheckEquality::RELATIVE, 
+  ScalarLoc use_device_scalars = ScalarLoc::ON_DEVICE, 
+  VectorScale vector_scale_mode = VectorScale::ENABLED, 
+  std::vector<int> override_problem_size_k = {}) {
+  using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+  using ElementScalar = typename Gemm::EpilogueOutputOp::ElementScalar;
+  using ElementA = typename Gemm::GemmKernel::ElementA;
+  using ElementB = typename Gemm::GemmKernel::ElementB;
+  using TiledMma = typename Gemm::GemmKernel::TiledMma;
+  int alignment_bits = 128;
+  int alignment_input = (alignment_bits / cute::sizeof_bits<ElementA>::value == 128) ? 0 : (alignment_bits / cute::sizeof_bits<ElementA>::value);
+  // NOTE(review): apply_alignment_offset is not read anywhere in this function; alignment_input above is always applied.
+  using CtaShape_MNK = typename Gemm::GemmKernel::CollectiveMainloop::CtaShape_MNK;
+  using DispatchPolicy = typename Gemm::GemmKernel::CollectiveMainloop::DispatchPolicy;
+  CtaShape_MNK cta_shape;
+  Testbed3x<Gemm, cutlass::epilogue::thread::Identity, force_legacy_epilogue> testbed(check_relative_equality, use_device_scalars, vector_scale_mode);
+  // For Ptr-Array and Grouped GEMM ideally we need to know SM count at runtime 
+  static constexpr int SmCount = 16;
+  // Sweep values: wave counts are multiplied by SmCount below; a single batch (or group) count is tested.
+  float waves[] = {0.5, 2.5};
+  int batches[] = {3};
+  int cluster_m = 1;
+  int cluster_n = 1;
+
+  std::vector<int> problem_size_k;
+  if (override_problem_size_k.empty()) {
+    // this is to test with min alignment
+    problem_size_k = {256 - alignment_input, 512 + alignment_input};
+  }
+  else {
+    problem_size_k = override_problem_size_k;
+  }
+  // For dispatch policies with kMinComputeCapability >= 90, read the cluster extents from the policy; otherwise they stay 1x1.
+  if constexpr(DispatchPolicy::ArchTag::kMinComputeCapability >= 90) {
+    typename DispatchPolicy::ClusterShape cluster_shape;
+    cluster_m = cute::size<0>(cluster_shape);
+    cluster_n = cute::size<1>(cluster_shape);
+  }
+
+  bool passed = true;
+
+  for (int batch : batches) {
+    for (float wave : waves) {
+      for (int k : problem_size_k) {
+        int grid_m, grid_n = 0; // grid_m is assigned in both branches below
+        float num_grid = wave * SmCount;
+        // Fix the grid extent along the larger cluster dimension to that cluster size; derive the other from num_grid.
+        if (cluster_m >= cluster_n) {
+          grid_m = cluster_m;
+          grid_n = static_cast<int>(num_grid) / grid_m;
+          // Align grid_n to cluster_n
+          grid_n = std::max((grid_n + cluster_n - 1 ) / cluster_n * cluster_n, 1);
+        }
+        else {
+          grid_n = cluster_n;
+          grid_m = static_cast<int>(num_grid) / grid_n;
+          // Align grid_m to cluster_m
+          grid_m = std::max((grid_m + cluster_m - 1 ) / cluster_m * cluster_m, 1);
+        }
+
+        int m = grid_m * cute::size<0>(cta_shape) - alignment_input; // this is just to test with unusual problem shapes
+        int n = grid_n * cute::size<1>(cta_shape) + alignment_input;
+        // Grouped GEMM: build one problem-shape entry per group (scaled by the group index) on host and device.
+        if constexpr (Testbed3x<Gemm, cutlass::epilogue::thread::Identity, force_legacy_epilogue>::IsGroupGemm) {
+          std::vector<typename ProblemShapeType::UnderlyingProblemShape> problem_sizes_host;
+          cutlass::DeviceAllocation<typename ProblemShapeType::UnderlyingProblemShape> problem_sizes_device;
+          for (int i = 0; i < batch; ++i) {
+            problem_sizes_host.push_back({m * ((i % 2) + 1), n * ((i % 3) + 1), k * ((i % 2) + 1)});
+          }
+          problem_sizes_device.reset(problem_sizes_host.size());
+          problem_sizes_device.copy_from_host(problem_sizes_host.data());
+
+          ProblemShapeType problem_shapes{batch, problem_sizes_device.get(), problem_sizes_host.data()};
+
+          if (CUTLASS_DEBUG_TRACE_LEVEL > 0) {
+            for (int i = 0; i < batch; ++i) {
+              std::cout << "problem_shapes : "  << problem_shapes.get_host_problem_shape(i) << " \n";
+            }
+          }
+          passed = testbed.run(
+            problem_shapes,
+            cutlass::from_real<ElementScalar>(alpha),
+            cutlass::from_real<ElementScalar>(beta)
+          );
+        }
+        else { // non-grouped path: a single (m, n, k, batch) problem shape
+          ProblemShapeType problem_shapes{{m, n, k, batch}};
+          if (CUTLASS_DEBUG_TRACE_LEVEL > 0) {
+            std::cout << "problem_shapes : "  << problem_shapes.get_host_problem_shape() << " \n";
+          }
+          passed = testbed.run(
+            problem_shapes,
+            cutlass::from_real<ElementScalar>(alpha),
+            cutlass::from_real<ElementScalar>(beta)
+          );
+        }
+
+        if (!passed) {
+          std::cout << __FILE__ << ':' << __LINE__ << " : GEMM MNK " << m << " " << n << " " << k << " FAILED.\n";
+          return false;
+        }
+      } // k
+    } // waves
+  } // batches
+
+  return passed;
+}
+/// Forwards to TestSmall; here beta defaults to 0.0 and apply_alignment_offset to true, and no K-size override is exposed.
+template <typename Gemm, bool force_legacy_epilogue = false, bool apply_alignment_offset = true>
+bool TestSmallFusion(double alpha = 1.0, double beta = 0.0,
+    CheckEquality check_relative_equality = CheckEquality::RELATIVE,
+    ScalarLoc use_device_scalars = ScalarLoc::ON_DEVICE,
+    VectorScale vector_scale_mode = VectorScale::ENABLED) {
+  return TestSmall<Gemm, force_legacy_epilogue, apply_alignment_offset>(
+    alpha, beta, check_relative_equality, use_device_scalars, vector_scale_mode);
+}
+
 } // namespace device
 } // namespace gemm
 } // namespace test
diff --git a/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp b/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp
index 1c3e6448..4fc24ea4 100644
--- a/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x_tensor_broadcast.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
index 6620c6f0..29d90489 100644
--- a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
index e1d742e8..77a414c9 100644
--- a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
index de068a7b..b9e35d0c 100644
--- a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
index 8c2564f6..9c7ac773 100644
--- a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
index 59c0d0f7..8b06a308 100644
--- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
+++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_bf16t_s8n_bf16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_bf16t_s8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
index de6f28ff..dd9f0d9e 100644
--- a/test/unit/gemm/device/gemm_universal_bf16t_s8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_bf16t_s8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_bf16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_bf16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
index bbeb9a16..2815fed9 100644
--- a/test/unit/gemm/device/gemm_universal_bf16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_bf16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_bf16t_u8n_bf16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_bf16t_u8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
index 49d48492..d708e79c 100644
--- a/test/unit/gemm/device/gemm_universal_bf16t_u8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_bf16t_u8n_bf16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_bf16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_bf16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 93c59c51..ccaaca3b 100644
--- a/test/unit/gemm/device/gemm_universal_bf16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_bf16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
index d26ae201..02169a0a 100644
--- a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
index 0c670050..b107dc4a 100644
--- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
index a2c489c8..8fb34a48 100644
--- a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
index 81d84094..d77ae6ce 100644
--- a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32n_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
index 19518e81..b0492d58 100644
--- a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f16_sm80.cu
index 61134963..a6558670 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f32_sm80.cu
index 86d1da77..6a4e839c 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_s8n_f16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 20da1150..89cd51fd 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_s8n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f16_sm80.cu
index 9b105c9e..fde85b43 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f32_sm80.cu
index b26b2136..2ef1e0b0 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_u8n_f16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_f16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 926a88e8..94d797e4 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_f16t_u8n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s4t_s8n_s32t_mixed_input_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_universal_s4t_s8n_s32t_mixed_input_tensor_op_s32_sm80.cu
index 421ea0c0..1e7b4cf3 100644
--- a/test/unit/gemm/device/gemm_universal_s4t_s8n_s32t_mixed_input_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s4t_s8n_s32t_mixed_input_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s4t_s8n_s8t_mixed_input_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_universal_s4t_s8n_s8t_mixed_input_tensor_op_s32_sm80.cu
index 685092fb..fea56eaa 100644
--- a/test/unit/gemm/device/gemm_universal_s4t_s8n_s8t_mixed_input_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s4t_s8n_s8t_mixed_input_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
index 0622b9a5..4723d9e0 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 8572e559..c058287b 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
index 1c566740..0c6e9e3d 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
index eb4e293a..3f9571e9 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 064c9b04..376df075 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_s4n_s32t_mixed_input_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_s4n_s32t_mixed_input_tensor_op_s32_sm80.cu
index b28cee62..8ea27720 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_s4n_s32t_mixed_input_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_s4n_s32t_mixed_input_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_s8t_s4n_s8t_mixed_input_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_universal_s8t_s4n_s8t_mixed_input_tensor_op_s32_sm80.cu
index 89a52b3e..8d70e79a 100644
--- a/test/unit/gemm/device/gemm_universal_s8t_s4n_s8t_mixed_input_tensor_op_s32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_s8t_s4n_s8t_mixed_input_tensor_op_s32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_u8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_u8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
index 020c8b38..c33fb174 100644
--- a/test/unit/gemm/device/gemm_universal_u8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_u8t_bf16n_bf16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_u8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_u8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
index d6b65974..669cceb2 100644
--- a/test/unit/gemm/device/gemm_universal_u8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_u8t_bf16n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
index 41657c2f..0dd1ab04 100644
--- a/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f16_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
index b2b3cd3a..a7515a4f 100644
--- a/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_u8t_f16n_f16t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_universal_u8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_u8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
index 358c109e..719a184c 100644
--- a/test/unit/gemm/device/gemm_universal_u8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_universal_u8t_f16n_f32t_mixed_input_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu b/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
index 13e44864..a30370d5 100644
--- a/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu b/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
index 1a1c4e03..378ac351 100644
--- a/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
+++ b/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu b/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
index af691458..8c67a38e 100644
--- a/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
+++ b/test/unit/gemm/device/gemm_with_reduction_f16t_f16n_f16n_tensorop_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/gemv.cu b/test/unit/gemm/device/gemv.cu
index 356883bf..f9f65a3c 100644
--- a/test/unit/gemm/device/gemv.cu
+++ b/test/unit/gemm/device/gemv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
index dd26f00e..78a2dbcc 100644
--- a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
index d8aa09b1..f9746cea 100644
--- a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
index 1cff46ce..338f2708 100644
--- a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
index 86396f57..a928c22a 100644
--- a/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf32h_cf32n_tensor_op_fast_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
index 963a073f..238f560d 100644
--- a/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
index de3f78d5..771b661c 100644
--- a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
index acc4d040..9eac0374 100644
--- a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
index f5fbd473..e6d93796 100644
--- a/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
+++ b/test/unit/gemm/device/hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
index 3733bea6..6bbcfa0f 100644
--- a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
index e5b09d97..600f16a1 100644
--- a/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/her2k_cf64_cf64_tensor_op_f64_sm90.cu
index 03f82bad..f8e8ffb8 100644
--- a/test/unit/gemm/device/her2k_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/her2k_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf64h_cf64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/her2k_cf64h_cf64n_tensor_op_f64_grouped_sm80.cu
index 68f3ac98..db2e1c24 100644
--- a/test/unit/gemm/device/her2k_cf64h_cf64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf64h_cf64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
index 6fd74d29..63f27837 100644
--- a/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
index d39ba9cd..8cf4f2a2 100644
--- a/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
index 3d1725d2..7cba0383 100644
--- a/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/her2k_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
index 3a63676e..e1a96c2c 100644
--- a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
index 84fb0fa7..f0bfcbe6 100644
--- a/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/herk_cf32h_cf32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/herk_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/herk_cf64_cf64_tensor_op_f64_sm90.cu
index 4696326b..f51d5f7b 100644
--- a/test/unit/gemm/device/herk_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/herk_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
index 6e99b376..6182ea31 100644
--- a/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/herk_cf64h_cf64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h
index 2fc71864..6ae7b864 100644
--- a/test/unit/gemm/device/multistage_testbed.h
+++ b/test/unit/gemm/device/multistage_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h
index 361f977b..e309208b 100644
--- a/test/unit/gemm/device/multistage_testbed_interleaved.h
+++ b/test/unit/gemm/device/multistage_testbed_interleaved.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/rank_2k_grouped_scheduler_sm80.cu b/test/unit/gemm/device/rank_2k_grouped_scheduler_sm80.cu
index c6141383..de3d8340 100644
--- a/test/unit/gemm/device/rank_2k_grouped_scheduler_sm80.cu
+++ b/test/unit/gemm/device/rank_2k_grouped_scheduler_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
index e1a9ec6a..fe8917d3 100644
--- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
index 648c8794..ea4794ab 100644
--- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm80.cu b/test/unit/gemm/device/simt_cgemm_nt_sm80.cu
index e1b829f6..145e3491 100644
--- a/test/unit/gemm/device/simt_cgemm_nt_sm80.cu
+++ b/test/unit/gemm/device/simt_cgemm_nt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
index daae04a6..e10235f9 100644
--- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm80.cu b/test/unit/gemm/device/simt_cgemm_tn_sm80.cu
index 81b22e1d..58ae1c45 100644
--- a/test/unit/gemm/device/simt_cgemm_tn_sm80.cu
+++ b/test/unit/gemm/device/simt_cgemm_tn_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
index eda445d6..915bccbd 100644
--- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
index 4f56824e..1c2af695 100644
--- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
index 92c3467c..c78564c8 100644
--- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
index 1a34ddd0..b6ca47de 100644
--- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
index 31af0475..d92e6dba 100644
--- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_f8gemm_tn_sm50.cu b/test/unit/gemm/device/simt_f8gemm_tn_sm50.cu
index 88248a27..91e3efd8 100644
--- a/test/unit/gemm/device/simt_f8gemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_f8gemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
index 028b6061..484b7255 100644
--- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
index f756823e..8dbdf92b 100644
--- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
index 915f452d..1970a885 100644
--- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
index 4cf9876c..3234315c 100644
--- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu
index f6b3859b..06c7604a 100644
--- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu
index 13551cef..90883131 100644
--- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu
index bcae3fce..7471fca3 100644
--- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu
index 332ec856..1144e028 100644
--- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu
index e1ee3cd4..fb98a041 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
index 39d8a0a6..b59d1956 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
index 08587742..6d314de0 100644
--- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
+++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_qgemm_nn_sm50.cu b/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
index fc58a990..51c5150d 100644
--- a/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_qgemm_nt_sm50.cu b/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
index 9e92ea9d..b014b263 100644
--- a/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_qgemm_tn_sm50.cu b/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
index 6badfe27..fd3fe186 100644
--- a/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_qgemm_tt_sm50.cu b/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
index bb7555b8..9ba137e5 100644
--- a/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_qgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
index a162c2e7..32b92770 100644
--- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
index 050c08d2..5b3a336d 100644
--- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
index 6c9625a7..dbd6d2da 100644
--- a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
+++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
index 247722ff..cc331735 100644
--- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
index e061e01d..a55b6c72 100644
--- a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
+++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
index fd555bc4..2670d438 100644
--- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py
index ff4d00fc..a1800282 100644
--- a/test/unit/gemm/device/simt_sm50.py
+++ b/test/unit/gemm/device/simt_sm50.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -130,7 +130,7 @@ for precision in precisions:
 
         # write file header
         out.write("/***************************************************************************************************\n"
-" * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.                 \n"
+" * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.                 \n"
 " * SPDX-License-Identifier: BSD-3-Clause                                                           \n"
 " *                                                                                                 \n"
 " * Redistribution and use in source and binary forms, with or without                              \n"
diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
index 06d29f25..1b10cc96 100644
--- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
index 57374e42..7e1c299a 100644
--- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
index b67acaab..dc5cb216 100644
--- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
index 116cfc54..71fb62e7 100644
--- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
+++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm50_gemm_f32_f32_f32_simt.cu b/test/unit/gemm/device/sm50_gemm_f32_f32_f32_simt.cu
index da66a915..d20615df 100644
--- a/test/unit/gemm/device/sm50_gemm_f32_f32_f32_simt.cu
+++ b/test/unit/gemm/device/sm50_gemm_f32_f32_f32_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm50_gemm_f64_f64_f64_simt.cu b/test/unit/gemm/device/sm50_gemm_f64_f64_f64_simt.cu
index db5035eb..d53c2624 100644
--- a/test/unit/gemm/device/sm50_gemm_f64_f64_f64_simt.cu
+++ b/test/unit/gemm/device/sm50_gemm_f64_f64_f64_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm61_gemm_s8_s8_s32_simt.cu b/test/unit/gemm/device/sm61_gemm_s8_s8_s32_simt.cu
index 2e1deb78..cbfad32e 100644
--- a/test/unit/gemm/device/sm61_gemm_s8_s8_s32_simt.cu
+++ b/test/unit/gemm/device/sm61_gemm_s8_s8_s32_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_f16_f16_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm80_gemm_f16_f16_f32_tensor_op_f32.cu
index 52fb5c61..3126ef0e 100644
--- a/test/unit/gemm/device/sm80_gemm_f16_f16_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm80_gemm_f16_f16_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_f32_f32_f32_simt.cu b/test/unit/gemm/device/sm80_gemm_f32_f32_f32_simt.cu
index 3174a7ad..4ade1e80 100644
--- a/test/unit/gemm/device/sm80_gemm_f32_f32_f32_simt.cu
+++ b/test/unit/gemm/device/sm80_gemm_f32_f32_f32_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_f64_f64_f64_simt.cu b/test/unit/gemm/device/sm80_gemm_f64_f64_f64_simt.cu
index bfbcbc1c..de99aa84 100644
--- a/test/unit/gemm/device/sm80_gemm_f64_f64_f64_simt.cu
+++ b/test/unit/gemm/device/sm80_gemm_f64_f64_f64_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_f64_f64_f64_tensor_op_f64.cu b/test/unit/gemm/device/sm80_gemm_f64_f64_f64_tensor_op_f64.cu
index 66ba8488..f46ab3e6 100644
--- a/test/unit/gemm/device/sm80_gemm_f64_f64_f64_tensor_op_f64.cu
+++ b/test/unit/gemm/device/sm80_gemm_f64_f64_f64_tensor_op_f64.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_s8_s8_s32_tensor_op.cu b/test/unit/gemm/device/sm80_gemm_s8_s8_s32_tensor_op.cu
index d8b0ca9d..c65df2c9 100644
--- a/test/unit/gemm/device/sm80_gemm_s8_s8_s32_tensor_op.cu
+++ b/test/unit/gemm/device/sm80_gemm_s8_s8_s32_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu
index 93ec737a..aaab2bbd 100644
--- a/test/unit/gemm/device/sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm80_gemm_tf32_tf32_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_evt_operations.hpp b/test/unit/gemm/device/sm90_evt_operations.hpp
index 73f228d4..63ffc328 100644
--- a/test/unit/gemm/device/sm90_evt_operations.hpp
+++ b/test/unit/gemm/device/sm90_evt_operations.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32.cu
index b9bd305e..23133d98 100644
--- a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized.cu
index 589c4fad..aef0b512 100644
--- a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
index 5ca50813..cbbd2dd7 100644
--- a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_pingpong.cu b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
index 16dfb285..74088072 100644
--- a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu
index 79ebc2ad..63845512 100644
--- a/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32.cu
index ecf91a69..27b1fd04 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized.cu
index c2db370a..b0c806cc 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
index 2d64c036..4b595257 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_pingpong.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
index 8551aaaf..9c58970c 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_alignx_tensor_op_f32_warpspecialized_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op.cu
index ed92ef43..e0bc25c9 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_unspecialized.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_unspecialized.cu
index 33864ea5..85a41b7d 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_unspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_unspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu
index dccef44d..d2ffa6eb 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu
index 34f10fbc..dcdcad13 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_load.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_load.cu
index 5ca84b13..107e3569 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_load.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_load.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu
index bf8b1fc7..3d39fb3a 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_aux_store.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu
index c27c8de9..a2fd5bc7 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_dag.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_dag.cu
index 9f381b79..62b42afe 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_dag.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_dag.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_reduce.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_reduce.cu
index 9e15c91d..0f55d464 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_reduce.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_reduce.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_row_broadcast.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_row_broadcast.cu
index 7739dec5..de05c859 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_row_broadcast.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_row_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong.cu
index 3bb324ca..f87451ca 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_aux_load.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_aux_load.cu
index cf435511..1f4fd652 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_aux_load.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_aux_load.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_bias_elementwise.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_bias_elementwise.cu
index e9e2dba1..c510f5c3 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_bias_elementwise.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_bias_elementwise.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_dag.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_dag.cu
index 94c4a655..3b1c140f 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_dag.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_dag.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_reduce.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_reduce.cu
index 544b4fd5..94699493 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_reduce.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_reduce.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_row_broadcast.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_row_broadcast.cu
index df791d8d..142992f9 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_row_broadcast.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_pingpong_row_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cooperative_stream_k.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cooperative_stream_k.cu
index b5b53b1a..28939021 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cooperative_stream_k.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cooperative_stream_k.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
index 7031a243..ad29c326 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm_pingpong.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm_pingpong.cu
index d387bb9f..e509a6e2 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_group_gemm_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
index 53748dc8..35ea9837 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array_pingpong.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array_pingpong.cu
index 3e8be486..24ce8422 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_tensor_broadcast.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_tensor_broadcast.cu
index 0774aa82..11764103 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_tensor_broadcast.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_tensor_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f16_f16_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_f16_f16_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
index 928e20a2..e4f24e61 100644
--- a/test/unit/gemm/device/sm90_gemm_f16_f16_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_f16_f16_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu
index 3b911f1f..2ec4b901 100644
--- a/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32_tensor_broadcast.cu b/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32_tensor_broadcast.cu
index 575f7a7d..c592ad18 100644
--- a/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32_tensor_broadcast.cu
+++ b/test/unit/gemm/device/sm90_gemm_f32_f32_f32_tensor_op_f32_tensor_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32.cu
index b89a214c..59b1526f 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32.cu
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu
index 04080117..70a3f73d 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_bf16_tensor_op_fp32_evt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative.cu
index d9e7830e..6d27de4a 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative_evt.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative_evt.cu
index bcf837d6..404cbf3e 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative_evt.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cluster_warpspecialized_cooperative_evt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cooperative_stream_k.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cooperative_stream_k.cu
index 86fc08c6..db1f4c16 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cooperative_stream_k.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_cooperative_stream_k.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
index 86414021..da694d53 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_f32_rs_cluster_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu
index a2fd5f66..63956c90 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32.cu
index 9cf8f312..e8611f63 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32.cu
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu
index 9873594c..7d35289d 100644
--- a/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu
+++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f8_tensor_op_fp32_evt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu
index 0c28f575..a0e61dbd 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized.cu
index a19ec463..73efb306 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_cooperative.cu
index 371e1f93..c0700142 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_pingpong.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_pingpong.cu
index 43d6ad3c..e86c446a 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_alignx_tensor_op_s32_warpspecialized_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu
index a54245c0..15a369af 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32_tensor_broadcast.cu b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32_tensor_broadcast.cu
index 864ee38c..5f88171d 100644
--- a/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32_tensor_broadcast.cu
+++ b/test/unit/gemm/device/sm90_gemm_s8_s8_s8_tensor_op_s32_tensor_broadcast.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_stream_k_scheduler.cu b/test/unit/gemm/device/sm90_gemm_stream_k_scheduler.cu
index 2af2a3b0..2581f3bf 100644
--- a/test/unit/gemm/device/sm90_gemm_stream_k_scheduler.cu
+++ b/test/unit/gemm/device/sm90_gemm_stream_k_scheduler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu
index b23c0150..b8b8b67b 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized.cu
index ce18a41d..631f6160 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_cooperative.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_cooperative.cu
index e70c1a89..1e828717 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_cooperative.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_cooperative.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_pingpong.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_pingpong.cu
index 259500dd..d0540dbd 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_pingpong.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_alignx_tensor_op_f32_warpspecialized_pingpong.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu
index f513e3a7..acd5ffff 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu
index a15f5669..bf803910 100644
--- a/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu
+++ b/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_gett_f16_f16_f16_tensor_op.cu b/test/unit/gemm/device/sm90_gett_f16_f16_f16_tensor_op.cu
index 4d03fc93..f9d56f47 100644
--- a/test/unit/gemm/device/sm90_gett_f16_f16_f16_tensor_op.cu
+++ b/test/unit/gemm/device/sm90_gett_f16_f16_f16_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu
index 43d1839f..22653285 100644
--- a/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu
index 9b15b74b..2499ecb6 100644
--- a/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -103,6 +103,45 @@ TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x128_1x1x1_wa
   using TileShape = Shape<_128,_128,_128>;
   using ClusterShape = Shape<_1,_1,_1>;
 
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+      TileShape, ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      float, float,
+      float, LayoutC, 4,
+      float, LayoutC, 4,
+      cutlass::epilogue::TmaWarpSpecialized
+    >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+      cutlass::float_e4m3_t, LayoutA, 32,
+      cutlass::float_e5m2_t, LayoutB, 16,
+      float,
+      TileShape, ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::KernelTmaWarpSpecialized
+    >::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int,int,int,int>,
+      CollectiveMainloop,
+      CollectiveEpilogue
+    >;
+
+  using namespace test::gemm::device;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  bool result = TestAll<Gemm>(1.0, 1.0, CheckEquality::EXACT);
+  EXPECT_TRUE(result);
+}
+
+TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x128_1x1x1_warpspecialized_fastaccum) {
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using TileShape = Shape<_128,_128,_128>;
+  using ClusterShape = Shape<_1,_1,_1>;
+
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
       TileShape, ClusterShape,
@@ -142,6 +181,45 @@ TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x256_1x2x1_co
   using TileShape = Shape<_128,_128,_256>;
   using ClusterShape = Shape<_1,_2,_1>;
 
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+      TileShape, ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      float, float,
+      float, LayoutC, 4,
+      float, LayoutC, 4,
+      cutlass::epilogue::TmaWarpSpecializedCooperative
+    >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+      cutlass::float_e4m3_t, LayoutA, 32,
+      cutlass::float_e5m2_t, LayoutB, 16,
+      float,
+      TileShape, ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::KernelTmaWarpSpecializedCooperative
+    >::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int,int,int,int>,
+      CollectiveMainloop,
+      CollectiveEpilogue
+    >;
+
+  using namespace test::gemm::device;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  bool result = TestAll<Gemm>(1.0, 1.0, CheckEquality::EXACT);
+  EXPECT_TRUE(result);
+}
+
+TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x256_1x2x1_cooperative_fastaccum) {
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using TileShape = Shape<_128,_128,_256>;
+  using ClusterShape = Shape<_1,_2,_1>;
+
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
       TileShape, ClusterShape,
@@ -181,6 +259,45 @@ TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x64_2x1x1_pin
   using TileShape = Shape<_128,_128,_64>;
   using ClusterShape = Shape<_2,_1,_1>;
 
+  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
+      TileShape, ClusterShape,
+      cutlass::epilogue::collective::EpilogueTileAuto,
+      float, float,
+      float, LayoutC, 4,
+      float, LayoutC, 4,
+      cutlass::epilogue::TmaWarpSpecialized
+    >::CollectiveOp;
+
+  using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilder<
+      cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+      cutlass::float_e4m3_t, LayoutA, 32,
+      cutlass::float_e5m2_t, LayoutB, 16,
+      float,
+      TileShape, ClusterShape,
+      cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
+      cutlass::gemm::KernelTmaWarpSpecializedPingpong
+    >::CollectiveOp;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+      Shape<int,int,int,int>,
+      CollectiveMainloop,
+      CollectiveEpilogue
+    >;
+
+  using namespace test::gemm::device;
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+  bool result = TestAll<Gemm>(1.0, 1.0, CheckEquality::EXACT);
+  EXPECT_TRUE(result);
+}
+
+TEST(SM90_Device_Sparse_Gemm_e4m3t_e5m2n_f32t_tensorop_f32, 128x128x64_2x1x1_pingpong_fastaccum) {
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using TileShape = Shape<_128,_128,_64>;
+  using ClusterShape = Shape<_2,_1,_1>;
+
   using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
       cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
       TileShape, ClusterShape,
diff --git a/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu b/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu
index 09e52f51..15a6709b 100644
--- a/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu
+++ b/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu b/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu
index cc7b9486..1dcb1c51 100644
--- a/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu
+++ b/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
index 6a560076..c7cd706d 100644
--- a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
index 724d8f69..3231bc2d 100644
--- a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
index 33d37a87..08f6e871 100644
--- a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
index 5770f350..b36dd25e 100644
--- a/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_cf32n_cf32n_tensor_op_fast_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
index 99847c97..3ca2d874 100644
--- a/test/unit/gemm/device/symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/symm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
index 3a4a4495..fb634310 100644
--- a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
index 799de8ae..589a4052 100644
--- a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
index a0719087..d327ee21 100644
--- a/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
+++ b/test/unit/gemm/device/symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
index 5c700a40..cce3aa9c 100644
--- a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
index 8fd17423..2f79f342 100644
--- a/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
index c6dd7f31..3d4b58ea 100644
--- a/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64_f64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/symm_f64_f64_tensor_op_f64_sm90.cu
index 9a075292..87da2076 100644
--- a/test/unit/gemm/device/symm_f64_f64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/symm_f64_f64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
index 2483714b..0fed5801 100644
--- a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
index e19c807b..78cfe53b 100644
--- a/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_f64n_f64n_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
index 71e35155..3c6c2cc2 100644
--- a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
index 6dc9dc0b..629e2484 100644
--- a/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_f64n_f64t_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
index f88b5384..ccdeaabb 100644
--- a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
index 70af1472..9d5aa411 100644
--- a/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_f64t_f64n_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
index 82b1c5e1..6df0a3bf 100644
--- a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
index e3ccfa68..7c63d319 100644
--- a/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_f64t_f64t_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
index 7d2aebc3..04468336 100644
--- a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
index 8b627fcb..583f7fca 100644
--- a/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/symm_tf32n_f32n_tensor_op_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
index f47d01ac..950f6f6a 100644
--- a/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/symm_tf32t_f32t_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
index 839e1e7e..9b042c95 100644
--- a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
index b3ba0a7d..0c7844da 100644
--- a/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
index b4940680..dbda0793 100644
--- a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
index c7e4b202..a6dee281 100644
--- a/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/syr2k_cf64_cf64_tensor_op_f64_sm90.cu
index e71cb244..e8554396 100644
--- a/test/unit/gemm/device/syr2k_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/syr2k_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
index 42396118..13ba861a 100644
--- a/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
index 5b6b089d..28e8d3e2 100644
--- a/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_grouped_sm80.cu
index 5c43d52d..5be6312e 100644
--- a/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
index c9a7b12f..f3b2b32f 100644
--- a/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64t_cf64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_cf64t_cf64n_tensor_op_f64_grouped_sm80.cu
index 8c1ec972..db8ebd0d 100644
--- a/test/unit/gemm/device/syr2k_cf64t_cf64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64t_cf64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_cf64t_cf64t_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_cf64t_cf64t_tensor_op_f64_grouped_sm80.cu
index 96fe1647..409bfdf2 100644
--- a/test/unit/gemm/device/syr2k_cf64t_cf64t_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_cf64t_cf64t_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
index ef1ffefd..9341946c 100644
--- a/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f32n_f32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
index 9db017ec..911764ab 100644
--- a/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f32t_f32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64_f64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/syr2k_f64_f64_tensor_op_f64_sm90.cu
index e2c8cedf..177f94f9 100644
--- a/test/unit/gemm/device/syr2k_f64_f64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/syr2k_f64_f64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_grouped_sm80.cu
index 877c2263..5ce7d436 100644
--- a/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
index 94b3569c..60efc57e 100644
--- a/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64n_f64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_grouped_sm80.cu
index e54c03bb..c2ae630c 100644
--- a/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
index 90975a91..57b03a44 100644
--- a/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64n_f64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_grouped_sm80.cu
index 44824679..99483af8 100644
--- a/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
index 7d689e34..08a1c5b5 100644
--- a/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64t_f64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_f64t_f64t_tensor_op_f64_grouped_sm80.cu b/test/unit/gemm/device/syr2k_f64t_f64t_tensor_op_f64_grouped_sm80.cu
index 56690b56..953f26a5 100644
--- a/test/unit/gemm/device/syr2k_f64t_f64t_tensor_op_f64_grouped_sm80.cu
+++ b/test/unit/gemm/device/syr2k_f64t_f64t_tensor_op_f64_grouped_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
index c2010556..eac08982 100644
--- a/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_tf32n_f32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
index 0bde51ba..b57a0df0 100644
--- a/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syr2k_tf32t_f32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
index 0e35706e..ae95476c 100644
--- a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
index 3de98e1e..7b1a4345 100644
--- a/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf32n_cf32n_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
index 23aeaf56..b8f19d7c 100644
--- a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
index 7b95c5d4..44b2ad63 100644
--- a/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/syrk_cf64_cf64_tensor_op_f64_sm90.cu
index 14735e87..2a0705ab 100644
--- a/test/unit/gemm/device/syrk_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/syrk_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
index 18b8c2d7..116c132c 100644
--- a/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf64n_cf64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
index 39df8ea9..9b2aef9d 100644
--- a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
index 0d6c7706..e4335da2 100644
--- a/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syrk_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
index 58486bf0..007eafa3 100644
--- a/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
index 8b3b22ac..be1b885a 100644
--- a/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_f32t_f32t_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_f64_f64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/syrk_f64_f64_tensor_op_f64_sm90.cu
index cc0f9ab7..3c994fb0 100644
--- a/test/unit/gemm/device/syrk_f64_f64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/syrk_f64_f64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
index e308ae13..2c279be2 100644
--- a/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syrk_f64n_f64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu b/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
index c74de13e..b856f4b4 100644
--- a/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/syrk_f64t_f64n_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
index 6d33a362..1e6c3296 100644
--- a/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_tf32n_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
index 530c4ba7..d6c70e3b 100644
--- a/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/syrk_tf32t_f32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h
index 83cf2864..0007666c 100644
--- a/test/unit/gemm/device/testbed.h
+++ b/test/unit/gemm/device/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h
index 1b5e3f74..add984ca 100644
--- a/test/unit/gemm/device/testbed_complex.h
+++ b/test/unit/gemm/device/testbed_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_gemm_with_broadcast.h b/test/unit/gemm/device/testbed_gemm_with_broadcast.h
index 491a8b33..eca0b0ae 100644
--- a/test/unit/gemm/device/testbed_gemm_with_broadcast.h
+++ b/test/unit/gemm/device/testbed_gemm_with_broadcast.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_gemm_with_reduction.h b/test/unit/gemm/device/testbed_gemm_with_reduction.h
index 12c8d408..af3629cc 100644
--- a/test/unit/gemm/device/testbed_gemm_with_reduction.h
+++ b/test/unit/gemm/device/testbed_gemm_with_reduction.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_grouped.h b/test/unit/gemm/device/testbed_grouped.h
index d8f5d439..c7317eb8 100644
--- a/test/unit/gemm/device/testbed_grouped.h
+++ b/test/unit/gemm/device/testbed_grouped.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_grouped_rank_2k.h b/test/unit/gemm/device/testbed_grouped_rank_2k.h
index 2678be11..f8f08f23 100644
--- a/test/unit/gemm/device/testbed_grouped_rank_2k.h
+++ b/test/unit/gemm/device/testbed_grouped_rank_2k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_grouped_rank_2k_scheduler.h b/test/unit/gemm/device/testbed_grouped_rank_2k_scheduler.h
index e399c7f6..e9315e12 100644
--- a/test/unit/gemm/device/testbed_grouped_rank_2k_scheduler.h
+++ b/test/unit/gemm/device/testbed_grouped_rank_2k_scheduler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_grouped_scheduler.h b/test/unit/gemm/device/testbed_grouped_scheduler.h
index 1aab9c12..bda2704b 100644
--- a/test/unit/gemm/device/testbed_grouped_scheduler.h
+++ b/test/unit/gemm/device/testbed_grouped_scheduler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h
index ee96f78f..2a595600 100644
--- a/test/unit/gemm/device/testbed_interleaved.h
+++ b/test/unit/gemm/device/testbed_interleaved.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h
index 6bd289c9..32452c30 100644
--- a/test/unit/gemm/device/testbed_planar_complex.h
+++ b/test/unit/gemm/device/testbed_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_rank2k_universal.h b/test/unit/gemm/device/testbed_rank2k_universal.h
index 5d009197..4d9f6743 100644
--- a/test/unit/gemm/device/testbed_rank2k_universal.h
+++ b/test/unit/gemm/device/testbed_rank2k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_rank_k_universal.h b/test/unit/gemm/device/testbed_rank_k_universal.h
index 15f23d34..cb46528a 100644
--- a/test/unit/gemm/device/testbed_rank_k_universal.h
+++ b/test/unit/gemm/device/testbed_rank_k_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h
index 822374dd..0a01a6a3 100644
--- a/test/unit/gemm/device/testbed_sanity.h
+++ b/test/unit/gemm/device/testbed_sanity.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_sparse.h b/test/unit/gemm/device/testbed_sparse.h
index eeac68c0..a95bf996 100644
--- a/test/unit/gemm/device/testbed_sparse.h
+++ b/test/unit/gemm/device/testbed_sparse.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h
index 7407b5bd..8fa4a855 100644
--- a/test/unit/gemm/device/testbed_splitk.h
+++ b/test/unit/gemm/device/testbed_splitk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_symm_universal.h b/test/unit/gemm/device/testbed_symm_universal.h
index 7940b096..b7a57f7e 100644
--- a/test/unit/gemm/device/testbed_symm_universal.h
+++ b/test/unit/gemm/device/testbed_symm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_trmm_universal.h b/test/unit/gemm/device/testbed_trmm_universal.h
index 0f992896..b30acfed 100644
--- a/test/unit/gemm/device/testbed_trmm_universal.h
+++ b/test/unit/gemm/device/testbed_trmm_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h
index f7dd861b..00368a5e 100644
--- a/test/unit/gemm/device/testbed_universal.h
+++ b/test/unit/gemm/device/testbed_universal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h
index 5783e5a9..89ac33a1 100644
--- a/test/unit/gemm/device/testbed_utils.h
+++ b/test/unit/gemm/device/testbed_utils.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/testbed_with_absmax.h b/test/unit/gemm/device/testbed_with_absmax.h
index 2bccba4f..8b5588f5 100644
--- a/test/unit/gemm/device/testbed_with_absmax.h
+++ b/test/unit/gemm/device/testbed_with_absmax.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
index f39211c5..fe3d4aaf 100644
--- a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
+++ b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
index e96b7bc7..8356b29c 100644
--- a/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
+++ b/test/unit/gemm/device/trmm_cf32n_cf32n_cf32t_tensor_op_fast_f32_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
index 8b556a6a..a2836f99 100644
--- a/test/unit/gemm/device/trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/trmm_cf64_cf64_cf64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
index d57de4fa..332c6f98 100644
--- a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
+++ b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
index ac2d2028..72de2d71 100644
--- a/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
+++ b/test/unit/gemm/device/trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
index 7c34e1c8..ab653117 100644
--- a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
index 28614b0c..6d908977 100644
--- a/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
index 503d4113..58a5f863 100644
--- a/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_f32t_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
index 53bcdd0f..e6f1425a 100644
--- a/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_f32t_f32n_f32t_tensor_op_fast_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64_f64_f64_tensor_op_f64_sm90.cu b/test/unit/gemm/device/trmm_f64_f64_f64_tensor_op_f64_sm90.cu
index 8ce29aa0..1a7f371f 100644
--- a/test/unit/gemm/device/trmm_f64_f64_f64_tensor_op_f64_sm90.cu
+++ b/test/unit/gemm/device/trmm_f64_f64_f64_tensor_op_f64_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
index 684b470d..aba422e4 100644
--- a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
index 1435d2e7..41260c96 100644
--- a/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/trmm_f64n_f64n_f64t_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
index fe233045..4479b8e4 100644
--- a/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/trmm_f64n_f64t_f64t_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
index 35086e7c..fa7ec9e2 100644
--- a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
index 9defdcc6..17fb22f6 100644
--- a/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
+++ b/test/unit/gemm/device/trmm_f64t_f64t_f64n_tensor_op_f64_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
index e705152d..deee7b9f 100644
--- a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
index dd9a9273..ad43808e 100644
--- a/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
+++ b/test/unit/gemm/device/trmm_tf32n_tf32t_f32t_tensor_op_f32_rs_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
index 202d86fe..10f20f1b 100644
--- a/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_tf32t_tf32n_f32n_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu b/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
index ef276e8b..024af215 100644
--- a/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
+++ b/test/unit/gemm/device/trmm_tf32t_tf32n_f32t_tensor_op_f32_ls_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/kernel/batched_gemv.cu b/test/unit/gemm/kernel/batched_gemv.cu
index efcc8cee..697b2fb0 100755
--- a/test/unit/gemm/kernel/batched_gemv.cu
+++ b/test/unit/gemm/kernel/batched_gemv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/kernel/testbed_gemv.h b/test/unit/gemm/kernel/testbed_gemv.h
index 52cb3504..8e939f97 100755
--- a/test/unit/gemm/kernel/testbed_gemv.h
+++ b/test/unit/gemm/kernel/testbed_gemv.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt
index 65858ca8..adb78169 100644
--- a/test/unit/gemm/thread/CMakeLists.txt
+++ b/test/unit/gemm/thread/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu
index 5d3ff05d..cf62ffde 100644
--- a/test/unit/gemm/thread/gemm_sm50.cu
+++ b/test/unit/gemm/thread/gemm_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu
index 542d8cb9..94625967 100644
--- a/test/unit/gemm/thread/gemm_sm60.cu
+++ b/test/unit/gemm/thread/gemm_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu
index 78a59ce0..8a3a3565 100644
--- a/test/unit/gemm/thread/gemm_sm61.cu
+++ b/test/unit/gemm/thread/gemm_sm61.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt
index 1dae0cf1..da4ac3c9 100644
--- a/test/unit/gemm/thread/host/CMakeLists.txt
+++ b/test/unit/gemm/thread/host/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu
index d1a83154..e1d1defc 100644
--- a/test/unit/gemm/thread/host/gemm_sm60_host.cu
+++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h
index 919254b6..6e3d6ab0 100644
--- a/test/unit/gemm/thread/host/testbed_host.h
+++ b/test/unit/gemm/thread/host/testbed_host.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h
index 98808046..8d34d799 100644
--- a/test/unit/gemm/thread/testbed.h
+++ b/test/unit/gemm/thread/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt
index 376ec1cc..ca986dc9 100644
--- a/test/unit/gemm/threadblock/CMakeLists.txt
+++ b/test/unit/gemm/threadblock/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu
index 417a6d41..2d1b9e9e 100644
--- a/test/unit/gemm/threadblock/batched_gemv.cu
+++ b/test/unit/gemm/threadblock/batched_gemv.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu
index 55e7741e..ca79c0fa 100644
--- a/test/unit/gemm/threadblock/epilogue_workspace.cu
+++ b/test/unit/gemm/threadblock/epilogue_workspace.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage.cu b/test/unit/gemm/threadblock/mma_multistage.cu
index 13df9dc1..cee23cb2 100644
--- a/test/unit/gemm/threadblock/mma_multistage.cu
+++ b/test/unit/gemm/threadblock/mma_multistage.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage_slicedk.cu b/test/unit/gemm/threadblock/mma_multistage_slicedk.cu
index 2bf2b653..141785c0 100644
--- a/test/unit/gemm/threadblock/mma_multistage_slicedk.cu
+++ b/test/unit/gemm/threadblock/mma_multistage_slicedk.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse.cu b/test/unit/gemm/threadblock/mma_multistage_sparse.cu
index 1625146b..6f1c030f 100644
--- a/test/unit/gemm/threadblock/mma_multistage_sparse.cu
+++ b/test/unit/gemm/threadblock/mma_multistage_sparse.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
index a0cba6d7..1f3bc8cf 100644
--- a/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_sparse_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed.h b/test/unit/gemm/threadblock/mma_multistage_testbed.h
index 98432668..5caaf38a 100644
--- a/test/unit/gemm/threadblock/mma_multistage_testbed.h
+++ b/test/unit/gemm/threadblock/mma_multistage_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
index d6b494ad..4e617d63 100644
--- a/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
+++ b/test/unit/gemm/threadblock/mma_multistage_testbed_slicedk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu
index 0c304b64..8dab93b5 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu b/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
index 5b0ef732..79404e83 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_slicedk.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
index dac66004..ba3821a3 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
index 38ddcb4c..7f594df1 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
index 7b6f46f8..c98261e3 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h
index a3d697c4..7eb62f9a 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h
+++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
index 65eb7f9d..36e55b25 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
+++ b/test/unit/gemm/threadblock/mma_pipelined_testbed_slicedk.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
index 47ef1f86..dd223c6c 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
index 49e7a108..1dbe880f 100644
--- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
index 697392bf..23b28e72 100644
--- a/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
+++ b/test/unit/gemm/threadblock/mma_planar_complex_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
index 948d1d62..e5fdc077 100644
--- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
+++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
index e56bfcbd..4d771a02 100644
--- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
+++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
index 16b6d7af..fff7c657 100644
--- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
+++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt
index 8e62ed28..e4929064 100644
--- a/test/unit/gemm/warp/CMakeLists.txt
+++ b/test/unit/gemm/warp/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu
index 6d09e44e..cf8586f1 100644
--- a/test/unit/gemm/warp/gemm_complex_sm80.cu
+++ b/test/unit/gemm/warp/gemm_complex_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_complex_sm90.cu b/test/unit/gemm/warp/gemm_complex_sm90.cu
index 6a5b2f88..973f4488 100644
--- a/test/unit/gemm/warp/gemm_complex_sm90.cu
+++ b/test/unit/gemm/warp/gemm_complex_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
index 9fd1e913..3658999d 100644
--- a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
+++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_mixed_input_sm80.cu b/test/unit/gemm/warp/gemm_mixed_input_sm80.cu
index db5b178f..19910629 100644
--- a/test/unit/gemm/warp/gemm_mixed_input_sm80.cu
+++ b/test/unit/gemm/warp/gemm_mixed_input_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu
index 1321f2ff..29a26a0c 100644
--- a/test/unit/gemm/warp/gemm_sm50.cu
+++ b/test/unit/gemm/warp/gemm_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu
index 89b5ae16..cf5b1c11 100644
--- a/test/unit/gemm/warp/gemm_sm60.cu
+++ b/test/unit/gemm/warp/gemm_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu
index d3904e89..e385331e 100644
--- a/test/unit/gemm/warp/gemm_sm61.cu
+++ b/test/unit/gemm/warp/gemm_sm61.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu
index 8a9fc9b1..2e7ba895 100644
--- a/test/unit/gemm/warp/gemm_sm70.cu
+++ b/test/unit/gemm/warp/gemm_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu
index 6a5bdcc6..f01cac98 100644
--- a/test/unit/gemm/warp/gemm_sm75.cu
+++ b/test/unit/gemm/warp/gemm_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu
index c9732f21..83f034e0 100644
--- a/test/unit/gemm/warp/gemm_sm80.cu
+++ b/test/unit/gemm/warp/gemm_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sm90.cu b/test/unit/gemm/warp/gemm_sm90.cu
index d131b419..691810c3 100644
--- a/test/unit/gemm/warp/gemm_sm90.cu
+++ b/test/unit/gemm/warp/gemm_sm90.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/gemm_sparse_sm80.cu b/test/unit/gemm/warp/gemm_sparse_sm80.cu
index f7f83e94..b696b9c9 100644
--- a/test/unit/gemm/warp/gemm_sparse_sm80.cu
+++ b/test/unit/gemm/warp/gemm_sparse_sm80.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h
index 28388606..921d1abd 100644
--- a/test/unit/gemm/warp/testbed.h
+++ b/test/unit/gemm/warp/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu
index ac858be6..5b44e2c6 100644
--- a/test/unit/gemm/warp/wmma_sm70.cu
+++ b/test/unit/gemm/warp/wmma_sm70.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu
index 98533b71..3d269f90 100644
--- a/test/unit/gemm/warp/wmma_sm72.cu
+++ b/test/unit/gemm/warp/wmma_sm72.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu
index 0865d707..2912fa8f 100644
--- a/test/unit/gemm/warp/wmma_sm75.cu
+++ b/test/unit/gemm/warp/wmma_sm75.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt
index 3c5154a4..a9b96d36 100644
--- a/test/unit/layout/CMakeLists.txt
+++ b/test/unit/layout/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu
index 7e76a378..ed341ebd 100644
--- a/test/unit/layout/matrix.cu
+++ b/test/unit/layout/matrix.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu
index 82d9ec5f..35a04183 100644
--- a/test/unit/layout/tensor.cu
+++ b/test/unit/layout/tensor.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu
index 1ca0be54..ef5146fb 100644
--- a/test/unit/layout/tensor_nhwc.cu
+++ b/test/unit/layout/tensor_nhwc.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt
index 6e5643b2..70c50a4a 100644
--- a/test/unit/nvrtc/CMakeLists.txt
+++ b/test/unit/nvrtc/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h
index 3d0e51f3..3311e915 100644
--- a/test/unit/nvrtc/cutlass/nvrtc/environment.h
+++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/kernel/thread/contraction.hpp b/test/unit/nvrtc/kernel/thread/contraction.hpp
index f90e882e..55df4437 100644
--- a/test/unit/nvrtc/kernel/thread/contraction.hpp
+++ b/test/unit/nvrtc/kernel/thread/contraction.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -114,7 +114,7 @@ using CollectiveOp = typename cutlass::gemm::collective::CollectiveBuilder<
   cutlass::gemm::KernelTmaWarpSpecialized
 >::CollectiveOp;
 
-using EpilogueOutputOp = cutlass::epilogue::collective::DefaultEpilogue<StrideC, StrideC, EpilogueThread, cutlass::gemm::EpilogueDefault>;
+using EpilogueOutputOp = cutlass::epilogue::collective::DefaultEpilogue<ElementC, StrideC, StrideC, EpilogueThread, cutlass::gemm::EpilogueDefault>;
 using CollectiveEpilogue = cutlass::epilogue::collective::detail::Sm90TmaWarpSpecializedAdapter<EpilogueOutputOp>;
 using Kernel = cutlass::gemm::kernel::GemmUniversal<
   ProblemShape,
diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h
index 7cff66c0..576f55cd 100644
--- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h
+++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/stdlib/assert.h b/test/unit/nvrtc/stdlib/assert.h
index efc3225a..c7e6e946 100644
--- a/test/unit/nvrtc/stdlib/assert.h
+++ b/test/unit/nvrtc/stdlib/assert.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h
index baf2e7fc..5ba5432f 100644
--- a/test/unit/nvrtc/stdlib/stdint.h
+++ b/test/unit/nvrtc/stdlib/stdint.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt
index 8161c4e5..76d7da6b 100644
--- a/test/unit/nvrtc/thread/CMakeLists.txt
+++ b/test/unit/nvrtc/thread/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/thread/nvrtc_contraction.cu b/test/unit/nvrtc/thread/nvrtc_contraction.cu
index 8dd0132a..fa079434 100644
--- a/test/unit/nvrtc/thread/nvrtc_contraction.cu
+++ b/test/unit/nvrtc/thread/nvrtc_contraction.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/thread/nvrtc_gemm.cu b/test/unit/nvrtc/thread/nvrtc_gemm.cu
index 2d918658..41a83ebd 100644
--- a/test/unit/nvrtc/thread/nvrtc_gemm.cu
+++ b/test/unit/nvrtc/thread/nvrtc_gemm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h
index 6c59afeb..8fd6863e 100644
--- a/test/unit/nvrtc/thread/testbed.h
+++ b/test/unit/nvrtc/thread/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/CMakeLists.txt b/test/unit/pipeline/CMakeLists.txt
index 3051cf12..81bdbf32 100644
--- a/test/unit/pipeline/CMakeLists.txt
+++ b/test/unit/pipeline/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/pipeline_async.cu b/test/unit/pipeline/pipeline_async.cu
index 86eac35e..04b406cc 100644
--- a/test/unit/pipeline/pipeline_async.cu
+++ b/test/unit/pipeline/pipeline_async.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/pipeline_tma_async.cu b/test/unit/pipeline/pipeline_tma_async.cu
index b9a61264..40133be9 100644
--- a/test/unit/pipeline/pipeline_tma_async.cu
+++ b/test/unit/pipeline/pipeline_tma_async.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/pipeline_tma_async_warp_specialized.cu b/test/unit/pipeline/pipeline_tma_async_warp_specialized.cu
index 5b3c5dac..2c1f974f 100644
--- a/test/unit/pipeline/pipeline_tma_async_warp_specialized.cu
+++ b/test/unit/pipeline/pipeline_tma_async_warp_specialized.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/pipeline_tma_async_warp_specialized_persistent.cu b/test/unit/pipeline/pipeline_tma_async_warp_specialized_persistent.cu
index 22d5cadf..e0b59a33 100644
--- a/test/unit/pipeline/pipeline_tma_async_warp_specialized_persistent.cu
+++ b/test/unit/pipeline/pipeline_tma_async_warp_specialized_persistent.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/sequence_barrier.cu b/test/unit/pipeline/sequence_barrier.cu
index 9bb3150b..66afe061 100644
--- a/test/unit/pipeline/sequence_barrier.cu
+++ b/test/unit/pipeline/sequence_barrier.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/pipeline/testbed.h b/test/unit/pipeline/testbed.h
index ea95f7a9..6cc2946a 100644
--- a/test/unit/pipeline/testbed.h
+++ b/test/unit/pipeline/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt
index 577f05d5..19d13c33 100644
--- a/test/unit/reduction/CMakeLists.txt
+++ b/test/unit/reduction/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/device/CMakeLists.txt b/test/unit/reduction/device/CMakeLists.txt
index 6d1ef4ed..fe2e88d4 100644
--- a/test/unit/reduction/device/CMakeLists.txt
+++ b/test/unit/reduction/device/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/device/tensor_reduce_contiguous.cu b/test/unit/reduction/device/tensor_reduce_contiguous.cu
index 9eeffba2..7e0106f1 100644
--- a/test/unit/reduction/device/tensor_reduce_contiguous.cu
+++ b/test/unit/reduction/device/tensor_reduce_contiguous.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/device/tensor_reduce_strided.cu b/test/unit/reduction/device/tensor_reduce_strided.cu
index 6e8c992e..d35cdcc7 100644
--- a/test/unit/reduction/device/tensor_reduce_strided.cu
+++ b/test/unit/reduction/device/tensor_reduce_strided.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt
index 826586ba..f1db0b4d 100644
--- a/test/unit/reduction/kernel/CMakeLists.txt
+++ b/test/unit/reduction/kernel/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu
index c6cc4c81..558d6fc8 100644
--- a/test/unit/reduction/kernel/reduce_splitk.cu
+++ b/test/unit/reduction/kernel/reduce_splitk.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h
index 21f9282b..e44a4246 100644
--- a/test/unit/reduction/kernel/reduce_splitk_testbed.h
+++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt
index 1c27a693..1a0fb4c6 100644
--- a/test/unit/reduction/thread/CMakeLists.txt
+++ b/test/unit/reduction/thread/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu
index d125cccb..c747dc8d 100644
--- a/test/unit/reduction/thread/reduction_thread.cu
+++ b/test/unit/reduction/thread/reduction_thread.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h
index 006835a2..239f2288 100644
--- a/test/unit/reduction/thread/testbed.h
+++ b/test/unit/reduction/thread/testbed.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/substrate/CMakeLists.txt b/test/unit/substrate/CMakeLists.txt
index 693c597e..c693aedd 100644
--- a/test/unit/substrate/CMakeLists.txt
+++ b/test/unit/substrate/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/substrate/dependent_false.cpp b/test/unit/substrate/dependent_false.cpp
index dd09378a..512c4dbe 100644
--- a/test/unit/substrate/dependent_false.cpp
+++ b/test/unit/substrate/dependent_false.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp
index e515cf9a..39690429 100644
--- a/test/unit/test_unit.cpp
+++ b/test/unit/test_unit.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt
index 0ab0b93f..4cc80945 100644
--- a/test/unit/transform/CMakeLists.txt
+++ b/test/unit/transform/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/CMakeLists.txt b/test/unit/transform/device/CMakeLists.txt
index 74ad63f2..d5c0d5ee 100644
--- a/test/unit/transform/device/CMakeLists.txt
+++ b/test/unit/transform/device/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/sm90_sparse_gemm_compressor_f16.cu b/test/unit/transform/device/sm90_sparse_gemm_compressor_f16.cu
index 2f42d6a1..35b08d3f 100644
--- a/test/unit/transform/device/sm90_sparse_gemm_compressor_f16.cu
+++ b/test/unit/transform/device/sm90_sparse_gemm_compressor_f16.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/sm90_sparse_gemm_compressor_f32.cu b/test/unit/transform/device/sm90_sparse_gemm_compressor_f32.cu
index 295622b2..29895ee9 100644
--- a/test/unit/transform/device/sm90_sparse_gemm_compressor_f32.cu
+++ b/test/unit/transform/device/sm90_sparse_gemm_compressor_f32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/sm90_sparse_gemm_compressor_f8.cu b/test/unit/transform/device/sm90_sparse_gemm_compressor_f8.cu
index 03714710..8e27bd85 100644
--- a/test/unit/transform/device/sm90_sparse_gemm_compressor_f8.cu
+++ b/test/unit/transform/device/sm90_sparse_gemm_compressor_f8.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/sm90_sparse_gemm_compressor_legacy.hpp b/test/unit/transform/device/sm90_sparse_gemm_compressor_legacy.hpp
index 8ec0c4ac..29280655 100644
--- a/test/unit/transform/device/sm90_sparse_gemm_compressor_legacy.hpp
+++ b/test/unit/transform/device/sm90_sparse_gemm_compressor_legacy.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/device/testbed_sparse_gemm_compressor.hpp b/test/unit/transform/device/testbed_sparse_gemm_compressor.hpp
index 03e4fa75..f4445824 100644
--- a/test/unit/transform/device/testbed_sparse_gemm_compressor.hpp
+++ b/test/unit/transform/device/testbed_sparse_gemm_compressor.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/kernel/CMakeLists.txt b/test/unit/transform/kernel/CMakeLists.txt
index 92d4a47b..0dfdc23b 100644
--- a/test/unit/transform/kernel/CMakeLists.txt
+++ b/test/unit/transform/kernel/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/kernel/filter_format_transformer.cu b/test/unit/transform/kernel/filter_format_transformer.cu
index ce489afd..ceabfb27 100644
--- a/test/unit/transform/kernel/filter_format_transformer.cu
+++ b/test/unit/transform/kernel/filter_format_transformer.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt
index 16796d7e..46bddca8 100644
--- a/test/unit/transform/threadblock/CMakeLists.txt
+++ b/test/unit/transform/threadblock/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu
index 7e9f9bf2..5739ad30 100644
--- a/test/unit/transform/threadblock/predicated_tile_iterator.cu
+++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
index 052df60a..5f171c49 100644
--- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
+++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/util/CMakeLists.txt b/test/unit/util/CMakeLists.txt
index e3b397f4..b68bbd32 100644
--- a/test/unit/util/CMakeLists.txt
+++ b/test/unit/util/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/util/cutlass_test_levels.cu b/test/unit/util/cutlass_test_levels.cu
index 72150de5..4f28c596 100644
--- a/test/unit/util/cutlass_test_levels.cu
+++ b/test/unit/util/cutlass_test_levels.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/util/rms_norm.cu b/test/unit/util/rms_norm.cu
index 41114067..a08921fc 100644
--- a/test/unit/util/rms_norm.cu
+++ b/test/unit/util/rms_norm.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/test/unit/util/tensor_reduce.cu b/test/unit/util/tensor_reduce.cu
index 06fafdc3..7348371d 100644
--- a/test/unit/util/tensor_reduce.cu
+++ b/test/unit/util/tensor_reduce.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 5314f639..14fee67c 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt
index 21ad9562..2052dd2c 100644
--- a/tools/library/CMakeLists.txt
+++ b/tools/library/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
@@ -285,7 +285,7 @@ execute_process(
     --kernel-filter-file "${KERNEL_FILTER_FILE}"
     --selected-kernel-list "${CUTLASS_LIBRARY_GENERATED_KERNEL_LIST_FILE}"
     --cuda-version "${CUTLASS_GENERATOR_CUDA_COMPILER_VERSION}"
-    --log-level DEBUG
+    --log-level INFO
     --disable-cutlass-package-imports
   RESULT_VARIABLE cutlass_lib_INSTANCE_GENERATION_RESULT
   OUTPUT_VARIABLE cutlass_lib_INSTANCE_GENERATION_OUTPUT
diff --git a/tools/library/include/cutlass/library/arch_mappings.h b/tools/library/include/cutlass/library/arch_mappings.h
index 74a768c5..eee0c786 100644
--- a/tools/library/include/cutlass/library/arch_mappings.h
+++ b/tools/library/include/cutlass/library/arch_mappings.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/descriptions.h b/tools/library/include/cutlass/library/descriptions.h
index 6a6aab43..ae96395f 100644
--- a/tools/library/include/cutlass/library/descriptions.h
+++ b/tools/library/include/cutlass/library/descriptions.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h
index d87d0895..bb37b1bc 100644
--- a/tools/library/include/cutlass/library/handle.h
+++ b/tools/library/include/cutlass/library/handle.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h
index 19812d4b..a4c6572e 100644
--- a/tools/library/include/cutlass/library/library.h
+++ b/tools/library/include/cutlass/library/library.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -117,8 +117,7 @@ public:
     void const *arguments,
     void *host_workspace,
     void *device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const = 0;
+    cudaStream_t stream = nullptr) const = 0;
 
 };
 
@@ -173,6 +172,9 @@ struct GemmArguments {
 
   /// Enumerant indicating whether alpha/beta point to host or device memory
   ScalarPointerMode pointer_mode{};
+  
+  /// Whether to use PDL when launching the kernel
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -253,6 +255,7 @@ struct GemmArrayArguments {
   void const *alpha{nullptr};
   void const *beta{nullptr};
   ScalarPointerMode pointer_mode{};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -307,6 +310,8 @@ struct GemmUniversalArguments {
   int swizzle_size{1};
 
   int device_index{0};
+  
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -354,6 +359,7 @@ struct GemmPlanarComplexArguments {
   int64_t batch_stride_C_imag{0};
   int64_t batch_stride_D_real{0};
   int64_t batch_stride_D_imag{0};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -394,6 +400,7 @@ struct GemmPlanarComplexArrayArguments {
   void const * alpha{nullptr};
   void const * beta{nullptr};
   ScalarPointerMode pointer_mode{};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -425,6 +432,7 @@ struct GemmGroupedArguments {
   void const *alpha{nullptr};
   void const *beta{nullptr};
   ScalarPointerMode pointer_mode{};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -461,6 +469,7 @@ struct SparseGemmArguments {
   void const *beta{nullptr};       /// pointer to beta scalar
   ScalarPointerMode pointer_mode{}; /// enumerant indicating whether alpha/beta pointers are host
                                     ///   or device pointers.
+  bool use_pdl{false};              /// Whether to use PDL when launching the kernel
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -519,6 +528,7 @@ struct RankKArguments {
   int64_t batch_stride_B{0};
   int64_t batch_stride_C{0};
   int64_t batch_stride_D{0};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -570,6 +580,7 @@ struct TrmmArguments {
   int64_t batch_stride_A{0};
   int64_t batch_stride_B{0};
   int64_t batch_stride_D{0};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -628,6 +639,7 @@ struct SymmArguments {
   int64_t batch_stride_B{0};
   int64_t batch_stride_C{0};
   int64_t batch_stride_D{0};
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -744,6 +756,9 @@ struct ConvArguments {
 
   /// Enumerant indicating whether alpha/beta point to host or device memory
   ScalarPointerMode pointer_mode{};
+  
+  /// Whether to use PDL when launching the kernel
+  bool use_pdl{false};
 };
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
@@ -796,6 +811,9 @@ struct ReductionArguments {
 
   /// Enumerant indicating whether alpha/beta point to host or device memory
   ScalarPointerMode pointer_mode{};
+
+  /// Whether to use PDL when launching the kernel
+  bool use_pdl{false};
 };
 
 } // namespace library
diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h
index 08272af1..c4fb0ee8 100644
--- a/tools/library/include/cutlass/library/manifest.h
+++ b/tools/library/include/cutlass/library/manifest.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h
index ee7b65fe..05b84b1e 100644
--- a/tools/library/include/cutlass/library/operation_table.h
+++ b/tools/library/include/cutlass/library/operation_table.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/singleton.h b/tools/library/include/cutlass/library/singleton.h
index 83cc29e2..9a757433 100644
--- a/tools/library/include/cutlass/library/singleton.h
+++ b/tools/library/include/cutlass/library/singleton.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/types.h b/tools/library/include/cutlass/library/types.h
index 12c6fb61..56853863 100644
--- a/tools/library/include/cutlass/library/types.h
+++ b/tools/library/include/cutlass/library/types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h
index 13ac4f68..af82ffbc 100644
--- a/tools/library/include/cutlass/library/util.h
+++ b/tools/library/include/cutlass/library/util.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -177,6 +177,13 @@ char const *to_string(RasterOrder type, bool pretty = false);
 template<>
 RasterOrder from_string<RasterOrder>(std::string const &str);
 
+/// Converts a bool to a string
+char const *to_string(bool type, bool pretty = false);
+
+/// Converts a bool from a string
+template<>
+bool from_string<bool>(std::string const &str);
+
 /// Lexical cast from int64_t to string
 std::string lexical_cast(int64_t int_value);
 
diff --git a/tools/library/src/conv2d_operation.h b/tools/library/src/conv2d_operation.h
index 027b2615..3b1a1584 100644
--- a/tools/library/src/conv2d_operation.h
+++ b/tools/library/src/conv2d_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -238,6 +238,10 @@ protected:
     operator_args.ref_C.reset(static_cast<ElementC *>(const_cast<void *>(arguments->C)));
     operator_args.ref_D.reset(static_cast<ElementC *>(const_cast<void *>(arguments->D)));
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -326,12 +330,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
 
@@ -495,6 +494,10 @@ protected:
     operator_args.ref_D.reset(static_cast<ElementC *>(const_cast<void *>(arguments->D)));
     operator_args.ref_reordered_B.reset(static_cast<ElementC *>(const_cast<void *>(arguments->reordered_B)));
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -583,12 +586,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
 
diff --git a/tools/library/src/conv3d_operation.h b/tools/library/src/conv3d_operation.h
index 6cb1796b..fe402c44 100644
--- a/tools/library/src/conv3d_operation.h
+++ b/tools/library/src/conv3d_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -229,6 +229,10 @@ protected:
     operator_args.ref_C.reset(static_cast<ElementC *>(const_cast<void *>(arguments->C)));
     operator_args.ref_D.reset(static_cast<ElementC *>(const_cast<void *>(arguments->D)));
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -317,12 +321,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
 
diff --git a/tools/library/src/conv_operation_3x.hpp b/tools/library/src/conv_operation_3x.hpp
index d6f79e91..86c1513e 100644
--- a/tools/library/src/conv_operation_3x.hpp
+++ b/tools/library/src/conv_operation_3x.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2024 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -334,8 +334,7 @@ public:
     void const* arguments,
     void* host_workspace,
     void* device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const override
+    cudaStream_t stream = nullptr) const override
   {
     auto status = Status::kInvalid;
 
@@ -361,7 +360,7 @@ public:
     }
 
     auto* op = reinterpret_cast<Operator*>(host_workspace);
-    return op->run(out_args, device_workspace, stream, nullptr, launch_with_pdl);
+    return op->run(out_args, device_workspace, stream, nullptr, in_args_ptr->use_pdl);
   }
 
 private:
diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h
index 5c6f9ca8..0ff45dae 100644
--- a/tools/library/src/gemm_operation.h
+++ b/tools/library/src/gemm_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -206,6 +206,10 @@ protected:
       return Status::kErrorInvalidProblem;
     }
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     operator_args.ref_A.reset(static_cast<ElementA const *>(arguments->A));
     operator_args.ref_B.reset(static_cast<ElementB const *>(arguments->B));
     operator_args.ref_C.reset(static_cast<ElementC const *>(arguments->C));
@@ -296,12 +300,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
 
@@ -420,6 +419,10 @@ protected:
     operator_args.ref_D.reset(static_cast<ElementD *>(arguments->D));
     operator_args.ref_E.reset(static_cast<ElementE const *>(arguments->E));
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -505,13 +508,8 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
-
+    cudaStream_t stream = nullptr) const {
+ 
     OperatorArguments args;
 
     Status status = update_arguments_(
@@ -634,6 +632,10 @@ protected:
     operator_args.batch_stride_C = arguments->batch_stride_C;
     operator_args.batch_stride_D = arguments->batch_stride_D;
     
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+    
     return Status::kSuccess;
   }
 
@@ -731,12 +733,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
     
@@ -945,13 +942,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace,
     void *device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-    
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
-
+    cudaStream_t stream = nullptr) const {
     OperatorArguments args;
 
     Status status = update_arguments_(
@@ -1064,6 +1055,10 @@ protected:
     operator_args.ptr_N = arguments->N;
     operator_args.ptr_K = arguments->K;
     
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -1153,12 +1148,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-    
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
     
@@ -1265,6 +1255,10 @@ protected:
     op_args.ldc           = arguments->ldc;
     op_args.ldd           = arguments->ldd;
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -1362,12 +1356,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace,
     void *device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
 
diff --git a/tools/library/src/gemm_operation_3x.hpp b/tools/library/src/gemm_operation_3x.hpp
index 7c87b45e..a089cb5d 100644
--- a/tools/library/src/gemm_operation_3x.hpp
+++ b/tools/library/src/gemm_operation_3x.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -248,8 +248,17 @@ protected:
         arguments->ldc, arguments->batch_stride_C);
     operator_args.epilogue.dD = operator_args.epilogue.dC;
 
-    /* Query device SM count to pass onto the kernel as an argument, where needed */
+    /* Query device SM count and max active clusters to pass onto the kernel as an argument, where needed */
     operator_args.hw_info.sm_count = arguments->sm_count;
+    if constexpr (Operator::ArchTag::kMinComputeCapability == 90) {
+      dim3 cluster_dims(cute::size<0>(typename Operator::GemmKernel::ClusterShape{}),
+                        cute::size<1>(typename Operator::GemmKernel::ClusterShape{}),
+                        cute::size<2>(typename Operator::GemmKernel::ClusterShape{}));
+      uint32_t threads_per_block = Operator::GemmKernel::MaxThreadsPerBlock;
+      void const* kernel_ptr = (void*)(device_kernel<typename Operator::GemmKernel>);
+      operator_args.hw_info.max_active_clusters = cutlass::KernelHardwareInfo::query_device_max_active_clusters(
+                                                    cluster_dims, threads_per_block, kernel_ptr);
+    }
     if constexpr (!std::is_const_v<decltype(operator_args.scheduler.max_swizzle_size)>) {
       operator_args.scheduler.max_swizzle_size = arguments->swizzle_size;
     }
@@ -275,20 +284,11 @@ public:
 
   /// Returns success if the operation can proceed
   Status can_implement(
-      void const *configuration_ptr, void const *arguments_ptr) const override {
-    GemmUniversalConfiguration const *configuration =
-      static_cast<GemmUniversalConfiguration const *>(configuration_ptr);
+      [[maybe_unused]] void const *configuration_ptr, void const *arguments_ptr) const override {
     GemmUniversalArguments const *arguments =
       static_cast<GemmUniversalArguments const *>(arguments_ptr);
-
     OperatorArguments args;
-    // can_implement rules may need access to problem shape
-    args.problem_shape = cute::make_shape(
-      configuration->problem_size.m(),
-      configuration->problem_size.n(),
-      configuration->problem_size.k(),
-      configuration->batch_count);
-
+    
     auto status = update_arguments_(args, arguments);
     if (status != Status::kSuccess) {
       return status;
@@ -332,8 +332,7 @@ public:
       void const *arguments_ptr,
       void *host_workspace,
       void *device_workspace = nullptr,
-      cudaStream_t stream = nullptr,
-      bool launch_with_pdl = false) const override {
+      cudaStream_t stream = nullptr) const override {
 
     OperatorArguments args;
     Status status = update_arguments_(args, static_cast<GemmUniversalArguments const *>(arguments_ptr));
@@ -343,7 +342,8 @@ public:
 
     Operator *op = static_cast<Operator *>(host_workspace);
     // We need to call initialize() since we have to rebuild TMA desc for every new set of args
-    status = op->run(args, device_workspace, stream, nullptr, launch_with_pdl);
+    status = op->run(args, device_workspace, stream, nullptr, 
+                     static_cast<GemmUniversalArguments const *>(arguments_ptr)->use_pdl);
     return status;
   }
 };
diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu
index e6f00f72..82dc25d6 100644
--- a/tools/library/src/handle.cu
+++ b/tools/library/src/handle.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h
index be311c62..8f4de516 100644
--- a/tools/library/src/library_internal.h
+++ b/tools/library/src/library_internal.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp
index 82a8fea4..b9c04de7 100644
--- a/tools/library/src/manifest.cpp
+++ b/tools/library/src/manifest.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu
index bb5c921f..6719cd31 100644
--- a/tools/library/src/operation_table.cu
+++ b/tools/library/src/operation_table.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/rank_2k_operation.h b/tools/library/src/rank_2k_operation.h
index 5a611104..76d8d0df 100644
--- a/tools/library/src/rank_2k_operation.h
+++ b/tools/library/src/rank_2k_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -223,6 +223,10 @@ protected:
     operator_args.batch_stride_C = arguments->batch_stride_C;
     operator_args.batch_stride_D = arguments->batch_stride_D;
     
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -314,12 +318,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
     
diff --git a/tools/library/src/rank_k_operation.h b/tools/library/src/rank_k_operation.h
index e6afb1da..021f7f03 100644
--- a/tools/library/src/rank_k_operation.h
+++ b/tools/library/src/rank_k_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -221,6 +221,10 @@ protected:
     operator_args.batch_stride_C = arguments->batch_stride_C;
     operator_args.batch_stride_D = arguments->batch_stride_D;
     
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -310,12 +314,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
     
diff --git a/tools/library/src/reduction/init_reduction_operations.cu b/tools/library/src/reduction/init_reduction_operations.cu
index b0d92acc..9871d789 100644
--- a/tools/library/src/reduction/init_reduction_operations.cu
+++ b/tools/library/src/reduction/init_reduction_operations.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reduction/reduction_device.cu b/tools/library/src/reduction/reduction_device.cu
index 956f1d3d..3d561e84 100644
--- a/tools/library/src/reduction/reduction_device.cu
+++ b/tools/library/src/reduction/reduction_device.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reduction/reduction_operation.h b/tools/library/src/reduction/reduction_operation.h
index 3bcabf09..6e948540 100644
--- a/tools/library/src/reduction/reduction_operation.h
+++ b/tools/library/src/reduction/reduction_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -145,6 +145,10 @@ protected:
     operator_args.source.reset(static_cast<ElementOutput *>(const_cast<void *>(arguments->source)));
     operator_args.destination.reset(static_cast<ElementOutput *>(const_cast<void *>(arguments->destination)));
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -231,13 +235,8 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
-
+    cudaStream_t stream = nullptr) const {
+ 
     OperatorArguments args;
 
     Status status = update_arguments_(
diff --git a/tools/library/src/reference/conv2d.cu b/tools/library/src/reference/conv2d.cu
index bf56f93f..0d645edc 100644
--- a/tools/library/src/reference/conv2d.cu
+++ b/tools/library/src/reference/conv2d.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/conv3d.cu b/tools/library/src/reference/conv3d.cu
index f486ff9c..95ad2fa4 100644
--- a/tools/library/src/reference/conv3d.cu
+++ b/tools/library/src/reference/conv3d.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/conv_reference_operation.h b/tools/library/src/reference/conv_reference_operation.h
index 2bafc4af..240fe18d 100644
--- a/tools/library/src/reference/conv_reference_operation.h
+++ b/tools/library/src/reference/conv_reference_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -432,12 +432,7 @@ public:
     void const *arguments,
     void *host_workspace,
     void *device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-    
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     ConvArguments const  &args = *static_cast<ConvArguments const *>(arguments);
 
diff --git a/tools/library/src/reference/gemm_e4m3a_e4m3out.cu b/tools/library/src/reference/gemm_e4m3a_e4m3out.cu
index 52628b7f..d45093e8 100644
--- a/tools/library/src/reference/gemm_e4m3a_e4m3out.cu
+++ b/tools/library/src/reference/gemm_e4m3a_e4m3out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_e4m3a_e5m2out.cu b/tools/library/src/reference/gemm_e4m3a_e5m2out.cu
index cb6c6435..9a444f96 100644
--- a/tools/library/src/reference/gemm_e4m3a_e5m2out.cu
+++ b/tools/library/src/reference/gemm_e4m3a_e5m2out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_e5m2a_e4m3out.cu b/tools/library/src/reference/gemm_e5m2a_e4m3out.cu
index a41669e4..e6c68e59 100644
--- a/tools/library/src/reference/gemm_e5m2a_e4m3out.cu
+++ b/tools/library/src/reference/gemm_e5m2a_e4m3out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_e5m2a_e5m2out.cu b/tools/library/src/reference/gemm_e5m2a_e5m2out.cu
index b2568e8a..67247c5d 100644
--- a/tools/library/src/reference/gemm_e5m2a_e5m2out.cu
+++ b/tools/library/src/reference/gemm_e5m2a_e5m2out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp32out.cu b/tools/library/src/reference/gemm_fp32out.cu
index 3f15e82e..3c19b459 100644
--- a/tools/library/src/reference/gemm_fp32out.cu
+++ b/tools/library/src/reference/gemm_fp32out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp8in_bf16out.cu b/tools/library/src/reference/gemm_fp8in_bf16out.cu
index a8cdef38..581def9b 100644
--- a/tools/library/src/reference/gemm_fp8in_bf16out.cu
+++ b/tools/library/src/reference/gemm_fp8in_bf16out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp8in_fp16out.cu b/tools/library/src/reference/gemm_fp8in_fp16out.cu
index 86e336de..8d8178ef 100644
--- a/tools/library/src/reference/gemm_fp8in_fp16out.cu
+++ b/tools/library/src/reference/gemm_fp8in_fp16out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp8in_fp32out.cu b/tools/library/src/reference/gemm_fp8in_fp32out.cu
index f278f7c2..005ba8be 100644
--- a/tools/library/src/reference/gemm_fp8in_fp32out.cu
+++ b/tools/library/src/reference/gemm_fp8in_fp32out.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp_mixed_input.cu b/tools/library/src/reference/gemm_fp_mixed_input.cu
index 46949236..18fbf2ff 100644
--- a/tools/library/src/reference/gemm_fp_mixed_input.cu
+++ b/tools/library/src/reference/gemm_fp_mixed_input.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_fp_other.cu b/tools/library/src/reference/gemm_fp_other.cu
index 3a196c20..5e9ad777 100644
--- a/tools/library/src/reference/gemm_fp_other.cu
+++ b/tools/library/src/reference/gemm_fp_other.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_int4.cu b/tools/library/src/reference/gemm_int4.cu
index 46447141..ffaf7d34 100644
--- a/tools/library/src/reference/gemm_int4.cu
+++ b/tools/library/src/reference/gemm_int4.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_int8_interleaved_32.cu b/tools/library/src/reference/gemm_int8_interleaved_32.cu
index fd2393fa..7885afc0 100644
--- a/tools/library/src/reference/gemm_int8_interleaved_32.cu
+++ b/tools/library/src/reference/gemm_int8_interleaved_32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_int8_interleaved_64.cu b/tools/library/src/reference/gemm_int8_interleaved_64.cu
index e2f013a0..effc47ac 100644
--- a/tools/library/src/reference/gemm_int8_interleaved_64.cu
+++ b/tools/library/src/reference/gemm_int8_interleaved_64.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_int_mixed_input.cu b/tools/library/src/reference/gemm_int_mixed_input.cu
index c37ddfe4..0bf07ceb 100644
--- a/tools/library/src/reference/gemm_int_mixed_input.cu
+++ b/tools/library/src/reference/gemm_int_mixed_input.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_reference_operation.h b/tools/library/src/reference/gemm_reference_operation.h
index 940ff521..e07158b0 100644
--- a/tools/library/src/reference/gemm_reference_operation.h
+++ b/tools/library/src/reference/gemm_reference_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -192,12 +192,7 @@ public:
     void const *arguments,
     void *host_workspace,
     void *device_workspace = nullptr,
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     GemmUniversalConfiguration const &config = *static_cast<GemmUniversalConfiguration const *>(host_workspace);
     GemmUniversalArguments const &args = *static_cast<GemmUniversalArguments const *>(arguments);
diff --git a/tools/library/src/reference/gemm_s8_s8_s32.cu b/tools/library/src/reference/gemm_s8_s8_s32.cu
index 8c661b98..939450f7 100644
--- a/tools/library/src/reference/gemm_s8_s8_s32.cu
+++ b/tools/library/src/reference/gemm_s8_s8_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/gemm_u8_u8_s32.cu b/tools/library/src/reference/gemm_u8_u8_s32.cu
index f18f7e64..b3b1acf8 100644
--- a/tools/library/src/reference/gemm_u8_u8_s32.cu
+++ b/tools/library/src/reference/gemm_u8_u8_s32.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/reference/initialize_reference_operations.cu b/tools/library/src/reference/initialize_reference_operations.cu
index b097d580..f5dcc67c 100644
--- a/tools/library/src/reference/initialize_reference_operations.cu
+++ b/tools/library/src/reference/initialize_reference_operations.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/singleton.cu b/tools/library/src/singleton.cu
index e844083a..47d875ef 100644
--- a/tools/library/src/singleton.cu
+++ b/tools/library/src/singleton.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/library/src/sparse_gemm_operation_3x.hpp b/tools/library/src/sparse_gemm_operation_3x.hpp
index 8bfc41d7..2fc51ff6 100644
--- a/tools/library/src/sparse_gemm_operation_3x.hpp
+++ b/tools/library/src/sparse_gemm_operation_3x.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -398,8 +398,7 @@ public:
       void const *arguments_ptr,
       void *host_workspace,
       void *device_workspace,
-      cudaStream_t stream = nullptr,
-      bool launch_with_pdl = false) const override {
+      cudaStream_t stream = nullptr) const override {
 
     OperatorArguments operator_args;
 
@@ -421,7 +420,8 @@ public:
 
     Operator *op = static_cast<Operator *>(host_workspace);
     // We need to call initialize() since we have to rebuild TMA desc for every new set of args
-    status = op->run(operator_args, device_op_workspace_ptr, stream, nullptr, launch_with_pdl);
+    status = op->run(operator_args, device_op_workspace_ptr, stream, nullptr, 
+                     static_cast<GemmUniversalArguments const *>(arguments_ptr)->use_pdl);
     return status;
   }
 
diff --git a/tools/library/src/symm_operation.h b/tools/library/src/symm_operation.h
index aeb06caf..c95d238a 100644
--- a/tools/library/src/symm_operation.h
+++ b/tools/library/src/symm_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -221,6 +221,10 @@ protected:
     operator_args.batch_stride_C = arguments->batch_stride_C;
     operator_args.batch_stride_D = arguments->batch_stride_D;
     
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -312,13 +316,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-    
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
-
+    cudaStream_t stream = nullptr) const {
     OperatorArguments args;
     
     Status status = update_arguments_(
diff --git a/tools/library/src/trmm_operation.h b/tools/library/src/trmm_operation.h
index 88c4f7ab..d4197237 100644
--- a/tools/library/src/trmm_operation.h
+++ b/tools/library/src/trmm_operation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -215,6 +215,10 @@ protected:
     operator_args.ptr_D = arguments->D;
     operator_args.batch_stride_D = arguments->batch_stride_D;
 
+    if (arguments->use_pdl) {
+      return Status::kErrorNotSupported; 
+    }
+
     return Status::kSuccess;
   }
 
@@ -304,12 +308,7 @@ public:
     void const *arguments_ptr,
     void *host_workspace, 
     void *device_workspace = nullptr, 
-    cudaStream_t stream = nullptr,
-    bool launch_with_pdl = false) const {
-
-    if (launch_with_pdl) {
-      return Status::kErrorNotSupported;
-    }
+    cudaStream_t stream = nullptr) const {
 
     OperatorArguments args;
     
diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu
index dee94810..f841781a 100644
--- a/tools/library/src/util.cu
+++ b/tools/library/src/util.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -1052,6 +1052,53 @@ RasterOrder from_string<RasterOrder>(std::string const &str) {
 
   return RasterOrder::kInvalid;
 }
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+static struct {
+  char const *text;
+  char const *pretty;
+  char const *character;
+  bool enumerant;
+}
+Bool_enumerants[] = {
+  {"true", "<true>", "t", true},
+  {"false", "<false>", "f", false},
+};
+
+/// Converts a bool to a string ("true"/"false", or "<true>"/"<false>" when pretty is set)
+char const *to_string(bool type, bool pretty) {
+
+  for (auto const & possible : Bool_enumerants) {
+    if (type == possible.enumerant) {
+      if (pretty) {
+        return possible.pretty;
+      }
+      else {
+        return possible.text;
+      }
+    }
+  }
+
+  return pretty ? "Invalid" : "invalid";
+}
+
+
+/// Converts a string to a bool; strings that match no known spelling yield false
+template <>
+bool from_string<bool>(std::string const &str) {
+
+  for (auto const & possible : Bool_enumerants) {
+    if ((str.compare(possible.text) == 0) ||
+        (str.compare(possible.pretty) == 0) ||
+        (str.compare(possible.character) == 0)) {
+      return possible.enumerant;
+    }
+  }
+
+  return false;
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 /// Lexical cast a string to a byte array. Returns true if cast is successful or false if invalid.
diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt
index d71caf41..7038289d 100644
--- a/tools/profiler/CMakeLists.txt
+++ b/tools/profiler/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/conv2d_operation_profiler.h b/tools/profiler/include/cutlass/profiler/conv2d_operation_profiler.h
index 32d79211..683465f5 100644
--- a/tools/profiler/include/cutlass/profiler/conv2d_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/conv2d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/conv3d_operation_profiler.h b/tools/profiler/include/cutlass/profiler/conv3d_operation_profiler.h
index 2ce0a1c2..ac4abdef 100644
--- a/tools/profiler/include/cutlass/profiler/conv3d_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/conv3d_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/cublas_helpers.h b/tools/profiler/include/cutlass/profiler/cublas_helpers.h
index 10642e5f..873ba1ab 100644
--- a/tools/profiler/include/cutlass/profiler/cublas_helpers.h
+++ b/tools/profiler/include/cutlass/profiler/cublas_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/cudnn_helpers.h b/tools/profiler/include/cutlass/profiler/cudnn_helpers.h
index 7bee15d3..7ce9eea5 100644
--- a/tools/profiler/include/cutlass/profiler/cudnn_helpers.h
+++ b/tools/profiler/include/cutlass/profiler/cudnn_helpers.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/cutlass_profiler.h b/tools/profiler/include/cutlass/profiler/cutlass_profiler.h
index c5fdc9e3..be822453 100644
--- a/tools/profiler/include/cutlass/profiler/cutlass_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/cutlass_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/debug.h b/tools/profiler/include/cutlass/profiler/debug.h
index 1c4bb7c4..98f1fdc3 100644
--- a/tools/profiler/include/cutlass/profiler/debug.h
+++ b/tools/profiler/include/cutlass/profiler/debug.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/device_allocation.h b/tools/profiler/include/cutlass/profiler/device_allocation.h
index 97a1e722..488b635c 100644
--- a/tools/profiler/include/cutlass/profiler/device_allocation.h
+++ b/tools/profiler/include/cutlass/profiler/device_allocation.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/device_context.h b/tools/profiler/include/cutlass/profiler/device_context.h
index 19fc42c5..0443b340 100644
--- a/tools/profiler/include/cutlass/profiler/device_context.h
+++ b/tools/profiler/include/cutlass/profiler/device_context.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/enumerated_types.h b/tools/profiler/include/cutlass/profiler/enumerated_types.h
index 3e6efa48..897311c2 100644
--- a/tools/profiler/include/cutlass/profiler/enumerated_types.h
+++ b/tools/profiler/include/cutlass/profiler/enumerated_types.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/gemm_operation_profiler.h b/tools/profiler/include/cutlass/profiler/gemm_operation_profiler.h
index b103e3db..b87b73f8 100644
--- a/tools/profiler/include/cutlass/profiler/gemm_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/gemm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -92,6 +92,8 @@ public:
     std::vector<uint8_t> alpha_one;
     std::vector<uint8_t> beta_zero;
 
+    bool use_pdl{false};
+
     //
     // Methods
     //
diff --git a/tools/profiler/include/cutlass/profiler/gpu_timer.h b/tools/profiler/include/cutlass/profiler/gpu_timer.h
index 815b6af1..15404529 100644
--- a/tools/profiler/include/cutlass/profiler/gpu_timer.h
+++ b/tools/profiler/include/cutlass/profiler/gpu_timer.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/operation_profiler.h b/tools/profiler/include/cutlass/profiler/operation_profiler.h
index 7e3005fe..185e6f03 100644
--- a/tools/profiler/include/cutlass/profiler/operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -239,6 +239,21 @@ protected:
     void *host_workspace,
     void *device_workspace);
 
+  /// Profiles the GPU kernel launched in `func` running simultaneously on all
+  /// requested devices.
+  Status profile_kernel_(
+    PerformanceResult &result,
+    Options const &options,
+    const std::function<Status(int, cudaStream_t, int)> &func,
+    const std::vector<cudaStream_t> &streams);
+
+  /// Profiles the GPU kernel launched in `func` on the `stream`
+  Status profile_kernel_(
+    PerformanceResult &result,
+    Options const &options,
+    const std::function<Status(cudaStream_t, int)> &func,
+    cudaStream_t stream = nullptr);
+
 private:
   /// finds string matches filter_string in operation_name
   bool find_string_matches_(
diff --git a/tools/profiler/include/cutlass/profiler/options.h b/tools/profiler/include/cutlass/profiler/options.h
index 6093f49b..449aa70e 100644
--- a/tools/profiler/include/cutlass/profiler/options.h
+++ b/tools/profiler/include/cutlass/profiler/options.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -199,8 +199,15 @@ public:
     int warmup_iterations{10};
 
     /// Number of iterations to profile each kernel - if 0, kernels are launched up to the profiling duration
+    /// This will always override profiling-duration and min-iterations.
     int iterations{100};
 
+    /// Time to spend profiling each kernel (ms)
+    int duration{10};
+
+    /// Minimum number of iterations to profile
+    int min_iterations{10};
+
     /// Number of ms to sleep between profiling periods (ms)
     int sleep_duration{50};
 
diff --git a/tools/profiler/include/cutlass/profiler/performance_report.h b/tools/profiler/include/cutlass/profiler/performance_report.h
index 6ea11800..5228ff42 100644
--- a/tools/profiler/include/cutlass/profiler/performance_report.h
+++ b/tools/profiler/include/cutlass/profiler/performance_report.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/performance_result.h b/tools/profiler/include/cutlass/profiler/performance_result.h
index 4b9a3321..ce1aba7e 100644
--- a/tools/profiler/include/cutlass/profiler/performance_result.h
+++ b/tools/profiler/include/cutlass/profiler/performance_result.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/problem_space.h b/tools/profiler/include/cutlass/profiler/problem_space.h
index 00391c9b..8a5f001d 100644
--- a/tools/profiler/include/cutlass/profiler/problem_space.h
+++ b/tools/profiler/include/cutlass/profiler/problem_space.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -873,6 +873,13 @@ bool arg_as_int(
   ProblemSpace const &problem_space, 
   ProblemSpace::Problem const &problem);
 
+bool arg_as_bool(bool &bool_value, KernelArgument::Value const *value_ptr);
+
+bool arg_as_bool(bool &bool_value,
+  char const *name,
+  ProblemSpace const &problem_space, 
+  ProblemSpace::Problem const &problem);
+
 /// Lexically casts an argument to an int64 if it is defined. Returns true if not null.
 bool arg_as_NumericTypeID(library::NumericTypeID &numeric_type, KernelArgument::Value const *value_ptr);
 
diff --git a/tools/profiler/include/cutlass/profiler/rank_2k_operation_profiler.h b/tools/profiler/include/cutlass/profiler/rank_2k_operation_profiler.h
index c4058452..ba47a683 100644
--- a/tools/profiler/include/cutlass/profiler/rank_2k_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/rank_2k_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/rank_k_operation_profiler.h b/tools/profiler/include/cutlass/profiler/rank_k_operation_profiler.h
index 67f77ebb..fff190a7 100644
--- a/tools/profiler/include/cutlass/profiler/rank_k_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/rank_k_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/reduction_operation_profiler.h b/tools/profiler/include/cutlass/profiler/reduction_operation_profiler.h
index 823457c3..0c81ef46 100644
--- a/tools/profiler/include/cutlass/profiler/reduction_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/reduction_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/sparse_gemm_operation_profiler.h b/tools/profiler/include/cutlass/profiler/sparse_gemm_operation_profiler.h
index 7e4131f7..60204d8c 100644
--- a/tools/profiler/include/cutlass/profiler/sparse_gemm_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/sparse_gemm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/symm_operation_profiler.h b/tools/profiler/include/cutlass/profiler/symm_operation_profiler.h
index b10a6028..94ded5e8 100644
--- a/tools/profiler/include/cutlass/profiler/symm_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/symm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h b/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h
index 392c9959..9f21dafa 100644
--- a/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h
+++ b/tools/profiler/include/cutlass/profiler/trmm_operation_profiler.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/conv2d_operation_profiler.cu b/tools/profiler/src/conv2d_operation_profiler.cu
index 9589c0ca..0d1559ec 100644
--- a/tools/profiler/src/conv2d_operation_profiler.cu
+++ b/tools/profiler/src/conv2d_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -1281,8 +1281,6 @@ Status Conv2dOperationProfiler::profile_cutlass_(
   void *host_workspace,
   void *device_workspace) {
 
-  GpuTimer timer;
-
   // initialize conv2d underlying operation to handle parallel reduction
   library::Operation const* underlying_operation = operation;
 
@@ -1294,23 +1292,9 @@ Status Conv2dOperationProfiler::profile_cutlass_(
     }
   }
 
-  //
-  // Optional sleep to limit power consumption and thermals
-  //
-
-  sleep(options.profiling.sleep_duration);
-
-  //
-  // Warmup loop
-  //
-
-  Status status;
-
-  for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
-
+  auto func = [&](cudaStream_t, int iteration) {
     // Setup rotating workspace
-    int workspace_idx = options.profiling.warmup_iterations + iteration;
-    int problem_idx = (workspace_idx % conv_workspace_.problem_count);
+    int problem_idx = iteration % conv_workspace_.problem_count;
 
     conv_arguments->A = conv_workspace_.A->batch_data(problem_idx);
     conv_arguments->B = conv_workspace_.B->batch_data(problem_idx);
@@ -1328,7 +1312,7 @@ Status Conv2dOperationProfiler::profile_cutlass_(
     }
 
     // Run underlying conv2d operation
-    status = underlying_operation->run(
+    Status status = underlying_operation->run(
       arguments,
       host_workspace,
       device_workspace);
@@ -1345,74 +1329,10 @@ Status Conv2dOperationProfiler::profile_cutlass_(
     if (status != Status::kSuccess) {
       return status;
     }
-  }
+    return status;
+  };
 
-  //
-  // Initialize GPU timer
-  //
-
-  timer.start();
-
-  //
-  // Profiling loop
-  //
-
-  int Iterations = options.profiling.iterations;
-
-  int iteration = 0;
-  for (; iteration < Iterations; ++iteration) {
-
-    // Setup rotating workspace
-    int problem_idx = (iteration % conv_workspace_.problem_count);
-
-    conv_arguments->A = conv_workspace_.A->batch_data(problem_idx);
-    conv_arguments->B = conv_workspace_.B->batch_data(problem_idx);
-    conv_arguments->C = conv_workspace_.C->batch_data(problem_idx);
-    conv_arguments->D = conv_workspace_.Computed->batch_data(problem_idx);
-
-    if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
-      // update library::ConvArguments for parallel split-k reduction
-      conv_arguments->D = conv_workspace_.device_workspace.data();
-
-      /// initialize library::ReductionArguments
-      conv_workspace_.reduction_arguments.workspace           = conv_workspace_.device_workspace.data();
-      conv_workspace_.reduction_arguments.source              = conv_workspace_.C->batch_data(problem_idx);
-      conv_workspace_.reduction_arguments.destination         = conv_workspace_.Computed->batch_data(problem_idx);
-    }
-
-    // Run underlying conv2d operation
-    status = underlying_operation->run(
-      arguments,
-      host_workspace,
-      device_workspace);
-
-    // Run parallel reduction kernel for parallel split_k_mode
-    if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
-
-      status = reduction_op_->run(
-        &conv_workspace_.reduction_arguments,
-        conv_workspace_.reduction_host_workspace.data(),
-        nullptr);
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-  }
-
-  //
-  // Wait for completion
-  //
-
-  timer.stop_and_wait();
-
-  //
-  // Update performance result
-  //
-
-  result.runtime = timer.duration(iteration);
-
-  return status;
+  return profile_kernel_(result, options, func);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/conv3d_operation_profiler.cu b/tools/profiler/src/conv3d_operation_profiler.cu
index 04d338c3..ca5b8531 100644
--- a/tools/profiler/src/conv3d_operation_profiler.cu
+++ b/tools/profiler/src/conv3d_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -1148,8 +1148,6 @@ Status Conv3dOperationProfiler::profile_cutlass_(
   void *host_workspace,
   void *device_workspace) {
 
-  GpuTimer timer;
-
   // initialize conv2d underlying operation to handle parallel reduction
   library::Operation const* underlying_operation = operation;
 
@@ -1159,68 +1157,14 @@ Status Conv3dOperationProfiler::profile_cutlass_(
     }
   }
 
-  //
-  // Optional sleep to limit power consumption and thermals
-  //
-
-  sleep(options.profiling.sleep_duration);
-
-  //
-  // Warmup loop
-  //
-
-  Status status;
-
-  for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
-
+  auto func = [&](cudaStream_t, int iteration) {
     // Setup rotating workspace
-    int workspace_idx = options.profiling.warmup_iterations + iteration;
-    int problem_idx = (workspace_idx % conv_workspace_.problem_count);
+    int problem_idx = iteration % conv_workspace_.problem_count;
 
     set_cutlass_operator_arguments_(problem_idx);
 
     // Run underlying conv2d operation
-    status = underlying_operation->run(
-      arguments,
-      host_workspace,
-      device_workspace);
-
-    // Run parallel reduction kernel for parallel split_k_mode
-    if (conv_workspace_.configuration.split_k_mode == conv::SplitKMode::kParallel) {
-
-      status = reduction_op_->run(
-        &conv_workspace_.reduction_arguments,
-        conv_workspace_.reduction_host_workspace.data(),
-        nullptr);
-    }
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-  }
-
-  //
-  // Initialize GPU timer
-  //
-
-  timer.start();
-
-  //
-  // Profiling loop
-  //
-
-  int Iterations = options.profiling.iterations;
-
-  int iteration = 0;
-  for (; iteration < Iterations; ++iteration) {
-
-    // Setup rotating workspace
-    int problem_idx = (iteration % conv_workspace_.problem_count);
-
-    set_cutlass_operator_arguments_(problem_idx);
-
-    // Run underlying conv2d operation
-    status = underlying_operation->run(
+    Status status = underlying_operation->run(
       arguments,
       host_workspace,
       device_workspace);
@@ -1236,21 +1180,11 @@ Status Conv3dOperationProfiler::profile_cutlass_(
     if (status != Status::kSuccess) {
       return status;
     }
-  }
 
-  //
-  // Wait for completion
-  //
+    return status;
+  };
 
-  timer.stop_and_wait();
-
-  //
-  // Update performance result
-  //
-
-  result.runtime = timer.duration(iteration);
-
-  return status;
+  return profile_kernel_(result, options, func);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/cublas_helpers.cu b/tools/profiler/src/cublas_helpers.cu
index 412b0a24..612ccfc5 100644
--- a/tools/profiler/src/cublas_helpers.cu
+++ b/tools/profiler/src/cublas_helpers.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/cudnn_helpers.cpp b/tools/profiler/src/cudnn_helpers.cpp
index 418b69f6..1d67442e 100644
--- a/tools/profiler/src/cudnn_helpers.cpp
+++ b/tools/profiler/src/cudnn_helpers.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu
index d9aae932..b4070046 100644
--- a/tools/profiler/src/cutlass_profiler.cu
+++ b/tools/profiler/src/cutlass_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu
index a1866b55..f06b9607 100644
--- a/tools/profiler/src/device_allocation.cu
+++ b/tools/profiler/src/device_allocation.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu
index eaca07b0..b90b2ee1 100644
--- a/tools/profiler/src/device_context.cu
+++ b/tools/profiler/src/device_context.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp
index 8e8ebe98..5bfa0b28 100644
--- a/tools/profiler/src/enumerated_types.cpp
+++ b/tools/profiler/src/enumerated_types.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu
index 1bed599f..80d346a0 100644
--- a/tools/profiler/src/gemm_operation_profiler.cu
+++ b/tools/profiler/src/gemm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -76,6 +76,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options):
       {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"},
       {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of GEMMs computed in one batch"},
       {ArgumentTypeID::kEnumerated, {"raster_order", "raster-order"}, "Raster order (heuristic, along_n, along_m)"},
+      {ArgumentTypeID::kInteger, {"use_pdl", "use-pdl"}, "Use PDL (true, false)"}, 
       {ArgumentTypeID::kInteger, {"swizzle_size", "swizzle-size"}, "Size to swizzle"},
     },
     { library::Provider::kCUBLAS}
@@ -171,6 +172,11 @@ Status GemmOperationProfiler::GemmProblem::parse(
     this->k = 1024;
   }
 
+  if (!arg_as_bool(this->use_pdl, "use_pdl", problem_space, problem)) {
+    // default value
+    this->use_pdl = false;
+  }
+
   if (!arg_as_SplitKModeID(this->split_k_mode, "split_k_mode", problem_space, problem)) {
     // default value
     this->split_k_mode = library::SplitKMode::kSerial;
@@ -337,6 +343,7 @@ void GemmOperationProfiler::GemmProblem::initialize_result(
   set_argument(result, "batch_count", problem_space, batch_count);
   set_argument(result, "raster_order", problem_space, library::to_string(raster_order));
   set_argument(result, "swizzle_size", problem_space, swizzle_size);
+  set_argument(result, "use_pdl", problem_space, library::to_string(use_pdl));
 
   set_argument(result, "alpha", problem_space,
     library::lexical_cast(alpha, operation_desc.element_epilogue));
@@ -388,6 +395,7 @@ Status GemmOperationProfiler::initialize_configuration(
 
     gemm_workspace_[i].configuration.device_count = static_cast<int>(device_count);
     gemm_workspace_[i].arguments.device_index = static_cast<int>(i);
+    gemm_workspace_[i].arguments.use_pdl = problem_.use_pdl;
 
     if (problem_.mode == library::GemmUniversalMode::kBatched) {
       gemm_workspace_[i].configuration.batch_count = problem_.batch_count;
@@ -396,6 +404,16 @@ Status GemmOperationProfiler::initialize_configuration(
       gemm_workspace_[i].configuration.batch_count = problem_.split_k_slices;
     }
 
+    gemm_workspace_[i].arguments.problem_size.m() = int(problem_.m);
+    gemm_workspace_[i].arguments.problem_size.n() = int(problem_.n);
+    gemm_workspace_[i].arguments.problem_size.k() = int(problem_.k);
+    if (problem_.mode == library::GemmUniversalMode::kBatched) {
+      gemm_workspace_[i].arguments.batch_count = problem_.batch_count;
+    }
+    else {
+      gemm_workspace_[i].arguments.batch_count = problem_.split_k_slices;
+    }
+
     gemm_workspace_[i].arguments.A = nullptr;
     gemm_workspace_[i].arguments.B = nullptr;
     gemm_workspace_[i].arguments.C = nullptr;
@@ -406,7 +424,6 @@ Status GemmOperationProfiler::initialize_configuration(
     gemm_workspace_[i].arguments.swizzle_size = problem_.swizzle_size;
     gemm_workspace_[i].arguments.raster_order = problem_.raster_order;
     initialize_result_(this->model_result_, options, operation_desc, problem_space);
-
     if (const auto can_implement = operation->can_implement(&gemm_workspace_[i].configuration, &gemm_workspace_[i].arguments); can_implement != Status::kSuccess) {
       return can_implement;
     }
@@ -1145,18 +1162,6 @@ bool GemmOperationProfiler::verify_with_reference_(
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-namespace {
-extern "C" {
-  __global__ void delay(cuda::atomic<bool> const* release) {
-    while (release->load(cuda::memory_order_acquire) != true) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
-      __nanosleep(100);
-#endif
-    }
-  }
-}
-}
-
 /// Measures performance results
 bool GemmOperationProfiler::profile(
   Options const &options,
@@ -1219,15 +1224,6 @@ Status GemmOperationProfiler::profile_cutlass_(
   void *,
   void *) {
 
-  cuda::atomic<bool> *release;
-  cudaHostAlloc(&release, sizeof(*release), cudaHostAllocPortable);
-  release->store(false, cuda::memory_order_release);
-
-  std::vector<GpuTimer> timer;
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    timer.emplace_back();
-  }
   // initialize gemm underlying operation to handle parallel reduction
   library::Operation const * underlying_operation = operation;
 
@@ -1237,172 +1233,58 @@ Status GemmOperationProfiler::profile_cutlass_(
     }
   }
 
-  //
-  // Optional sleep to limit power consumption and thermals
-  //
+  auto launch_gemm = [&](int dev_id, cudaStream_t stream, int iteration) {
+    int problem_idx = (iteration % gemm_workspace_[dev_id].problem_count) * problem_.batch_count;  // cycle through the problem copies
 
-  sleep(options.profiling.sleep_duration);
+    gemm_workspace_[dev_id].arguments.A = gemm_workspace_[dev_id].A->batch_data(problem_idx);
+    gemm_workspace_[dev_id].arguments.B = gemm_workspace_[dev_id].B->batch_data(problem_idx);
+    gemm_workspace_[dev_id].arguments.C = gemm_workspace_[dev_id].C->batch_data(problem_idx);
+    gemm_workspace_[dev_id].arguments.D = gemm_workspace_[dev_id].Computed->batch_data(problem_idx);
 
-  //
-  // Warmup loop
-  //
+    if (problem_.split_k_mode == library::SplitKMode::kParallel) {
+      gemm_workspace_[dev_id].arguments.D                     = gemm_workspace_[dev_id].device_workspace.data();
 
-  Status status;
+      gemm_workspace_[dev_id].reduction_arguments.workspace   = gemm_workspace_[dev_id].device_workspace.data();
+      gemm_workspace_[dev_id].reduction_arguments.source      = gemm_workspace_[dev_id].C->batch_data(problem_idx);
+      gemm_workspace_[dev_id].reduction_arguments.destination = gemm_workspace_[dev_id].Computed->batch_data(problem_idx);
+    }
 
-  std::vector<cudaGraph_t> graphs;
-  graphs.resize(gemm_workspace_.size());
-  std::vector<cudaGraphExec_t> graphExecs;
-  graphExecs.resize(gemm_workspace_.size());
+    // Execute the CUTLASS operation on the stream supplied by the caller
+    Status status = underlying_operation->run(
+      &gemm_workspace_[dev_id].arguments,
+      gemm_workspace_[dev_id].host_workspace.data(),
+      gemm_workspace_[dev_id].device_workspace.data(),
+      stream);
 
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    cudaStreamBeginCapture(gemm_workspace_[i].stream, cudaStreamCaptureModeGlobal);
-    // Halt execution until all GPUs are ready to precede.
-    // It allows the CPU to trigger the GPUs all start at the same time.
-    delay<<<1, 1, 0, gemm_workspace_[i].stream>>>(release);
-    for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
-      int problem_idx = (iteration % gemm_workspace_[i].problem_count) * problem_.batch_count;
+    if (status != Status::kSuccess) {
+      return status;
+    }
 
-      gemm_workspace_[i].arguments.A = gemm_workspace_[i].A->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.B = gemm_workspace_[i].B->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.C = gemm_workspace_[i].C->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.D = gemm_workspace_[i].Computed->batch_data(problem_idx);
-
-      if (problem_.split_k_mode == library::SplitKMode::kParallel) {
-        gemm_workspace_[i].arguments.D                     = gemm_workspace_[i].device_workspace.data();
-
-        gemm_workspace_[i].reduction_arguments.workspace   = gemm_workspace_[i].device_workspace.data();
-        gemm_workspace_[i].reduction_arguments.source      = gemm_workspace_[i].C->batch_data(problem_idx);
-        gemm_workspace_[i].reduction_arguments.destination = gemm_workspace_[i].Computed->batch_data(problem_idx);
-      }
-
-      // Execute the CUTLASS operation
-      status = underlying_operation->run(
-        &gemm_workspace_[i].arguments,
-        gemm_workspace_[i].host_workspace.data(),
-        gemm_workspace_[i].device_workspace.data(),
-        gemm_workspace_[i].stream);
+    // Run parallel reduction kernel for parallel split_k_mode, on the same stream as the GEMM above
+    if (problem_.split_k_mode == library::SplitKMode::kParallel) {
+      status = reduction_op_->run(
+        &gemm_workspace_[dev_id].reduction_arguments,
+        gemm_workspace_[dev_id].reduction_host_workspace.data(),
+        nullptr,
+        stream);
 
       if (status != Status::kSuccess) {
         return status;
       }
-
-      // Run parallel reduction kernel for parallel split_k_mode
-      if (problem_.split_k_mode == library::SplitKMode::kParallel) {
-        status = reduction_op_->run(
-          &gemm_workspace_[i].reduction_arguments,
-          gemm_workspace_[i].reduction_host_workspace.data(),
-          nullptr,
-          gemm_workspace_[i].stream);
-
-        if (status != Status::kSuccess) {
-          return status;
-        }
-      }
     }
+    return Status::kSuccess;
+  };
 
-    //
-    // Initialize GPU timer
-    //
-
-    timer[i].start(gemm_workspace_[i].stream, cudaEventRecordExternal);
-
-    //
-    // Profiling loop
-    //
-
-    int Iterations = options.profiling.iterations;
-
-    int iteration = 0;
-
-    for (; iteration < Iterations; ++iteration) {
-      // Iterate over copies of the problem in memory
-      int workspace_idx = options.profiling.warmup_iterations + iteration;
-      int problem_idx = (workspace_idx % gemm_workspace_[i].problem_count) * problem_.batch_count;
-
-      gemm_workspace_[i].arguments.A = gemm_workspace_[i].A->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.B = gemm_workspace_[i].B->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.C = gemm_workspace_[i].C->batch_data(problem_idx);
-      gemm_workspace_[i].arguments.D = gemm_workspace_[i].Computed->batch_data(problem_idx);
-
-      if (problem_.split_k_mode == library::SplitKMode::kParallel) {
-        gemm_workspace_[i].arguments.D                     = gemm_workspace_[i].device_workspace.data();
-
-        gemm_workspace_[i].reduction_arguments.workspace   = gemm_workspace_[i].device_workspace.data();
-        gemm_workspace_[i].reduction_arguments.source      = gemm_workspace_[i].C->batch_data(problem_idx);
-        gemm_workspace_[i].reduction_arguments.destination = gemm_workspace_[i].Computed->batch_data(problem_idx);
-      }
-
-      status = underlying_operation->run(
-        &gemm_workspace_[i].arguments,
-        gemm_workspace_[i].host_workspace.data(),
-        gemm_workspace_[i].device_workspace.data(),
-        gemm_workspace_[i].stream);
-
-      if (status != Status::kSuccess) {
-        return status;
-      }
-
-      // Run parallel reduction kernel for parallel split_k_mode
-      if (problem_.split_k_mode == library::SplitKMode::kParallel) {
-        status = reduction_op_->run(
-          &gemm_workspace_[i].reduction_arguments,
-          gemm_workspace_[i].reduction_host_workspace.data(),
-          nullptr,
-          gemm_workspace_[i].stream);
-
-        if (status != Status::kSuccess) {
-          return status;
-        }
-      }
-    }
-    timer[i].stop(gemm_workspace_[i].stream, cudaEventRecordExternal);
-    cudaStreamEndCapture(gemm_workspace_[i].stream, &graphs[i]);
-    cudaGraphInstantiate(&graphExecs[i], graphs[i], nullptr, nullptr, 0);
+  if (options.device.devices.size() == 1) {
+    auto func = [&](cudaStream_t stream, int iteration) { return launch_gemm(0, stream, iteration); };
+    return profile_kernel_(result, options, func, gemm_workspace_[0].stream);
   }
 
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    cudaGraphLaunch(graphExecs[i], gemm_workspace_[i].stream);
+  std::vector<cudaStream_t> streams(gemm_workspace_.size());
+  for (size_t i = 0; i < streams.size(); i++) {
+    streams[i] = gemm_workspace_[i].stream;
   }
-
-  //
-  // Wait for completion
-  //
-
-  release->store(true, cuda::memory_order_release);
-
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    cudaStreamSynchronize(gemm_workspace_[i].stream);
-  }
-  //
-  // Update performance result
-  //
-
-
-  result.runtime = 0;
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    result.runtime_vector[i] = timer[i].duration(options.profiling.iterations);
-    result.runtime += result.runtime_vector[i];
-  }
-  result.runtime /= static_cast<double>(gemm_workspace_.size());
-
-  cudaFreeHost(release);
-
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(i));
-    cudaGraphExecDestroy(graphExecs[i]);
-    cudaGraphDestroy(graphs[i]);
-  }
-
-  for (size_t i = 0; i < gemm_workspace_.size(); ++i) {
-    cudaSetDevice(options.device.device_id(gemm_workspace_.size() - i - 1));
-    timer.pop_back();
-  }
-
-  return status;
+  return profile_kernel_(result, options, launch_gemm, streams);
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp
index cd0e4df0..4b915367 100644
--- a/tools/profiler/src/gpu_timer.cpp
+++ b/tools/profiler/src/gpu_timer.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp
index 1e46d15a..8e941516 100644
--- a/tools/profiler/src/main.cpp
+++ b/tools/profiler/src/main.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu
index 4d5c9d09..d11009ce 100644
--- a/tools/profiler/src/operation_profiler.cu
+++ b/tools/profiler/src/operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -47,6 +47,8 @@
 // sleep not supported
 #endif
 
+#include <cuda/atomic>
+
 #include "cutlass/profiler/options.h"
 #include "cutlass/profiler/operation_profiler.h"
 #include "cutlass/profiler/gpu_timer.h"
@@ -55,9 +57,18 @@
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+#define CUDA_CHECK(call)                                                                                               \
+  do {                                                                                                                 \
+    cudaError_t err = call;                                                                                            \
+    if (err != cudaSuccess) {                                                                                          \
+      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << " code=" << err << " \""                         \
+                << cudaGetErrorString(err) << "\"\n";                                                                  \
+      return Status::kErrorInternal;                                                                                   \
+    }                                                                                                                  \
+  } while (0)
+
 namespace cutlass {
 namespace profiler {
-
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 OperationProfiler::OperationProfiler(): kind_(library::OperationKind::kInvalid) { }
@@ -656,6 +667,203 @@ void OperationProfiler::save_workspace(
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
+namespace {
+extern "C" {
+/// Spin-wait kernel: returns only once the flag at `release` reads true.
+__global__ void delay(cuda::atomic<bool> const *release) {
+  while (release->load(cuda::memory_order_acquire) != true) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
+    __nanosleep(100);
+#endif
+  }
+}
+}
+/// Sets `iterations` to the number of timed iterations; may call `func(stream, i)` a few times to calibrate.
+Status predict_iters(
+  int &iterations,
+  Options const &options,
+  const std::function<Status(cudaStream_t, int)> &func,
+  cudaStream_t stream) {
+  // always use profiling-iterations if requested
+  if (options.profiling.iterations != 0) {
+    iterations = options.profiling.iterations;
+    return Status::kSuccess;
+  }
+
+  // otherwise time a few calibration calls and run as many iterations as necessary to meet profiling-duration
+  constexpr int CALIBRATION_ITERS = 5;
+  GpuTimer timer;
+  timer.start(stream);
+  for (int i = 0; i < CALIBRATION_ITERS; i++) {
+    Status status = func(stream, i);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+  }
+  timer.stop_and_wait(stream);
+
+  double est_iters           = options.profiling.duration / std::max(timer.duration(CALIBRATION_ITERS), 1e-6);
+  constexpr double MAX_ITERS = 1'000'000;
+  // Clamp while still in floating point: converting a negative or out-of-range double to an integer is undefined.
+  iterations = std::max(options.profiling.min_iterations, static_cast<int>(std::min(std::max(std::ceil(est_iters), 0.0), MAX_ITERS)));
+  return Status::kSuccess;
+}
+
+} // namespace
+
+/// This profiling method is designed to run a kernel on several GPUs to
+/// measure interference (e.g. due to power throttling).
+/// To encourage the kernels to start at the same time and minimize jitter,
+/// a spinloop kernel blocks each stream while work is being enqueued, which is
+/// later triggered from the host.
+/// CUDA graphs allow you to record the launch of large numbers of kernels without
+/// blocking and therefore avoid a deadlock which happens if you try to enqueue too
+/// many kernels behind the spinloop kernel.
+/// `func(i, streams[i], iteration)` enqueues one iteration for the i-th entry of `options.device`;
+/// on success `result.runtime_vector[i]` holds each device's timing and `result.runtime` their mean.
+Status OperationProfiler::profile_kernel_(
+  PerformanceResult &result,
+  Options const &options,
+  const std::function<Status(int, cudaStream_t, int)> &func,
+  const std::vector<cudaStream_t> &streams) {
+  auto dev_count = streams.size();
+  cuda::atomic<bool> *release;
+  CUDA_CHECK(cudaHostAlloc(&release, sizeof(*release), cudaHostAllocPortable));
+  release->store(false, cuda::memory_order_release);
+  // NOTE(review): every early return below skips cudaFreeHost(release) and may leave a stream mid-capture.
+  std::vector<GpuTimer> timer;
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    timer.emplace_back();
+  }
+
+  std::vector<cudaGraph_t> graphs(dev_count);
+  std::vector<cudaGraphExec_t> graphExecs(dev_count);
+
+  sleep(options.profiling.sleep_duration);
+
+  // predict time on the first selected device (the one used with streams[0] below), which need not be CUDA device 0
+  int iterations = 0;
+  CUDA_CHECK(cudaSetDevice(options.device.device_id(0)));
+  Status status = predict_iters(
+    iterations,
+    options,
+    [&](cudaStream_t stream, int iter) { return func(0, stream, iter); },
+    streams[0]);
+  if (status != Status::kSuccess) {
+    return status;
+  }
+
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    CUDA_CHECK(cudaStreamBeginCapture(streams[i], cudaStreamCaptureModeGlobal));
+    // Halt execution until all GPUs are ready to proceed.
+    // It allows the CPU to trigger all GPUs to start at the same time.
+    delay<<<1, 1, 0, streams[i]>>>(release);
+    CUDA_CHECK(cudaGetLastError());
+    for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
+      status = func(i, streams[i], iteration);
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+
+    timer[i].start(streams[i], cudaEventRecordExternal);
+
+    for (int iteration = 0; iteration < iterations; ++iteration) {
+      status = func(i, streams[i], iteration + options.profiling.warmup_iterations);
+      if (status != Status::kSuccess) {
+        return status;
+      }
+    }
+    timer[i].stop(streams[i], cudaEventRecordExternal);
+    CUDA_CHECK(cudaStreamEndCapture(streams[i], &graphs[i]));
+    CUDA_CHECK(cudaGraphInstantiate(&graphExecs[i], graphs[i], nullptr, nullptr, 0));
+  }
+
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    CUDA_CHECK(cudaGraphLaunch(graphExecs[i], streams[i]));
+  }
+
+  // release the enqueued kernels
+  release->store(true, cuda::memory_order_release);
+
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    CUDA_CHECK(cudaStreamSynchronize(streams[i]));
+  }
+
+  result.runtime = 0;
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    result.runtime_vector[i] = timer[i].duration(iterations);
+    result.runtime += result.runtime_vector[i];
+  }
+  result.runtime /= static_cast<double>(dev_count);
+
+  CUDA_CHECK(cudaFreeHost(release));
+
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(i)));
+    CUDA_CHECK(cudaGraphExecDestroy(graphExecs[i]));
+    CUDA_CHECK(cudaGraphDestroy(graphs[i]));
+  }
+
+  for (size_t i = 0; i < dev_count; ++i) {
+    CUDA_CHECK(cudaSetDevice(options.device.device_id(dev_count - i - 1)));
+    timer.pop_back();
+  }
+
+  return Status::kSuccess;
+}
+
+/// Method to profile GPU execution time of the work enqueued by `func(stream, iteration)` on a single stream
+Status OperationProfiler::profile_kernel_(
+  PerformanceResult &result,
+  Options const &options,
+  const std::function<Status(cudaStream_t, int)> &func,
+  cudaStream_t stream) {
+  // Returns the first non-success Status produced by predict_iters or func; otherwise fills result.runtime.
+  GpuTimer timer;
+  // Optional sleep to limit power consumption and thermals
+  sleep(options.profiling.sleep_duration);
+
+  Status status = Status::kSuccess;
+  // Decide how many iterations to time (predict_iters may itself invoke func to calibrate).
+  int iterations;
+  status = predict_iters(iterations, options, func, stream);
+  if (status != Status::kSuccess) {
+    return status;
+  }
+  // Warmup: these calls are issued before the timer is started.
+  for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
+    status = func(stream, iteration);
+    if (status != Status::kSuccess) {
+      return status;
+    }
+  }
+
+  timer.start(stream);
+  // Timed loop: the index passed to func continues after the warmup indices.
+  int iteration = 0;
+  for (; iteration < iterations; ++iteration) {
+    status = func(stream, iteration + options.profiling.warmup_iterations);
+
+    if (status != Status::kSuccess) {
+      result.status = status;
+      return status;
+    }
+  }
+
+  timer.stop_and_wait(stream);
+
+  result.runtime = timer.duration(iteration);
+  result.status  = status;
+
+  return status;
+}
+
 /// Method to profile a CUTLASS Operation
 Status OperationProfiler::profile_cutlass_(
   PerformanceResult &result,
@@ -665,70 +873,8 @@ Status OperationProfiler::profile_cutlass_(
   void *host_workspace,
   void *device_workspace) {
 
-  GpuTimer timer;
-
-  //
-  // Optional sleep to limit power consumption and thermals
-  //
-
-  sleep(options.profiling.sleep_duration);
-
-  //
-  // Warmup loop
-  //
-
-  Status status;
-
-  for (int iteration = 0; iteration < options.profiling.warmup_iterations; ++iteration) {
-
-    status = operation->run(
-      arguments,
-      host_workspace,
-      device_workspace);
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-  }
-
-  //
-  // Initialize GPU timer
-  //
-
-  timer.start();
-
-  //
-  // Profiling loop
-  //
-
-  int Iterations = options.profiling.iterations;
-
-  int iteration = 0;
-  for (; iteration < Iterations; ++iteration) {
-
-    status = operation->run(
-      arguments,
-      host_workspace,
-      device_workspace);
-
-    if (status != Status::kSuccess) {
-      return status;
-    }
-  }
-
-  //
-  // Wait for completion
-  //
-
-  timer.stop_and_wait();
-
-  //
-  // Update performance result
-  //
-
-  result.runtime = timer.duration(iteration);
-
-  return status;
+  auto op = [=](cudaStream_t, int) { return operation->run(arguments, host_workspace, device_workspace); };
+  return profile_kernel_(result, options, op);
 }
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////
@@ -748,9 +894,6 @@ void OperationProfiler::initialize_result_(
   set_argument(result, "cta_m", problem_space, operation_desc.tile_description.threadblock_shape.m());
   set_argument(result, "cta_n", problem_space, operation_desc.tile_description.threadblock_shape.n());
   set_argument(result, "cta_k", problem_space, operation_desc.tile_description.threadblock_shape.k());
-  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
-  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
-  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
   set_argument(result, "stages", problem_space, operation_desc.tile_description.threadblock_stages);
   set_argument(result, "warps_m", problem_space, operation_desc.tile_description.warp_count.m());
   set_argument(result, "warps_n", problem_space, operation_desc.tile_description.warp_count.n());
diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu
index 59368e9b..4dd066fe 100644
--- a/tools/profiler/src/options.cu
+++ b/tools/profiler/src/options.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -474,6 +474,8 @@ Options::Profiling::Profiling(cutlass::CommandLine const &cmdline) {
   cmdline.get_cmd_line_argument("profiling-iterations", iterations, 100);
   cmdline.get_cmd_line_argument("sleep-duration", sleep_duration, 50);
   cmdline.get_cmd_line_argument("profiling-enabled", enabled, true);
+  cmdline.get_cmd_line_argument("profiling-duration", duration, 10);
+  cmdline.get_cmd_line_argument("min-iterations", min_iterations, 10);
 
   if (cmdline.check_cmd_line_flag("providers")) {
 
@@ -504,7 +506,17 @@ void Options::Profiling::print_usage(std::ostream &out) const {
 
     << "  --profiling-iterations=<iterations>          "
     << "    Number of iterations to profile each kernel. If zero, kernels" << end_of_line
-    << "      are launched up to the profiling duration.\n\n"
+    << "      are launched up to the profiling duration. If non-zero, this overrides" << end_of_line
+    << "      --profiling-duration and --min-iterations.\n\n"
+
+    << "  --profiling-duration=<duration>             "
+    << "    Time to spend profiling each kernel (ms)." << end_of_line
+    << "    Overriden by `profiling-iterations` when `profiling-iterations` > 0." << end_of_line
+    << "    Note that `min-iterations` must also be satisfied.\n\n"
+
+    << "  --min-iterations=<iterations>             "
+    << "    Minimum number of iterations to spend profiling each kernel, even if" << end_of_line
+    << "    `profiling-duration` has been met.\n\n"
 
     << "  --warmup-iterations=<iterations>             "
     << "    Number of iterations to execute each kernel prior to profiling.\n\n"
diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp
index 1d04f48f..50531d35 100644
--- a/tools/profiler/src/performance_report.cpp
+++ b/tools/profiler/src/performance_report.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu
index 3a21c9b5..ada9247f 100644
--- a/tools/profiler/src/performance_result.cu
+++ b/tools/profiler/src/performance_result.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp
index bd76bdbb..ced00009 100644
--- a/tools/profiler/src/problem_space.cpp
+++ b/tools/profiler/src/problem_space.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -800,6 +800,44 @@ bool arg_as_int(
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
+/// Lexically casts an argument to a bool if it is defined. Returns true if not null; throws std::runtime_error if the argument type is neither integer nor enumerated.
+bool arg_as_bool(bool &bool_value, KernelArgument::Value const *value_ptr) {
+  if (value_ptr->not_null) {
+    if (value_ptr->argument->description->type == ArgumentTypeID::kInteger) {
+      int64_t value64;
+      arg_as_int(value64, value_ptr);  // return value unused; not_null was already checked above
+      bool_value = static_cast<bool>(value64); // any non-zero integer converts to true
+    }
+    else if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) {
+      bool_value = library::from_string<bool>(  // bool is parsed from the enumerated value's `element`
+        static_cast<EnumeratedTypeArgument::EnumeratedTypeValue const *>(value_ptr)->element);
+    }
+    else {
+      throw std::runtime_error(
+        "arg_as_bool() - illegal cast. Problem space argument must be integer or enumerated");
+    }
+
+    return true;  // the argument was defined and bool_value has been written
+  }
+
+  return false;  // the argument is null; bool_value is left unmodified
+}
+
+/// Looks up the argument called `name` in the problem space and lexically casts its value in `problem` to a bool. Returns the result of arg_as_bool(bool_value, value_ptr).
+bool arg_as_bool(
+  bool &bool_value,
+  char const *name,
+  ProblemSpace const &problem_space,
+  ProblemSpace::Problem const &problem) {
+
+  size_t idx = problem_space.argument_index(name);  // index of `name` as reported by the problem space
+  KernelArgument::Value const *value_ptr = problem.at(idx).get();  // value held by `problem` at that index
+
+  return arg_as_bool(bool_value, value_ptr);
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
 /// Lexically casts an argument to an int64 if it is defined. Returns true if not null.
 bool arg_as_NumericTypeID(
   library::NumericTypeID &numeric_type, 
diff --git a/tools/profiler/src/rank_2k_operation_profiler.cu b/tools/profiler/src/rank_2k_operation_profiler.cu
index 4b547a3e..1c0845d8 100644
--- a/tools/profiler/src/rank_2k_operation_profiler.cu
+++ b/tools/profiler/src/rank_2k_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -297,6 +297,10 @@ void Rank2KOperationProfiler::RankKProblem::initialize_result(
   set_argument(result, "n", problem_space, n);
   set_argument(result, "k", problem_space, k);
 
+  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
+  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
+  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
+
   set_argument(result, "split_k_slices", problem_space, split_k_slices);
   set_argument(result, "batch_count", problem_space, batch_count);
 
diff --git a/tools/profiler/src/rank_k_operation_profiler.cu b/tools/profiler/src/rank_k_operation_profiler.cu
index 52613b8e..93be3452 100644
--- a/tools/profiler/src/rank_k_operation_profiler.cu
+++ b/tools/profiler/src/rank_k_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -286,6 +286,10 @@ void RankKOperationProfiler::RankKProblem::initialize_result(
   set_argument(result, "n", problem_space, n);
   set_argument(result, "k", problem_space, k);
 
+  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
+  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
+  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
+
   set_argument(result, "split_k_slices", problem_space, split_k_slices);
   set_argument(result, "batch_count", problem_space, batch_count);
 
diff --git a/tools/profiler/src/sparse_gemm_operation_profiler.cu b/tools/profiler/src/sparse_gemm_operation_profiler.cu
index ec14a332..35ee1a8c 100644
--- a/tools/profiler/src/sparse_gemm_operation_profiler.cu
+++ b/tools/profiler/src/sparse_gemm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -240,6 +240,10 @@ void SparseGemmOperationProfiler::SparseGemmProblem::initialize_result(
   set_argument(result, "n", problem_space, n);
   set_argument(result, "k", problem_space, k);
 
+  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
+  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
+  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
+
   set_argument(result, "split_k_slices", problem_space, split_k_slices);
   set_argument(result, "batch_count", problem_space, batch_count);
 
diff --git a/tools/profiler/src/symm_operation_profiler.cu b/tools/profiler/src/symm_operation_profiler.cu
index 80f645e7..364862c4 100644
--- a/tools/profiler/src/symm_operation_profiler.cu
+++ b/tools/profiler/src/symm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -320,6 +320,10 @@ void SymmOperationProfiler::SymmProblem::initialize_result(
   set_argument(result, "m", problem_space, m);
   set_argument(result, "n", problem_space, n);
 
+  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
+  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
+  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
+
   set_argument(result, "split_k_slices", problem_space, split_k_slices);
   set_argument(result, "batch_count", problem_space, batch_count);
 
diff --git a/tools/profiler/src/trmm_operation_profiler.cu b/tools/profiler/src/trmm_operation_profiler.cu
index 9d3b4db6..260ec787 100644
--- a/tools/profiler/src/trmm_operation_profiler.cu
+++ b/tools/profiler/src/trmm_operation_profiler.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -251,6 +251,10 @@ void TrmmOperationProfiler::TrmmProblem::initialize_result(
   set_argument(result, "m", problem_space, m);
   set_argument(result, "n", problem_space, n);
 
+  set_argument(result, "cluster_m", problem_space, operation_desc.tile_description.cluster_shape.m());
+  set_argument(result, "cluster_n", problem_space, operation_desc.tile_description.cluster_shape.n());
+  set_argument(result, "cluster_k", problem_space, operation_desc.tile_description.cluster_shape.k());
+
   set_argument(result, "split_k_slices", problem_space, split_k_slices);
   set_argument(result, "batch_count", problem_space, batch_count);
 
diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt
index 66d019b5..b69ea023 100644
--- a/tools/util/CMakeLists.txt
+++ b/tools/util/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/GPU_Clock.hpp b/tools/util/include/cutlass/util/GPU_Clock.hpp
index 2774c079..c2727c98 100644
--- a/tools/util/include/cutlass/util/GPU_Clock.hpp
+++ b/tools/util/include/cutlass/util/GPU_Clock.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h
index 9dc3a117..b60d868c 100644
--- a/tools/util/include/cutlass/util/command_line.h
+++ b/tools/util/include/cutlass/util/command_line.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/cublas_wrappers.hpp b/tools/util/include/cutlass/util/cublas_wrappers.hpp
index e6b5f550..8de1aa8e 100644
--- a/tools/util/include/cutlass/util/cublas_wrappers.hpp
+++ b/tools/util/include/cutlass/util/cublas_wrappers.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h
index 1dc059e3..88481a82 100644
--- a/tools/util/include/cutlass/util/debug.h
+++ b/tools/util/include/cutlass/util/debug.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h
index bb20e9b7..a73a8cfe 100644
--- a/tools/util/include/cutlass/util/device_dump.h
+++ b/tools/util/include/cutlass/util/device_dump.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_groupnorm.h b/tools/util/include/cutlass/util/device_groupnorm.h
index 5fc93a11..59457b2e 100644
--- a/tools/util/include/cutlass/util/device_groupnorm.h
+++ b/tools/util/include/cutlass/util/device_groupnorm.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_layernorm.h b/tools/util/include/cutlass/util/device_layernorm.h
index 7708c3eb..0fcbf5cb 100644
--- a/tools/util/include/cutlass/util/device_layernorm.h
+++ b/tools/util/include/cutlass/util/device_layernorm.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h
index 7d3fa73f..b79b3f92 100644
--- a/tools/util/include/cutlass/util/device_memory.h
+++ b/tools/util/include/cutlass/util/device_memory.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_nchw_to_nhwc.h b/tools/util/include/cutlass/util/device_nchw_to_nhwc.h
index c4619896..8e380299 100644
--- a/tools/util/include/cutlass/util/device_nchw_to_nhwc.h
+++ b/tools/util/include/cutlass/util/device_nchw_to_nhwc.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_nhwc_padding.h b/tools/util/include/cutlass/util/device_nhwc_padding.h
index 9a2317e2..f58da62a 100644
--- a/tools/util/include/cutlass/util/device_nhwc_padding.h
+++ b/tools/util/include/cutlass/util/device_nhwc_padding.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_nhwc_pooling.h b/tools/util/include/cutlass/util/device_nhwc_pooling.h
index cce452d9..5633456c 100644
--- a/tools/util/include/cutlass/util/device_nhwc_pooling.h
+++ b/tools/util/include/cutlass/util/device_nhwc_pooling.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_nhwc_to_nchw.h b/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
index 4a5f7800..babfecd3 100644
--- a/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
+++ b/tools/util/include/cutlass/util/device_nhwc_to_nchw.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_rmsnorm.h b/tools/util/include/cutlass/util/device_rmsnorm.h
index 44a1c084..0d1b1af5 100644
--- a/tools/util/include/cutlass/util/device_rmsnorm.h
+++ b/tools/util/include/cutlass/util/device_rmsnorm.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/device_utils.h b/tools/util/include/cutlass/util/device_utils.h
index 7a8378fc..9747d509 100644
--- a/tools/util/include/cutlass/util/device_utils.h
+++ b/tools/util/include/cutlass/util/device_utils.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h
index 086e033a..6565aba9 100644
--- a/tools/util/include/cutlass/util/distribution.h
+++ b/tools/util/include/cutlass/util/distribution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h
index 54c62fdb..f2b7df6c 100644
--- a/tools/util/include/cutlass/util/exceptions.h
+++ b/tools/util/include/cutlass/util/exceptions.h
@@ -1,5 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/gett_commandline.hpp b/tools/util/include/cutlass/util/gett_commandline.hpp
index 9f2e5a29..be226446 100644
--- a/tools/util/include/cutlass/util/gett_commandline.hpp
+++ b/tools/util/include/cutlass/util/gett_commandline.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/helper_cuda.hpp b/tools/util/include/cutlass/util/helper_cuda.hpp
index 69bc1e2b..58d08b86 100644
--- a/tools/util/include/cutlass/util/helper_cuda.hpp
+++ b/tools/util/include/cutlass/util/helper_cuda.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h
index fe06c300..4e771805 100644
--- a/tools/util/include/cutlass/util/host_reorder.h
+++ b/tools/util/include/cutlass/util/host_reorder.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h
index 3f061875..3226055a 100644
--- a/tools/util/include/cutlass/util/host_tensor.h
+++ b/tools/util/include/cutlass/util/host_tensor.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
index 923aa49a..ca770e4d 100644
--- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h
+++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/host_uncompress.h b/tools/util/include/cutlass/util/host_uncompress.h
index ccf23934..9cd62927 100644
--- a/tools/util/include/cutlass/util/host_uncompress.h
+++ b/tools/util/include/cutlass/util/host_uncompress.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/index_sequence.h b/tools/util/include/cutlass/util/index_sequence.h
index 95628670..6b72b043 100644
--- a/tools/util/include/cutlass/util/index_sequence.h
+++ b/tools/util/include/cutlass/util/index_sequence.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/packed_stride.hpp b/tools/util/include/cutlass/util/packed_stride.hpp
index e9a243a1..811ba152 100644
--- a/tools/util/include/cutlass/util/packed_stride.hpp
+++ b/tools/util/include/cutlass/util/packed_stride.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/print_error.hpp b/tools/util/include/cutlass/util/print_error.hpp
index 9eed9d14..c38ad3f7 100644
--- a/tools/util/include/cutlass/util/print_error.hpp
+++ b/tools/util/include/cutlass/util/print_error.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h
index 2bce60b1..8167c91b 100644
--- a/tools/util/include/cutlass/util/reference/detail/inner_product.h
+++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
index 1f784c46..652d6225 100644
--- a/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
+++ b/tools/util/include/cutlass/util/reference/detail/linear_to_coordinate.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/convolution.h b/tools/util/include/cutlass/util/reference/device/convolution.h
index c91cd0e2..552a7a2e 100644
--- a/tools/util/include/cutlass/util/reference/device/convolution.h
+++ b/tools/util/include/cutlass/util/reference/device/convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h
index 1a1bd375..7d575d52 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
index b4d41bd2..bddf5962 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
index 37c103c3..48819cf6 100644
--- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/gett.hpp b/tools/util/include/cutlass/util/reference/device/gett.hpp
index 78586ad6..497a257d 100644
--- a/tools/util/include/cutlass/util/reference/device/gett.hpp
+++ b/tools/util/include/cutlass/util/reference/device/gett.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
index f7731213..6e131126 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
index c703f07f..149e4b2e 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
index a64a419d..3223cb20 100644
--- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h b/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
index d5892457..2e76fe52 100644
--- a/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
+++ b/tools/util/include/cutlass/util/reference/device/rank_2k_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
index e6b36990..f8b5395f 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h
index 059076d9..a19b4282 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
index 3911b024..b2e8d32a 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
index 47b898b4..c210d533 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_reduce.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h
index 4e5a5040..0e3d99dd 100644
--- a/tools/util/include/cutlass/util/reference/device/tensor_relu.h
+++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h
index 04775a74..dd11f96b 100644
--- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h
+++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/conv.hpp b/tools/util/include/cutlass/util/reference/host/conv.hpp
index 545dbba9..1056f4e9 100644
--- a/tools/util/include/cutlass/util/reference/host/conv.hpp
+++ b/tools/util/include/cutlass/util/reference/host/conv.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
@@ -55,8 +55,9 @@ template<class EngineAct, class LayoutAct>
 bool
 is_activation_in_bounds(
     cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_) {
-  return ((n_ >= 0 && n_ < size<4>(activation)) &&
+    int32_t n_, int32_t d_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {
+  return ((g_ >= 0 && g_ < size<5>(activation)) &&
+          (n_ >= 0 && n_ < size<4>(activation)) &&
           (d_ >= 0 && d_ < size<3>(activation)) &&
           (h_ >= 0 && h_ < size<2>(activation)) &&
           (w_ >= 0 && w_ < size<1>(activation)) &&
@@ -67,8 +68,9 @@ template<class EngineAct, class LayoutAct>
 bool
 is_activation_in_bounds(
     cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t h_, int32_t w_, int32_t c_) {
-  return ((n_ >= 0 && n_ < size<3>(activation)) &&
+    int32_t n_, int32_t h_, int32_t w_, int32_t c_, int32_t g_) {
+  return ((g_ >= 0 && g_ < size<4>(activation)) &&
+          (n_ >= 0 && n_ < size<3>(activation)) &&
           (h_ >= 0 && h_ < size<2>(activation)) &&
           (w_ >= 0 && w_ < size<1>(activation)) &&
           (c_ >= 0 && c_ < size<0>(activation)));
@@ -78,8 +80,9 @@ template<class EngineAct, class LayoutAct>
 bool
 is_activation_in_bounds(
     cute::Tensor<EngineAct, LayoutAct> const& activation,
-    int32_t n_, int32_t w_, int32_t c_) {
-  return ((n_ >= 0 && n_ < size<2>(activation)) &&
+    int32_t n_, int32_t w_, int32_t c_, int32_t g_) {
+  return ((g_ >= 0 && g_ < size<3>(activation)) &&
+          (n_ >= 0 && n_ < size<2>(activation)) &&
           (w_ >= 0 && w_ < size<1>(activation)) &&
           (c_ >= 0 && c_ < size<0>(activation)));
 }
@@ -196,6 +199,7 @@ struct ConvReferenceImpl {
 private:
   // Specialization for 1D fprop kernel
   void fprop_reference(cute::Int<1> spatial_dims) {
+    int32_t G = size<3>(tensor_d_);
     int32_t N = size<2>(tensor_d_);
     int32_t Q = size<1>(tensor_d_);
     int32_t K = size<0>(tensor_d_);
@@ -205,31 +209,33 @@ private:
 #if defined(_OPENMP)
   #pragma omp parallel for collapse(2)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t q = 0; q < Q; ++q) {
-        for (int32_t k = 0; k < K; ++k) {
-          auto accumulator = ElementAcc(0);
-          for (int32_t s = 0; s < S; ++s) {
-            for (int32_t c = 0; c < C; ++c) {
-              int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-              if (detail::is_activation_in_bounds(tensor_a_, n, w, c)) {
-                auto a = tensor_a_(c, w, n);
-                auto b = tensor_b_(c, s, k);
-                accumulator += ElementAcc(a * b);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t q = 0; q < Q; ++q) {
+          for (int32_t k = 0; k < K; ++k) {
+            auto accumulator = ElementAcc(0);
+            for (int32_t s = 0; s < S; ++s) {
+              for (int32_t c = 0; c < C; ++c) {
+                int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                if (detail::is_activation_in_bounds(tensor_a_, n, w, c, g)) {
+                  auto a = tensor_a_(c, w, n, g);
+                  auto b = tensor_b_(c, s, k, g);
+                  accumulator += ElementAcc(a * b);
+                }
               }
             }
+            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+              epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
+            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+              epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
+            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                    scale_converter(beta) * residual_converter(tensor_c_(k, q, n, g));
+            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+              output += bias_converter(epi_fusion_params_.tensor_bias[k]);
+            }
+            output = epi_activation(output);
+            tensor_d_(k, q, n, g) = output_converter(output);
           }
-          ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-            epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-          ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-            epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-          ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                  scale_converter(beta) * residual_converter(tensor_c_(k, q, n));
-          if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-            output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-          }
-          output = epi_activation(output);
-          tensor_d_(k, q, n) = output_converter(output);
         }
       }
     }
@@ -238,6 +244,7 @@ private:
 
   // Specialization for 2D fprop kernel
   void fprop_reference(cute::Int<2> spatial_dims) {
+    int32_t G = size<4>(tensor_d_);
     int32_t N = size<3>(tensor_d_);
     int32_t P = size<2>(tensor_d_);
     int32_t Q = size<1>(tensor_d_);
@@ -249,35 +256,37 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t p = 0; p < P; ++p) {
-        for (int32_t q = 0; q < Q; ++q) {
-          for (int32_t k = 0; k < K; ++k) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t r = 0; r < R; ++r) {
-              for (int32_t s = 0; s < S; ++s) {
-                for (int32_t c = 0; c < C; ++c) {
-                  int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-                  int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
-                  if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c)) {
-                    auto a = tensor_a_(c, w, h, n);
-                    auto b = tensor_b_(c, s, r, k);
-                    accumulator += ElementAcc(a * b);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t p = 0; p < P; ++p) {
+          for (int32_t q = 0; q < Q; ++q) {
+            for (int32_t k = 0; k < K; ++k) {
+              auto accumulator = ElementAcc(0);
+              for (int32_t r = 0; r < R; ++r) {
+                for (int32_t s = 0; s < S; ++s) {
+                  for (int32_t c = 0; c < C; ++c) {
+                    int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                    int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
+                    if (detail::is_activation_in_bounds(tensor_a_, n, h, w, c, g)) {
+                      auto a = tensor_a_(c, w, h, n, g);
+                      auto b = tensor_b_(c, s, r, k, g);
+                      accumulator += ElementAcc(a * b);
+                    }
                   }
                 }
               }
+              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+                epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
+              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+                epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
+              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                      scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n, g));
+              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                output += bias_converter(epi_fusion_params_.tensor_bias[k]);
+              }
+              output = epi_activation(output);
+              tensor_d_(k, q, p, n, g) = output_converter(output);
             }
-            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-              epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-              epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                    scale_converter(beta) * residual_converter(tensor_c_(k, q, p, n));
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-              output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-            }
-            output = epi_activation(output);
-            tensor_d_(k, q, p, n) = output_converter(output);
           }
         }
       }
@@ -287,6 +296,7 @@ private:
 
   // Specialization for 3D fprop kernel
   void fprop_reference(cute::Int<3> spatial_dims) {
+    int32_t G = size<5>(tensor_d_);
     int32_t N = size<4>(tensor_d_);
     int32_t Z = size<3>(tensor_d_);
     int32_t P = size<2>(tensor_d_);
@@ -300,39 +310,41 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t z = 0; z < Z; ++z) {
-        for (int32_t p = 0; p < P; ++p) {
-          for (int32_t q = 0; q < Q; ++q) {
-            for (int32_t k = 0; k < K; ++k) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t t = 0; t < T; ++t) {
-                for (int32_t r = 0; r < R; ++r) {
-                  for (int32_t s = 0; s < S; ++s) {
-                    for (int32_t c = 0; c < C; ++c) {
-                      int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-                      int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
-                      int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
-                      if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c)) {
-                        auto a = tensor_a_(c, w, h, d, n);
-                        auto b = tensor_b_(c, s, r, t, k);
-                        accumulator += ElementAcc(a * b);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t z = 0; z < Z; ++z) {
+          for (int32_t p = 0; p < P; ++p) {
+            for (int32_t q = 0; q < Q; ++q) {
+              for (int32_t k = 0; k < K; ++k) {
+                auto accumulator = ElementAcc(0);
+                for (int32_t t = 0; t < T; ++t) {
+                  for (int32_t r = 0; r < R; ++r) {
+                    for (int32_t s = 0; s < S; ++s) {
+                      for (int32_t c = 0; c < C; ++c) {
+                        int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                        int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
+                        int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
+                        if (detail::is_activation_in_bounds(tensor_a_, n, d, h, w, c, g)) {
+                          auto a = tensor_a_(c, w, h, d, n, g);
+                          auto b = tensor_b_(c, s, r, t, k, g);
+                          accumulator += ElementAcc(a * b);
+                        }
                       }
                     }
                   }
                 }
+                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+                  epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
+                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+                  epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
+                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                        scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n, g));
+                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                  output += bias_converter(epi_fusion_params_.tensor_bias[k]);
+                }
+                output = epi_activation(output);
+                tensor_d_(k, q, p, z, n, g) = output_converter(output);
               }
-              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-                epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-                epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                      scale_converter(beta) * residual_converter(tensor_c_(k, q, p, z, n));
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-                output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-              }
-              output = epi_activation(output);
-              tensor_d_(k, q, p, z, n) = output_converter(output);
             }
           }
         }
@@ -343,6 +355,7 @@ private:
 
   // Specialization for 1D dgrad kernel
   void dgrad_reference(cute::Int<1> spatial_dims) {
+    int32_t G = size<3>(tensor_d_);
     int32_t N = size<2>(tensor_d_);
     int32_t W = size<1>(tensor_d_);
     int32_t C = size<0>(tensor_d_);
@@ -352,36 +365,38 @@ private:
 #if defined(_OPENMP)
    #pragma omp parallel for collapse(2)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t w = 0; w < W; ++w) {
-        for (int32_t c = 0; c < C; ++c) {
-          auto accumulator = ElementAcc(0);
-          for (int32_t k = 0; k < K; ++k) {
-            for (int32_t s = 0; s < S; ++s) {
-              int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t w = 0; w < W; ++w) {
+          for (int32_t c = 0; c < C; ++c) {
+            auto accumulator = ElementAcc(0);
+            for (int32_t k = 0; k < K; ++k) {
+              for (int32_t s = 0; s < S; ++s) {
+                int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
 
-              if (q % cute::get<0>(tstride_) == 0) {
-                q /= cute::get<0>(tstride_);
-              } else {
-                continue;
-              }
+                if (q % cute::get<0>(tstride_) == 0) {
+                  q /= cute::get<0>(tstride_);
+                } else {
+                  continue;
+                }
 
-              if (detail::is_activation_in_bounds(tensor_a_, n, q, k)) {
-                accumulator += ElementAcc(tensor_a_(k, q, n) * tensor_b_(c, s, k));
+                if (detail::is_activation_in_bounds(tensor_a_, n, q, k, g)) {
+                  accumulator += ElementAcc(tensor_a_(k, q, n, g) * tensor_b_(c, s, k, g));
+                }
               }
             }
+            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
+              ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
+              ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                    scale_converter(beta) * residual_converter(tensor_c_(c, w, n, g));
+            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+              output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+            }
+            output = epi_activation(output);
+            tensor_d_(c, w, n, g) = output_converter(output);
           }
-          ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
-            ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-          ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
-            ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-          ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                  scale_converter(beta) * residual_converter(tensor_c_(c, w, n));
-          if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-            output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-          }
-          output = epi_activation(output);
-          tensor_d_(c, w, n) = output_converter(output);
         }
       }
     }
@@ -390,6 +405,7 @@ private:
 
   // Specialization for 2D dgrad kernel
   void dgrad_reference(cute::Int<2> spatial_dims) {
+    int32_t G = size<4>(tensor_d_);
     int32_t N = size<3>(tensor_d_);
     int32_t H = size<2>(tensor_d_);
     int32_t W = size<1>(tensor_d_);
@@ -401,47 +417,49 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t h = 0; h < H; ++h) {
-        for (int32_t w = 0; w < W; ++w) {
-          for (int32_t c = 0; c < C; ++c) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t k = 0; k < K; ++k) {
-              for (int32_t r = 0; r < R; ++r) {
-                for (int32_t s = 0; s < S; ++s) {
-                  int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
-                  int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t h = 0; h < H; ++h) {
+          for (int32_t w = 0; w < W; ++w) {
+            for (int32_t c = 0; c < C; ++c) {
+              auto accumulator = ElementAcc(0);
+              for (int32_t k = 0; k < K; ++k) {
+                for (int32_t r = 0; r < R; ++r) {
+                  for (int32_t s = 0; s < S; ++s) {
+                    int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
+                    int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
 
-                  if (q % cute::get<0>(tstride_) == 0) {
-                    q /= cute::get<0>(tstride_);
-                  } else {
-                    continue;
-                  }
+                    if (q % cute::get<0>(tstride_) == 0) {
+                      q /= cute::get<0>(tstride_);
+                    } else {
+                      continue;
+                    }
 
-                  if (p % cute::get<1>(tstride_) == 0) {
-                    p /= cute::get<1>(tstride_);
-                  } else {
-                    continue;
-                  }
+                    if (p % cute::get<1>(tstride_) == 0) {
+                      p /= cute::get<1>(tstride_);
+                    } else {
+                      continue;
+                    }
 
-                  if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k)) {
-                    accumulator += ElementAcc(tensor_a_(k, q, p, n) * tensor_b_(c, s, r, k));
+                    if (detail::is_activation_in_bounds(tensor_a_, n, p, q, k, g)) {
+                      accumulator += ElementAcc(tensor_a_(k, q, p, n, g) * tensor_b_(c, s, r, k, g));
+                    }
                   }
                 }
               }
-            }
-            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
-              ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
-              ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                    scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n));
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-              output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-            }
-            output = epi_activation(output);
+              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
+                ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
+                ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                      scale_converter(beta) * residual_converter(tensor_c_(c, w, h, n, g));
+              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+              }
+              output = epi_activation(output);
 
-            tensor_d_(c, w, h, n) = output_converter(output);
+              tensor_d_(c, w, h, n, g) = output_converter(output);
+            }
           }
         }
       }
@@ -451,6 +469,7 @@ private:
 
   // Specialization for 3D dgrad kernel
   void dgrad_reference(cute::Int<3> spatial_dims) {
+    int32_t G = size<5>(tensor_d_);
     int32_t N = size<4>(tensor_d_);
     int32_t D = size<3>(tensor_d_);
     int32_t H = size<2>(tensor_d_);
@@ -464,56 +483,58 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t n = 0; n < N; ++n) {
-      for (int32_t d = 0; d < D; ++d) {
-        for (int32_t h = 0; h < H; ++h) {
-          for (int32_t w = 0; w < W; ++w) {
-            for (int32_t c = 0; c < C; ++c) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t k = 0; k < K; ++k) {
-                for (int32_t t = 0; t < T; ++t) {
-                  for (int32_t r = 0; r < R; ++r) {
-                    for (int32_t s = 0; s < S; ++s) {
-                      int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
-                      int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
-                      int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t n = 0; n < N; ++n) {
+        for (int32_t d = 0; d < D; ++d) {
+          for (int32_t h = 0; h < H; ++h) {
+            for (int32_t w = 0; w < W; ++w) {
+              for (int32_t c = 0; c < C; ++c) {
+                auto accumulator = ElementAcc(0);
+                for (int32_t k = 0; k < K; ++k) {
+                  for (int32_t t = 0; t < T; ++t) {
+                    for (int32_t r = 0; r < R; ++r) {
+                      for (int32_t s = 0; s < S; ++s) {
+                        int32_t q = w + cute::get<0>(padding_) - s * cute::get<0>(dilation_);
+                        int32_t p = h + cute::get<1>(padding_) - r * cute::get<1>(dilation_);
+                        int32_t z = d + cute::get<2>(padding_) - t * cute::get<2>(dilation_);
 
-                      if (q % cute::get<0>(tstride_) == 0) {
-                        q /= cute::get<0>(tstride_);
-                      } else {
-                        continue;
-                      }
+                        if (q % cute::get<0>(tstride_) == 0) {
+                          q /= cute::get<0>(tstride_);
+                        } else {
+                          continue;
+                        }
 
-                      if (p % cute::get<1>(tstride_) == 0) {
-                        p /= cute::get<1>(tstride_);
-                      } else {
-                        continue;
-                      }
+                        if (p % cute::get<1>(tstride_) == 0) {
+                          p /= cute::get<1>(tstride_);
+                        } else {
+                          continue;
+                        }
 
-                      if (z % cute::get<2>(tstride_) == 0) {
-                        z /= cute::get<2>(tstride_);
-                      } else {
-                        continue;
-                      }
+                        if (z % cute::get<2>(tstride_) == 0) {
+                          z /= cute::get<2>(tstride_);
+                        } else {
+                          continue;
+                        }
 
-                      if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k)) {
-                        accumulator += ElementAcc(tensor_a_(k, q, p, z, n) * tensor_b_(c, s, r, t, k));
+                        if (detail::is_activation_in_bounds(tensor_a_, n, z, p, q, k, g)) {
+                          accumulator += ElementAcc(tensor_a_(k, q, p, z, n, g) * tensor_b_(c, s, r, t, k, g));
+                        }
                       }
                     }
                   }
                 }
+                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
+                  ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
+                  ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                        scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n, g));
+                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                  output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+                }
+                output = epi_activation(output);
+                tensor_d_(c, w, h, d, n, g) = output_converter(output);
               }
-              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data())
-                ? epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
-              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data())
-                ? epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                      scale_converter(beta) * residual_converter(tensor_c_(c, w, h, d, n));
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-                output += bias_converter(epi_fusion_params_.tensor_bias[c]);
-              }
-              output = epi_activation(output);
-              tensor_d_(c, w, h, d, n) = output_converter(output);
             }
           }
         }
@@ -524,6 +545,7 @@ private:
 
   // Specialization for 1D wgrad kernel
   void wgrad_reference(cute::Int<1> spatial_dims) {
+    int32_t G = size<3>(tensor_d_);
     int32_t N =
         size<2>(tensor_a_);
     int32_t Q =
@@ -536,35 +558,39 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(2)
 #endif
-    for (int32_t k = 0; k < K; ++k) {
-      ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-        epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-      ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-        epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-      for (int32_t s = 0; s < S; ++s) {
-        for (int32_t c = 0; c < C; ++c) {
-          auto accumulator = ElementAcc(0);
-          for (int32_t n = 0; n < N; ++n) {
-            for (int32_t q = 0; q < Q; ++q) {
-              int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-              bool is_in_bounds =
-                  detail::is_activation_in_bounds(tensor_b_, n, w, c);
-              if (is_in_bounds) {
-                auto act =
-                    tensor_b_(c, w, n);
-                auto xformed_act =
-                    tensor_a_(k, q, n);
-                accumulator += ElementAcc(act * xformed_act);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t k = 0; k < K; ++k) {
+        for (int32_t s = 0; s < S; ++s) {
+          for (int32_t c = 0; c < C; ++c) {
+            auto accumulator = ElementAcc(0);
+            for (int32_t n = 0; n < N; ++n) {
+              for (int32_t q = 0; q < Q; ++q) {
+                int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                bool is_in_bounds =
+                    detail::is_activation_in_bounds(tensor_b_, n, w, c, g);
+                if (is_in_bounds) {
+                  auto act =
+                      tensor_b_(c, w, n, g);
+                  auto xformed_act =
+                      tensor_a_(k, q, n, g);
+                  accumulator += ElementAcc(act * xformed_act);
+                }
               }
             }
+
+            ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+              epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+            ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+              epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+
+            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                    scale_converter(beta) * residual_converter(tensor_c_(c, s, k, g));
+            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+              output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+            }
+            output = epi_activation(output);
+            tensor_d_(c, s, k, g) = output_converter(output);
           }
-          ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                  scale_converter(beta) * residual_converter(tensor_c_(c, s, k));
-          if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-            output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-          }
-          output = epi_activation(output);
-          tensor_d_(c, s, k) = output_converter(output);
         }
       }
     }
@@ -572,6 +598,7 @@ private:
 
   // Specialization for 2D wgrad kernel
   void wgrad_reference(cute::Int<2> spatial_dims) {
+    int32_t G = size<4>(tensor_d_);
     int32_t N =
         size<3>(tensor_a_);
     int32_t P =
@@ -587,39 +614,43 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t k = 0; k < K; ++k) {
-      ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-        epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-      ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-        epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-      for (int32_t r = 0; r < R; ++r) {
-        for (int32_t s = 0; s < S; ++s) {
-          for (int32_t c = 0; c < C; ++c) {
-            auto accumulator = ElementAcc(0);
-            for (int32_t n = 0; n < N; ++n) {
-              for (int32_t p = 0; p < P; ++p) {
-                for (int32_t q = 0; q < Q; ++q) {
-                  int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-                  int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
-                  bool is_in_bounds =
-                      detail::is_activation_in_bounds(tensor_b_, n, h, w, c);
-                  if (is_in_bounds) {
-                    auto act =
-                        tensor_b_(c, w, h, n);
-                    auto xformed_act =
-                        tensor_a_(k, q, p, n);
-                    accumulator += ElementAcc(act * xformed_act);
+    for (int32_t g = 0; g < G; ++g) {
+      for (int32_t k = 0; k < K; ++k) {
+        for (int32_t r = 0; r < R; ++r) {
+          for (int32_t s = 0; s < S; ++s) {
+            for (int32_t c = 0; c < C; ++c) {
+              auto accumulator = ElementAcc(0);
+              for (int32_t n = 0; n < N; ++n) {
+                for (int32_t p = 0; p < P; ++p) {
+                  for (int32_t q = 0; q < Q; ++q) {
+                    int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                    int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
+                    bool is_in_bounds =
+                        detail::is_activation_in_bounds(tensor_b_, n, h, w, c, g);
+                    if (is_in_bounds) {
+                      auto act =
+                          tensor_b_(c, w, h, n, g);
+                      auto xformed_act =
+                          tensor_a_(k, q, p, n, g);
+                      accumulator += ElementAcc(act * xformed_act);
+                    }
                   }
                 }
               }
+
+              ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+                epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+              ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+                epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+
+              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                      scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k, g));
+              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+              }
+              output = epi_activation(output);
+              tensor_d_(c, s, r, k, g) = output_converter(output);
             }
-            ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                    scale_converter(beta) * residual_converter(tensor_c_(c, s, r, k));
-            if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-              output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-            }
-            output = epi_activation(output);
-            tensor_d_(c, s, r, k) = output_converter(output);
           }
         }
       }
@@ -628,6 +659,7 @@ private:
 
   // Specialization for 3D wgrad kernel
   void wgrad_reference(cute::Int<3> spatial_dims) {
+    int32_t G = size<5>(tensor_d_);
     int32_t N =
         size<4>(tensor_a_);
     int32_t Z =
@@ -646,43 +678,47 @@ private:
 #if defined(_OPENMP)
     #pragma omp parallel for collapse(3)
 #endif
-    for (int32_t k = 0; k < K; ++k) {
-      ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
-        epi_fusion_params_.tensor_alpha[k] : epi_fusion_params_.alpha;
-      ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
-        epi_fusion_params_.tensor_beta[k] : epi_fusion_params_.beta;
-      for (int32_t t = 0; t < T; ++t) {
-        for (int32_t r = 0; r < R; ++r) {
-          for (int32_t s = 0; s < S; ++s) {
-            for (int32_t c = 0; c < C; ++c) {
-              auto accumulator = ElementAcc(0);
-              for (int32_t n = 0; n < N; ++n) {
-                for (int32_t z = 0; z < Z; ++z) {
-                  for (int32_t p = 0; p < P; ++p) {
-                    for (int32_t q = 0; q < Q; ++q) {
-                      int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
-                      int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
-                      int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
-                      bool is_in_bounds =
-                          detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c);
-                      if (is_in_bounds) {
-                        auto act =
-                            tensor_b_(c, w, h, d, n);
-                        auto xformed_act =
-                            tensor_a_(k, q, p, z, n);
-                        accumulator += ElementAcc(act * xformed_act);
+    for (int32_t g = 0 ; g < G; ++g) {
+      for (int32_t k = 0; k < K; ++k) {
+        for (int32_t t = 0; t < T; ++t) {
+          for (int32_t r = 0; r < R; ++r) {
+            for (int32_t s = 0; s < S; ++s) {
+              for (int32_t c = 0; c < C; ++c) {
+                auto accumulator = ElementAcc(0);
+                for (int32_t n = 0; n < N; ++n) {
+                  for (int32_t z = 0; z < Z; ++z) {
+                    for (int32_t p = 0; p < P; ++p) {
+                      for (int32_t q = 0; q < Q; ++q) {
+                        int32_t w =  q * cute::get<0>(tstride_) - cute::get<0>(padding_) + s * cute::get<0>(dilation_);
+                        int32_t h =  p * cute::get<1>(tstride_) - cute::get<1>(padding_) + r * cute::get<1>(dilation_);
+                        int32_t d =  z * cute::get<2>(tstride_) - cute::get<2>(padding_) + t * cute::get<2>(dilation_);
+                        bool is_in_bounds =
+                            detail::is_activation_in_bounds(tensor_b_, n, d, h, w, c, g);
+                        if (is_in_bounds) {
+                          auto act =
+                              tensor_b_(c, w, h, d, n, g);
+                          auto xformed_act =
+                              tensor_a_(k, q, p, z, n, g);
+                          accumulator += ElementAcc(act * xformed_act);
+                        }
                       }
                     }
                   }
                 }
+
+                ElementScalar alpha = raw_pointer_cast(epi_fusion_params_.tensor_alpha.data()) ?
+                  epi_fusion_params_.tensor_alpha[c] : epi_fusion_params_.alpha;
+                ElementScalar beta = raw_pointer_cast(epi_fusion_params_.tensor_beta.data()) ?
+                  epi_fusion_params_.tensor_beta[c] : epi_fusion_params_.beta;
+
+                ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
+                                        scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k, g));
+                if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
+                  output += bias_converter(epi_fusion_params_.tensor_bias[c]);
+                }
+                output = epi_activation(output);
+                tensor_d_(c, s, r, t, k, g) = output_converter(output);
               }
-              ElementCompute output = scale_converter(alpha) * acc_converter(accumulator) +
-                                      scale_converter(beta) * residual_converter(tensor_c_(c, s, r, t, k));
-              if (raw_pointer_cast(epi_fusion_params_.tensor_bias.data())) {
-                output += bias_converter(epi_fusion_params_.tensor_bias[k]);
-              }
-              output = epi_activation(output);
-              tensor_d_(c, s, r, t, k) = output_converter(output);
             }
           }
         }
diff --git a/tools/util/include/cutlass/util/reference/host/convolution.h b/tools/util/include/cutlass/util/reference/host/convolution.h
index f28b4a65..73298e57 100644
--- a/tools/util/include/cutlass/util/reference/host/convolution.h
+++ b/tools/util/include/cutlass/util/reference/host/convolution.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/error_metrics.h b/tools/util/include/cutlass/util/reference/host/error_metrics.h
index 86db65cc..12ead833 100644
--- a/tools/util/include/cutlass/util/reference/host/error_metrics.h
+++ b/tools/util/include/cutlass/util/reference/host/error_metrics.h
@@ -1,6 +1,6 @@
 
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h
index 03888131..dc5a2be6 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h
index 92da343a..221a6040 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
index 094af8b3..507c37d9 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/gett.hpp b/tools/util/include/cutlass/util/reference/host/gett.hpp
index 184d7737..98ad45e9 100644
--- a/tools/util/include/cutlass/util/reference/host/gett.hpp
+++ b/tools/util/include/cutlass/util/reference/host/gett.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/rank_2k.h b/tools/util/include/cutlass/util/reference/host/rank_2k.h
index 2a99bc03..67867533 100644
--- a/tools/util/include/cutlass/util/reference/host/rank_2k.h
+++ b/tools/util/include/cutlass/util/reference/host/rank_2k.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
index 9e1ac76c..a7381016 100644
--- a/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/rank_2k_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/rank_k_complex.h b/tools/util/include/cutlass/util/reference/host/rank_k_complex.h
index 6f9d5dc4..1aad33fd 100644
--- a/tools/util/include/cutlass/util/reference/host/rank_k_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/rank_k_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/symm.h b/tools/util/include/cutlass/util/reference/host/symm.h
index a585caf7..34f9648f 100644
--- a/tools/util/include/cutlass/util/reference/host/symm.h
+++ b/tools/util/include/cutlass/util/reference/host/symm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/symm_complex.h b/tools/util/include/cutlass/util/reference/host/symm_complex.h
index 7a55bb39..79e146f6 100644
--- a/tools/util/include/cutlass/util/reference/host/symm_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/symm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
index df164a37..c243ba30 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp b/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp
index a1f3f5b1..27ef969b 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp
+++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
index 0b963b72..d2a43b12 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
index 42ce2183..5470df29 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
index 85c70e41..645902f7 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp b/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp
index 86a54e2e..1b3df239 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp
+++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
index 43ff1736..bcb1af99 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
index 8a724066..d44dda1f 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
index 048352ae..887c5680 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_reduce.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp b/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp
index 5ea51541..ea711466 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp
+++ b/tools/util/include/cutlass/util/reference/host/tensor_reduce.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/trmm.h b/tools/util/include/cutlass/util/reference/host/trmm.h
index 08b97925..09b1aff9 100644
--- a/tools/util/include/cutlass/util/reference/host/trmm.h
+++ b/tools/util/include/cutlass/util/reference/host/trmm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/reference/host/trmm_complex.h b/tools/util/include/cutlass/util/reference/host/trmm_complex.h
index 86e58a03..e8db2a4d 100644
--- a/tools/util/include/cutlass/util/reference/host/trmm_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/trmm_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h
index 4f6bdd68..0ce1d8a6 100644
--- a/tools/util/include/cutlass/util/tensor_view_io.h
+++ b/tools/util/include/cutlass/util/tensor_view_io.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without
diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h
index dec3168e..5dfbfe27 100644
--- a/tools/util/include/cutlass/util/type_traits.h
+++ b/tools/util/include/cutlass/util/type_traits.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Redistribution and use in source and binary forms, with or without