From 86931fef8538008a1a92036732b3eb7fe47b25d0 Mon Sep 17 00:00:00 2001 From: Andrew Kerr Date: Mon, 8 Jun 2020 16:17:35 -0700 Subject: [PATCH] CUTLASS 2.2 (#96) Adds support for NVIDIA Ampere Architecture features. CUDA 11 Toolkit recommended. --- CHANGELOG.md | 16 + CMakeLists.txt | 17 +- CONTRIBUTORS.md | 14 +- CUDA.cmake | 6 +- LICENSE.txt | 2 +- README.md | 98 +- cmake/nop.cu | 2 +- cuBLAS.cmake | 68 +- examples/00_basic_gemm/CMakeLists.txt | 2 +- examples/00_basic_gemm/basic_gemm.cu | 2 +- examples/01_cutlass_utilities/CMakeLists.txt | 2 +- .../01_cutlass_utilities/cutlass_utilities.cu | 2 +- examples/02_dump_reg_shmem/CMakeLists.txt | 2 +- examples/02_dump_reg_shmem/dump_reg_shmem.cu | 2 +- examples/03_visualize_layout/CMakeLists.txt | 2 +- examples/03_visualize_layout/options.h | 2 +- .../03_visualize_layout/register_layout.cu | 28 +- .../03_visualize_layout/register_layout.h | 2 +- .../03_visualize_layout/visualize_layout.cpp | 14 +- .../03_visualize_layout/visualize_layout.h | 2 +- examples/04_tile_iterator/CMakeLists.txt | 2 +- examples/04_tile_iterator/tile_iterator.cu | 2 +- examples/05_batched_gemm/CMakeLists.txt | 2 +- examples/05_batched_gemm/batched_gemm.cu | 2 +- examples/06_splitK_gemm/CMakeLists.txt | 2 +- examples/06_splitK_gemm/splitk_gemm.cu | 49 +- .../07_volta_tensorop_gemm/CMakeLists.txt | 2 +- .../volta_tensorop_gemm.cu | 46 +- .../08_turing_tensorop_gemm/CMakeLists.txt | 2 +- .../turing_tensorop_gemm.cu | 40 +- examples/10_planar_complex/planar_complex.cu | 12 +- .../planar_complex_array.cu | 12 +- examples/12_gemm_bias_relu/CMakeLists.txt | 27 + examples/12_gemm_bias_relu/gemm_bias_relu.cu | 282 ++ examples/13_fused_two_gemms/CMakeLists.txt | 33 + ...b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h | 190 ++ examples/13_fused_two_gemms/b2b_gemm_run.h | 608 ++++ .../b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h | 190 ++ .../b2b_interleaved_gemm_run.h | 633 +++++ examples/13_fused_two_gemms/device/b2b_gemm.h | 439 +++ examples/13_fused_two_gemms/fused_gemm.cu | 74 + examples/13_fused_two_gemms/kernel/b2b_gemm.h | 407 +++ .../kernel/default_b2b_gemm.h | 296 ++ .../threadblock/b2b_mma_base.h | 230 ++ .../threadblock/b2b_mma_pipelined.h | 509 ++++ .../threadblock/default_b2b_mma.h | 289 ++ examples/CMakeLists.txt | 4 +- include/cutlass/aligned_buffer.h | 2 +- include/cutlass/arch/arch.h | 6 +- include/cutlass/arch/cache_operation.h | 60 + include/cutlass/arch/memory.h | 262 +- include/cutlass/arch/memory_sm75.h | 70 +- include/cutlass/arch/memory_sm80.h | 238 ++ include/cutlass/arch/mma.h | 17 +- include/cutlass/arch/mma_sm50.h | 2 +- include/cutlass/arch/mma_sm60.h | 2 +- include/cutlass/arch/mma_sm61.h | 2 +- include/cutlass/arch/mma_sm70.h | 2 +- include/cutlass/arch/mma_sm75.h | 2 +- include/cutlass/arch/mma_sm80.h | 2091 ++++++++++++++ include/cutlass/arch/simd.h | 2 +- include/cutlass/arch/simd_sm60.h | 2 +- include/cutlass/arch/simd_sm61.h | 2 +- include/cutlass/arch/wmma.h | 2 +- include/cutlass/arch/wmma_sm70.h | 2 +- include/cutlass/arch/wmma_sm72.h | 2 +- include/cutlass/arch/wmma_sm75.h | 5 +- include/cutlass/array.h | 4 +- include/cutlass/array_subbyte.h | 2 +- include/cutlass/bfloat16.h | 461 ++++ include/cutlass/complex.h | 15 +- include/cutlass/coord.h | 26 +- include/cutlass/core_io.h | 68 +- include/cutlass/cutlass.h | 8 +- include/cutlass/device_kernel.h | 2 +- include/cutlass/epilogue/thread/activation.h | 119 + .../cutlass/epilogue/thread/conversion_op.h | 4 +- .../epilogue/thread/linear_combination.h | 24 +- .../thread/linear_combination_clamp.h | 106 +- 
.../linear_combination_planar_complex.h | 35 +- .../epilogue/thread/linear_combination_relu.h | 159 +- .../thread/linear_combination_sigmoid.h | 206 ++ .../cutlass/epilogue/thread/reduction_op.h | 2 +- .../default_epilogue_complex_tensor_op.h | 89 +- .../default_epilogue_planar_complex.h | 40 +- .../threadblock/default_epilogue_simt.h | 3 +- .../threadblock/default_epilogue_tensor_op.h | 216 +- .../default_epilogue_volta_tensor_op.h | 3 +- .../default_epilogue_wmma_tensor_op.h | 3 +- .../threadblock/default_thread_map_simt.h | 2 +- .../default_thread_map_tensor_op.h | 50 +- .../default_thread_map_volta_tensor_op.h | 2 +- .../default_thread_map_wmma_tensor_op.h | 2 +- .../threadblock/direct_epilogue_tensor_op.h | 2 +- .../cutlass/epilogue/threadblock/epilogue.h | 133 +- .../epilogue/threadblock/epilogue_base.h | 2 +- .../threadblock/epilogue_planar_complex.h | 2 +- .../epilogue/threadblock/epilogue_workspace.h | 2 +- .../threadblock/interleaved_epilogue.h | 2 +- .../threadblock/output_tile_thread_map.h | 63 +- .../threadblock/predicated_tile_iterator.h | 44 +- .../threadblock/shared_load_iterator.h | 22 +- .../threadblock/shared_load_iterator_mixed.h | 559 ++++ .../fragment_iterator_complex_tensor_op.h | 2 +- ...ment_iterator_gaussian_complex_tensor_op.h | 188 ++ .../epilogue/warp/fragment_iterator_simt.h | 2 +- .../warp/fragment_iterator_tensor_op.h | 2 +- .../warp/fragment_iterator_volta_tensor_op.h | 2 +- .../warp/fragment_iterator_wmma_tensor_op.h | 2 +- include/cutlass/epilogue/warp/simt_policy.h | 2 +- .../cutlass/epilogue/warp/tensor_op_policy.h | 2 +- .../epilogue/warp/tile_iterator_simt.h | 2 +- .../epilogue/warp/tile_iterator_tensor_op.h | 2 +- .../warp/tile_iterator_tensor_op_mixed.h | 675 +++++ .../warp/tile_iterator_volta_tensor_op.h | 2 +- .../warp/tile_iterator_wmma_tensor_op.h | 2 +- .../epilogue/warp/volta_tensor_op_policy.h | 2 +- .../epilogue/warp/wmma_tensor_op_policy.h | 2 +- include/cutlass/fast_math.h | 2 +- include/cutlass/functional.h | 218 +- .../gemm/device/default_gemm_configuration.h | 338 ++- include/cutlass/gemm/device/gemm.h | 4 +- include/cutlass/gemm/device/gemm_array.h | 2 +- include/cutlass/gemm/device/gemm_batched.h | 2 +- include/cutlass/gemm/device/gemm_complex.h | 8 +- .../gemm/device/gemm_splitk_parallel.h | 2 +- include/cutlass/gemm/device/gemm_universal.h | 4 +- .../gemm/device/gemm_universal_adapter.h | 116 +- .../cutlass/gemm/device/gemm_universal_base.h | 2 +- include/cutlass/gemm/gemm.h | 5 +- include/cutlass/gemm/kernel/default_gemm.h | 209 +- .../gemm/kernel/default_gemm_complex.h | 63 +- .../default_gemm_planar_complex_universal.h | 119 +- .../kernel/default_gemm_splitk_parallel.h | 2 +- .../gemm/kernel/default_gemm_universal.h | 2 +- include/cutlass/gemm/kernel/default_gemv.h | 2 +- include/cutlass/gemm/kernel/gemm.h | 2 +- include/cutlass/gemm/kernel/gemm_array.h | 2 +- include/cutlass/gemm/kernel/gemm_batched.h | 2 +- include/cutlass/gemm/kernel/gemm_pipelined.h | 2 +- .../cutlass/gemm/kernel/gemm_planar_complex.h | 9 +- .../gemm/kernel/gemm_planar_complex_array.h | 10 +- .../gemm/kernel/gemm_splitk_parallel.h | 2 +- include/cutlass/gemm/kernel/gemm_universal.h | 14 +- .../gemm/kernel/gemv_batched_strided.h | 2 +- include/cutlass/gemm/thread/mma.h | 2 +- include/cutlass/gemm/thread/mma_sm50.h | 2 +- include/cutlass/gemm/thread/mma_sm60.h | 2 +- include/cutlass/gemm/thread/mma_sm61.h | 2 +- .../gemm/threadblock/default_gemv_core.h | 2 +- .../cutlass/gemm/threadblock/default_mma.h | 264 +- .../gemm/threadblock/default_mma_core.h | 15 +- 
.../gemm/threadblock/default_mma_core_simt.h | 2 +- .../gemm/threadblock/default_mma_core_sm50.h | 2 +- .../gemm/threadblock/default_mma_core_sm70.h | 2 +- .../gemm/threadblock/default_mma_core_sm75.h | 519 +++- .../gemm/threadblock/default_mma_core_sm80.h | 2130 ++++++++++++++ .../gemm/threadblock/default_mma_core_wmma.h | 2 +- .../default_mma_planar_complex_multistage.h | 130 + .../default_multistage_mma_complex.h | 154 ++ .../default_multistage_mma_complex_core.h | 113 + ...default_multistage_mma_complex_core_sm80.h | 1113 ++++++++ include/cutlass/gemm/threadblock/gemv.h | 140 - include/cutlass/gemm/threadblock/mma_base.h | 2 +- .../cutlass/gemm/threadblock/mma_multistage.h | 526 ++++ .../cutlass/gemm/threadblock/mma_pipelined.h | 4 +- .../threadblock/mma_planar_complex_base.h | 2 +- .../mma_planar_complex_multistage.h | 642 +++++ .../gemm/threadblock/mma_singlestage.h | 2 +- .../gemm/threadblock/threadblock_swizzle.h | 67 +- .../gemm/warp/default_mma_complex_tensor_op.h | 401 +++ .../cutlass/gemm/warp/default_mma_tensor_op.h | 16 +- .../gemm/warp/default_mma_tensor_op_sm80.h | 186 ++ .../gemm/warp/default_mma_wmma_tensor_op.h | 14 +- include/cutlass/gemm/warp/mma.h | 2 +- .../cutlass/gemm/warp/mma_complex_tensor_op.h | 843 ++++++ ...mma_complex_tensor_op_tile_iterator_sm80.h | 2448 +++++++++++++++++ .../warp/mma_gaussian_complex_tensor_op.h | 357 +++ ...ian_complex_tensor_op_tile_iterator_sm80.h | 384 +++ include/cutlass/gemm/warp/mma_simt.h | 5 +- include/cutlass/gemm/warp/mma_simt_policy.h | 2 +- .../gemm/warp/mma_simt_tile_iterator.h | 2 +- include/cutlass/gemm/warp/mma_tensor_op.h | 105 +- .../warp/mma_tensor_op_fragment_iterator.h | 428 +++ .../cutlass/gemm/warp/mma_tensor_op_policy.h | 2 +- .../cutlass/gemm/warp/mma_tensor_op_sm70.h | 8 +- .../gemm/warp/mma_tensor_op_tile_iterator.h | 519 +++- .../warp/mma_tensor_op_tile_iterator_sm70.h | 2 +- .../warp/mma_tensor_op_tile_iterator_sm80.h | 1579 +++++++++++ .../warp/mma_tensor_op_tile_iterator_wmma.h | 2 +- .../cutlass/gemm/warp/mma_tensor_op_wmma.h | 21 +- include/cutlass/half.h | 2 +- include/cutlass/integer_subbyte.h | 2 +- include/cutlass/kernel_launch.h | 2 +- include/cutlass/layout/layout.h | 2 +- include/cutlass/layout/matrix.h | 2 +- include/cutlass/layout/pitch_linear.h | 2 +- include/cutlass/layout/tensor.h | 2 +- .../layout/tensor_op_multiplicand_sm70.h | 2 +- .../layout/tensor_op_multiplicand_sm75.h | 2 +- .../layout/tensor_op_multiplicand_sm80.h | 1133 ++++++++ include/cutlass/layout/vector.h | 2 +- include/cutlass/matrix_coord.h | 2 +- include/cutlass/matrix_shape.h | 2 +- include/cutlass/matrix_traits.h | 2 +- include/cutlass/numeric_conversion.h | 309 ++- include/cutlass/numeric_types.h | 6 +- include/cutlass/platform/platform.h | 2 +- include/cutlass/predicate_vector.h | 2 +- include/cutlass/real.h | 3 +- include/cutlass/reduction/batched_reduction.h | 2 +- .../reduction/batched_reduction_traits.h | 2 +- .../cutlass/reduction/device/reduce_split_k.h | 215 ++ .../cutlass/reduction/kernel/reduce_split_k.h | 10 +- include/cutlass/reduction/thread/reduce.h | 2 +- .../reduction/thread/reduction_operators.h | 2 +- .../cutlass/reduction/threadblock_swizzle.h | 2 +- include/cutlass/relatively_equal.h | 24 +- include/cutlass/semaphore.h | 10 +- include/cutlass/subbyte_reference.h | 2 +- include/cutlass/tensor_coord.h | 2 +- include/cutlass/tensor_ref.h | 2 +- include/cutlass/tensor_view.h | 14 +- include/cutlass/tfloat32.h | 453 +++ include/cutlass/thread/matrix.h | 2 +- .../transform/pitch_linear_thread_map.h | 2 +- 
include/cutlass/transform/thread/transpose.h | 2 +- include/cutlass/transform/thread/unaryOp.h | 101 + .../predicated_tile_access_iterator.h | 3 +- ...icated_tile_access_iterator_2dthreadtile.h | 2 +- .../threadblock/predicated_tile_iterator.h | 10 +- .../predicated_tile_iterator_2dthreadtile.h | 2 +- .../regular_tile_access_iterator.h | 2 +- ...egular_tile_access_iterator_pitch_linear.h | 2 +- .../regular_tile_access_iterator_tensor_op.h | 265 +- ...ular_tile_access_iterator_tensor_op_sm80.h | 1522 ++++++++++ .../threadblock/regular_tile_iterator.h | 2 +- .../regular_tile_iterator_pitch_linear.h | 2 +- ..._tile_iterator_pitch_linear_2dthreadtile.h | 2 +- .../regular_tile_iterator_tensor_op.h | 265 +- .../regular_tile_iterator_tensor_op_sm70.h | 2 +- include/cutlass/util/debug.h | 122 - include/cutlass/wmma_array.h | 2 +- media/docs/code_organization.md | 11 +- media/docs/doxygen_mainpage.md | 2 +- media/docs/efficient_gemm.md | 3 +- media/docs/functionality.md | 65 +- media/docs/fundamental_types.md | 37 +- media/docs/gemm_api.md | 2 +- media/docs/layout.md | 2 +- media/docs/profiler.md | 78 +- media/docs/programming_guidelines.md | 25 +- media/docs/quickstart.md | 90 +- media/docs/terminology.md | 2 +- media/docs/tile_iterator_concept.md | 2 +- media/docs/utilities.md | 11 +- media/images/cutlass-performance-plot.png | Bin 98106 -> 69902 bytes ...gemm-hierarchy-with-epilogue-no-labels.png | Bin 132936 -> 184294 bytes test/CMakeLists.txt | 2 +- test/unit/CMakeLists.txt | 5 +- test/unit/common/cutlass_unit_test.h | 2 +- test/unit/common/filter_architecture.cpp | 3 +- test/unit/core/CMakeLists.txt | 4 +- test/unit/core/array.cu | 10 +- test/unit/core/bfloat16.cu | 209 ++ test/unit/core/complex.cu | 2 +- test/unit/core/functional.cu | 12 +- test/unit/core/half.cu | 2 +- test/unit/core/matrix_coord.cu | 2 +- test/unit/core/numeric_conversion.cu | 2 +- test/unit/core/predicate_vector.cu | 2 +- test/unit/core/tensor_ref.cu | 2 +- test/unit/core/tensor_view.cu | 2 +- test/unit/core/test_unit_core.cpp | 2 +- test/unit/core/tfloat32.cu | 197 ++ test/unit/epilogue/CMakeLists.txt | 2 +- test/unit/epilogue/thread/CMakeLists.txt | 2 +- .../epilogue/thread/linear_combination.cu | 2 +- .../linear_combination_planar_complex.cu | 2 +- test/unit/epilogue/threadblock/CMakeLists.txt | 2 +- .../threadblock/epilogue_planar_complex.cu | 2 +- .../epilogue/threadblock/epilogue_simt.cu | 2 +- .../threadblock/epilogue_simt_sm60.cu | 2 +- .../threadblock/epilogue_simt_sm61.cu | 2 +- .../threadblock/epilogue_tensor_op.cu | 304 +- .../threadblock/epilogue_volta_tensor_op.cu | 2 +- .../epilogue_wmma_tensor_op_sm70.cu | 2 +- .../threadblock/output_tile_threadmap.cu | 2 +- .../threadblock/predicated_tile_iterator.cu | 2 +- test/unit/epilogue/threadblock/testbed.h | 2 +- .../threadblock/testbed_planar_complex.h | 2 +- test/unit/epilogue/warp/CMakeLists.txt | 2 +- .../warp/fragment_iterator_tensor_op.cu | 2 +- .../warp/fragment_iterator_volta_tensor_op.cu | 2 +- .../warp/fragment_iterator_wmma_tensor_op.cu | 2 +- test/unit/gemm/CMakeLists.txt | 2 +- test/unit/gemm/device/CMakeLists.txt | 61 +- .../gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu | 373 +++ ...mm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu | 374 +++ ...mm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu | 14 +- ...emm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu | 353 +++ ...mm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu | 337 +++ 
...32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu | 253 ++ ...32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu | 252 ++ ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu | 192 ++ ...mm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu | 246 ++ ...cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu | 191 ++ ...mm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu | 299 ++ ..._f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu | 338 +++ ...f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- .../gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu | 337 +++ ..._f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu | 340 +++ ...f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ...6n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu | 4 +- ...6n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu | 82 + .../gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu | 14 +- .../gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu | 338 +++ .../gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu | 77 + ...f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu | 16 +- ..._f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu | 4 +- ..._f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu | 339 +++ ...f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ...16n_singlestage_wmma_tensor_op_f16_sm70.cu | 18 +- ..._f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ...16t_singlestage_wmma_tensor_op_f16_sm70.cu | 18 +- ...6t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu | 4 +- ...6t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu | 83 + .../gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu | 14 +- .../gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu | 339 +++ ...f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu | 16 +- ..._f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- ..._f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- ...32t_singlestage_wmma_tensor_op_f32_sm70.cu | 12 +- .../gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu | 338 +++ ...f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu | 16 +- ..._f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu | 8 +- ..._f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu | 8 +- ..._f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu | 24 +- ..._f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu | 24 +- .../gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu | 338 +++ ..._f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu | 8 +- .../gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu | 14 +- .../gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu | 338 +++ ...f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu | 14 +- ..._f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu | 20 +- ..._f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu | 87 + .../gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu | 82 + 
.../gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu | 212 ++ .../gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu | 212 ++ ...anar_complex_f16_f16_f32_tensor_op_sm70.cu | 6 +- ...anar_complex_f16_f16_f32_tensor_op_sm75.cu | 217 ++ ...anar_complex_f16_f16_f32_tensor_op_sm80.cu | 216 ++ .../gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu | 10 +- .../gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu | 213 ++ .../gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu | 354 +++ ...mm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu | 357 +++ ...mm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu | 2 +- .../gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu | 16 +- .../gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu | 361 +++ .../gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu | 14 +- ...mm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu | 8 +- .../gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu | 355 +++ ...mm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu | 368 +++ ...emm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu | 14 +- .../gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu | 368 +++ ...emm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu | 10 +- .../gemm_splitk_serial_tensor_op_sm75.cu | 4 +- .../unit/gemm/device/gemm_splitk_simt_sm50.cu | 2 +- .../gemm/device/gemm_splitk_tensor_op_sm70.cu | 4 +- .../gemm/device/gemm_splitk_tensor_op_sm75.cu | 4 +- ...emm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu | 549 ++++ ...emm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu | 549 ++++ ...emm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu | 487 ++++ ...emm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu | 550 ++++ ...mm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu | 10 +- ...al_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu | 193 ++ ...cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu | 194 ++ ...al_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu | 194 ++ ...ersal_f16n_f16t_f32t_tensor_op_f32_sm80.cu | 111 + test/unit/gemm/device/multistage_testbed.h | 251 ++ .../device/multistage_testbed_interleaved.h | 303 ++ test/unit/gemm/device/simt_cgemm_nn_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_nt_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_tn_sm50.cu | 86 +- test/unit/gemm/device/simt_cgemm_tt_sm50.cu | 86 +- test/unit/gemm/device/simt_dgemm_nn_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_nt_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_tn_sm50.cu | 74 +- test/unit/gemm/device/simt_dgemm_tt_sm50.cu | 74 +- test/unit/gemm/device/simt_hgemm_nn_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_nt_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_tn_sm50.cu | 144 +- test/unit/gemm/device/simt_hgemm_tt_sm50.cu | 144 +- test/unit/gemm/device/simt_igemm_nn_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_nt_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_tn_sm50.cu | 116 +- test/unit/gemm/device/simt_igemm_tt_sm50.cu | 116 +- test/unit/gemm/device/simt_int8_igemm_sm61.cu | 4 +- .../gemm/device/simt_int8_igemm_sm61_perf.cu | 10 +- .../device/simt_int8_igemm_sm61_sliced_k.cu | 18 +- test/unit/gemm/device/simt_sgemm_nn_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_nt_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_nt_sm80.cu | 249 ++ test/unit/gemm/device/simt_sgemm_tn_sm50.cu | 116 +- test/unit/gemm/device/simt_sgemm_tn_sm80.cu | 249 
++ test/unit/gemm/device/simt_sgemm_tt_sm50.cu | 116 +- test/unit/gemm/device/simt_sm50.py | 4 +- test/unit/gemm/device/simt_zgemm_nn_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_nt_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_tn_sm50.cu | 52 +- test/unit/gemm/device/simt_zgemm_tt_sm50.cu | 52 +- test/unit/gemm/device/testbed.h | 2 +- test/unit/gemm/device/testbed_complex.h | 2 +- test/unit/gemm/device/testbed_interleaved.h | 2 +- .../unit/gemm/device/testbed_planar_complex.h | 2 +- test/unit/gemm/device/testbed_sanity.h | 233 ++ test/unit/gemm/device/testbed_splitk.h | 2 +- test/unit/gemm/device/testbed_universal.h | 2 +- test/unit/gemm/device/testbed_utils.h | 3 +- test/unit/gemm/thread/CMakeLists.txt | 2 +- test/unit/gemm/thread/gemm_sm50.cu | 2 +- test/unit/gemm/thread/gemm_sm60.cu | 2 +- test/unit/gemm/thread/gemm_sm61.cu | 2 +- test/unit/gemm/thread/host/CMakeLists.txt | 2 +- test/unit/gemm/thread/host/gemm_sm60_host.cu | 2 +- test/unit/gemm/thread/host/testbed_host.h | 2 +- test/unit/gemm/thread/testbed.h | 2 +- test/unit/gemm/threadblock/CMakeLists.txt | 2 +- test/unit/gemm/threadblock/batched_gemv.cu | 2 +- .../gemm/threadblock/epilogue_workspace.cu | 2 +- .../gemm/threadblock/mma_pipelined_simt.cu | 2 +- .../gemm/threadblock/mma_pipelined_sm70.cu | 2 +- .../gemm/threadblock/mma_pipelined_sm75.cu | 337 ++- .../gemm/threadblock/mma_pipelined_testbed.h | 2 +- .../threadblock/mma_pipelined_wmma_sm70.cu | 2 +- .../threadblock/mma_pipelined_wmma_sm75.cu | 2 +- .../threadblock/mma_planar_complex_testbed.h | 2 +- .../threadblock/mma_singlestage_wmma_sm70.cu | 2 +- .../threadblock/mma_singlestage_wmma_sm75.cu | 2 +- test/unit/gemm/warp/CMakeLists.txt | 5 +- test/unit/gemm/warp/gemm_complex_sm80.cu | 635 +++++ .../gemm/warp/gemm_gaussian_complex_sm80.cu | 281 ++ test/unit/gemm/warp/gemm_sm50.cu | 2 +- test/unit/gemm/warp/gemm_sm60.cu | 2 +- test/unit/gemm/warp/gemm_sm61.cu | 2 +- test/unit/gemm/warp/gemm_sm70.cu | 2 +- test/unit/gemm/warp/gemm_sm75.cu | 6 +- test/unit/gemm/warp/gemm_sm80.cu | 1782 ++++++++++++ test/unit/gemm/warp/testbed.h | 3 +- test/unit/gemm/warp/wmma_sm70.cu | 2 +- test/unit/gemm/warp/wmma_sm72.cu | 2 +- test/unit/gemm/warp/wmma_sm75.cu | 2 +- test/unit/layout/CMakeLists.txt | 2 +- test/unit/layout/matrix.cu | 2 +- test/unit/layout/tensor.cu | 2 +- test/unit/layout/tensor_nhwc.cu | 2 +- test/unit/nvrtc/CMakeLists.txt | 2 +- test/unit/nvrtc/cutlass/nvrtc/environment.h | 2 +- .../unit/nvrtc/kernel/thread/testbed_kernel.h | 2 +- test/unit/nvrtc/stdlib/stdint.h | 2 +- test/unit/nvrtc/thread/CMakeLists.txt | 2 +- test/unit/nvrtc/thread/gemm_nvrtc.cu | 2 +- test/unit/nvrtc/thread/testbed.h | 2 +- test/unit/reduction/CMakeLists.txt | 2 +- test/unit/reduction/kernel/CMakeLists.txt | 2 +- test/unit/reduction/kernel/reduce_splitk.cu | 2 +- .../reduction/kernel/reduce_splitk_testbed.h | 2 +- test/unit/reduction/thread/CMakeLists.txt | 2 +- .../unit/reduction/thread/reduction_thread.cu | 2 +- test/unit/reduction/thread/testbed.h | 2 +- test/unit/test_unit.cpp | 2 +- test/unit/transform/CMakeLists.txt | 2 +- .../unit/transform/threadblock/CMakeLists.txt | 2 +- .../threadblock/predicated_tile_iterator.cu | 2 +- .../regular_tile_iterator_tensor_op.cu | 2 +- test/unit/util/complex.cu | 2 +- tools/CMakeLists.txt | 2 +- tools/library/CMakeLists.txt | 6 +- .../library/include/cutlass/library/handle.h | 60 +- .../library/include/cutlass/library/library.h | 112 +- .../include/cutlass/library/manifest.h | 7 + .../include/cutlass/library/operation_table.h | 69 +- 
tools/library/include/cutlass/library/util.h | 11 + tools/library/scripts/gemm_operation.py | 151 +- tools/library/scripts/generator.py | 868 +++++- tools/library/scripts/library.py | 178 +- tools/library/scripts/manifest.py | 2 +- tools/library/src/gemm_operation.h | 204 +- tools/library/src/handle.cu | 213 +- tools/library/src/library_internal.h | 36 +- tools/library/src/manifest.cpp | 16 +- tools/library/src/operation_table.cu | 114 +- tools/library/src/util.cu | 204 +- tools/profiler/CMakeLists.txt | 2 +- tools/profiler/src/cublas_helpers.cpp | 136 +- tools/profiler/src/cublas_helpers.h | 70 +- tools/profiler/src/cutlass_profiler.cu | 13 +- tools/profiler/src/cutlass_profiler.h | 3 +- tools/profiler/src/debug.h | 2 +- tools/profiler/src/device_allocation.cu | 49 +- tools/profiler/src/device_allocation.h | 2 +- tools/profiler/src/device_context.cu | 2 +- tools/profiler/src/device_context.h | 2 +- tools/profiler/src/enumerated_types.cpp | 2 +- tools/profiler/src/enumerated_types.h | 4 +- tools/profiler/src/gemm_operation_profiler.cu | 210 +- tools/profiler/src/gemm_operation_profiler.h | 21 +- tools/profiler/src/gpu_timer.cpp | 2 +- tools/profiler/src/gpu_timer.h | 2 +- tools/profiler/src/main.cpp | 2 +- tools/profiler/src/operation_profiler.cu | 119 +- tools/profiler/src/operation_profiler.h | 41 +- tools/profiler/src/options.cu | 124 +- tools/profiler/src/options.h | 2 +- tools/profiler/src/performance_report.cpp | 43 +- tools/profiler/src/performance_report.h | 2 +- tools/profiler/src/performance_result.cu | 55 + tools/profiler/src/performance_result.h | 5 +- tools/profiler/src/problem_space.cpp | 44 +- tools/profiler/src/problem_space.h | 13 +- tools/util/CMakeLists.txt | 2 +- .../util/include/cutlass/util/command_line.h | 2 +- tools/util/include/cutlass/util/debug.h | 2 +- tools/util/include/cutlass/util/device_dump.h | 2 +- .../util/include/cutlass/util/device_memory.h | 2 +- .../util/include/cutlass/util/distribution.h | 2 +- tools/util/include/cutlass/util/exceptions.h | 2 +- .../util/include/cutlass/util/host_reorder.h | 2 +- tools/util/include/cutlass/util/host_tensor.h | 2 +- .../cutlass/util/host_tensor_planar_complex.h | 2 +- .../util/reference/detail/inner_product.h | 2 +- .../cutlass/util/reference/device/gemm.h | 2 +- .../reference/device/gemm_planar_complex.h | 2 +- .../util/reference/device/kernel/gemm.h | 2 +- .../device/kernel/tensor_elementwise.h | 2 +- .../reference/device/kernel/tensor_foreach.h | 2 +- .../util/reference/device/tensor_compare.h | 2 +- .../util/reference/device/tensor_fill.h | 2 +- .../util/reference/device/tensor_foreach.h | 2 +- .../util/reference/device/tensor_relu.h | 135 + .../util/reference/device/thread/gemm.h | 2 +- .../cutlass/util/reference/host/gemm.h | 37 +- .../util/reference/host/gemm_complex.h | 2 +- .../util/reference/host/gemm_planar_complex.h | 2 +- .../util/reference/host/tensor_compare.h | 2 +- .../cutlass/util/reference/host/tensor_copy.h | 2 +- .../util/reference/host/tensor_elementwise.h | 2 +- .../cutlass/util/reference/host/tensor_fill.h | 2 +- .../util/reference/host/tensor_foreach.h | 2 +- .../cutlass/util/reference/host/tensor_norm.h | 2 +- .../include/cutlass/util/tensor_view_io.h | 2 +- tools/util/include/cutlass/util/type_traits.h | 2 +- 584 files changed, 51080 insertions(+), 3373 deletions(-) create mode 100644 examples/12_gemm_bias_relu/CMakeLists.txt create mode 100644 examples/12_gemm_bias_relu/gemm_bias_relu.cu create mode 100644 examples/13_fused_two_gemms/CMakeLists.txt create mode 100644 
examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h create mode 100644 examples/13_fused_two_gemms/b2b_gemm_run.h create mode 100644 examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h create mode 100644 examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h create mode 100644 examples/13_fused_two_gemms/device/b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/fused_gemm.cu create mode 100644 examples/13_fused_two_gemms/kernel/b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/kernel/default_b2b_gemm.h create mode 100644 examples/13_fused_two_gemms/threadblock/b2b_mma_base.h create mode 100644 examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h create mode 100644 examples/13_fused_two_gemms/threadblock/default_b2b_mma.h create mode 100644 include/cutlass/arch/cache_operation.h create mode 100644 include/cutlass/arch/memory_sm80.h create mode 100644 include/cutlass/arch/mma_sm80.h create mode 100644 include/cutlass/bfloat16.h create mode 100644 include/cutlass/epilogue/thread/activation.h create mode 100644 include/cutlass/epilogue/thread/linear_combination_sigmoid.h create mode 100644 include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h create mode 100644 include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h create mode 100644 include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h create mode 100644 include/cutlass/gemm/threadblock/default_mma_core_sm80.h create mode 100644 include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h create mode 100644 include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h delete mode 100755 include/cutlass/gemm/threadblock/gemv.h create mode 100644 include/cutlass/gemm/threadblock/mma_multistage.h create mode 100644 include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h create mode 100644 include/cutlass/gemm/warp/default_mma_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h create mode 100644 include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h create mode 100644 include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h create mode 100644 include/cutlass/layout/tensor_op_multiplicand_sm80.h create mode 100644 include/cutlass/reduction/device/reduce_split_k.h create mode 100644 include/cutlass/tfloat32.h create mode 100644 include/cutlass/transform/thread/unaryOp.h create mode 100644 include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h delete mode 100644 include/cutlass/util/debug.h create mode 100644 test/unit/core/bfloat16.cu create mode 100644 test/unit/core/tfloat32.cu create mode 100644 test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu create mode 100644 
test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu create mode 100644 test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu create mode 100644 test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu create mode 100644 test/unit/gemm/device/multistage_testbed.h create mode 100644 test/unit/gemm/device/multistage_testbed_interleaved.h create mode 100644 test/unit/gemm/device/simt_sgemm_nt_sm80.cu create 
mode 100644 test/unit/gemm/device/simt_sgemm_tn_sm80.cu
 create mode 100644 test/unit/gemm/device/testbed_sanity.h
 create mode 100644 test/unit/gemm/warp/gemm_complex_sm80.cu
 create mode 100644 test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
 create mode 100644 test/unit/gemm/warp/gemm_sm80.cu
 create mode 100644 tools/profiler/src/performance_result.cu
 create mode 100644 tools/util/include/cutlass/util/reference/device/tensor_relu.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 367d6935..b92893e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,22 @@
 # CUTLASS 2.x

+## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08)
+ * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
+ * Fast Tensor Core operations:
+   * Maximum performance via [`mma.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends)
+   * Tensor Float 32, BFloat16, and double-precision data types
+   * Mixed integer data types (int8, int4, bin1)
+ * Asynchronous copy for deep software pipelines via [`cp.async`](https://docs.nvidia.com/cuda/parallel-thread-execution)
+ * Features:
+   * SDK examples showing GEMM fused with bias+relu and fused GEMM+GEMM
+   * Complex-valued GEMMs targeting NVIDIA Ampere Tensor Cores in double-precision and Tensor Float 32
+   * Gaussian complex GEMMs using 3m complex multiply algorithm
+   * Universal GEMM kernel supporting two batch modes and two algorithms for parallel reductions
+ * Policy updates:
+   * [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit) needed to enable NVIDIA Ampere Architecture features
+   * Disabled F16C by default for compatibility - enable on cmake command line with `-DCUTLASS_ENABLE_F16C=ON`
+
 ## [2.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.1.0) (2020-04-06)
  * BLAS-style host-side API added to [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
  * API to launch compiled kernel instances for GEMM and planar complex GEMM

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1b7bbc48..b6583747 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
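The Tensor Float 32 path called out in the changelog above is reached through the same device-level GEMM template used for existing data types. A minimal host-side sketch, assuming CUDA 11, an SM80 device, the default tile shapes, and caller-allocated, suitably aligned device pointers `A`, `B`, `C`, `D`:

```
#include "cutlass/gemm/device/gemm.h"

// TF32 inputs, FP32 accumulation, targeting NVIDIA Ampere Tensor Cores (SM80).
using Gemm = cutlass::gemm::device::Gemm<
    cutlass::tfloat32_t, cutlass::layout::ColumnMajor,   // A
    cutlass::tfloat32_t, cutlass::layout::ColumnMajor,   // B
    float,               cutlass::layout::ColumnMajor,   // C and D
    float,                                                // accumulator
    cutlass::arch::OpClassTensorOp,                       // issue Tensor Core mma.sync ops
    cutlass::arch::Sm80>;                                 // SM80 kernel (requires CUDA 11)

cutlass::Status run_gemm(int m, int n, int k, float alpha, float beta,
                         cutlass::tfloat32_t const *A, int lda,
                         cutlass::tfloat32_t const *B, int ldb,
                         float const *C, int ldc,
                         float *D, int ldd) {
  Gemm::Arguments args({m, n, k},       // GEMM problem size
                       {A, lda},        // TensorRef to A
                       {B, ldb},        // TensorRef to B
                       {C, ldc},        // TensorRef to C (epilogue source)
                       {D, ldd},        // TensorRef to D (output)
                       {alpha, beta});  // epilogue: D = alpha * AB + beta * C
  Gemm gemm_op;
  return gemm_op(args);
}
```

Substituting `cutlass::bfloat16_t` or `double` for the input type reaches the other new Tensor Core data paths listed above, with the default configurations selecting appropriate instruction shapes.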
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -32,7 +32,7 @@ endif() message(STATUS "CMake Version: ${CMAKE_VERSION}") -project(CUTLASS VERSION 2.1.0 LANGUAGES CXX) +project(CUTLASS VERSION 2.2.0 LANGUAGES CXX) include(${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cmake) find_package(Doxygen QUIET) @@ -84,7 +84,7 @@ endif() set(CUTLASS_NVCC_ARCHS_SUPPORTED "") if (NOT CUDA_VERSION VERSION_LESS 7.5) - list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 50) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 53) endif() if (NOT CUDA_VERSION VERSION_LESS 8.0) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 60 61) @@ -98,6 +98,9 @@ endif() if (NOT CUDA_VERSION VERSION_LESS 10.0) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 75) endif() +if (NOT CUDA_VERSION VERSION_LESS 11.0) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 80) +endif() set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") @@ -154,7 +157,7 @@ endif() set(CUTLASS_NVCC_EMBED_CUBIN ON CACHE BOOL "Embed compiled CUDA kernel binaries into executables.") set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") -set(CUTLASS_ENABLE_F16C ON CACHE BOOL "Enable F16C x86 extensions in host code.") +set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.") # # CUTLASS generator cmake configuration @@ -248,8 +251,8 @@ if(CUDA_COMPILER MATCHES "[Cc]lang") endif() list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-pragma-unroll-threshold=100000) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm=-unroll-threshold=5000) + list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -pragma-unroll-threshold=100000) + list(APPEND CUTLASS_CUDA_CLANG_FLAGS -mllvm -unroll-threshold=5000) list(APPEND CUTLASS_CUDA_CLANG_FLAGS -Wno-unused-command-line-argument) string(REPLACE "." ";" CUDA_VERSION_PARTS ${CMAKE_CUDA_COMPILER_VERSION}) @@ -271,7 +274,7 @@ function(cutlass_apply_cuda_gencode_flags TARGET) set(NVCC_FLAGS) set(CLANG_FLAGS) foreach(ARCH ${CUTLASS_NVCC_ARCHS_ENABLED}) - list(APPEND CUTLASS_CUDA_CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH}) + list(APPEND CLANG_FLAGS --cuda-gpu-arch=sm_${ARCH}) set(CODES) if(CUTLASS_NVCC_EMBED_CUBIN) list(APPEND CODES sm_${ARCH}) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index fc95674d..f8778b80 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -9,15 +9,17 @@ This is the official list of CUTLASS developers and contributors. ## DEVELOPERS Andrew Kerr Haicheng Wu -Naila Farooqui +Manish Gupta Dustyn Blasig Pradeep Ramani -Manish Gupta -Aditya Atluri +Naila Farooqui +Piotr Majcher Paul Springer -David Tanner -Scott Yokim Jin Wang +Scott Yokim +Markus Hohnerbach +Aditya Atluri +David Tanner ## CONTRIBUTORS Timothy Costa @@ -25,12 +27,10 @@ Julien Demouth Brian Fahs Michael Goldfarb Mostafa Hagog -Markus Hohnerbach Fei Hu Alan Kaatz Tina Li Timmy Liu -Piotr Majcher Duane Merrill Kevin Siu Markus Tavenrath diff --git a/CUDA.cmake b/CUDA.cmake index d1eb4dbc..b8b343a7 100644 --- a/CUDA.cmake +++ b/CUDA.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
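The `80` entry added to `CUTLASS_NVCC_ARCHS_SUPPORTED` only appears when building with CUDA 11, and device code typically applies the same gating with compiler macros. The kernel below is an illustrative guard, not code from this patch:

```
#include <cstdio>

__global__ void which_path() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
  // Compiled only for sm_80 targets with the CUDA 11 toolkit: a safe place for
  // Ampere-only features such as cp.async or TF32/BF16/FP64 mma.sync.
  printf("running the SM80 code path\n");
#else
  printf("running the pre-SM80 fallback path\n");
#endif
}
```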
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -206,14 +206,14 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) function(cutlass_correct_source_file_language_property) if(CUDA_COMPILER MATCHES "clang") foreach(File ${ARGN}) - if(${File} MATCHES ".*\.cu$") + if(File MATCHES ".*\.cu$") set_source_files_properties(${File} PROPERTIES LANGUAGE CXX) endif() endforeach() endif() endfunction() -set(CUTLASS_UNITY_BUILD_ENABLED ON CACHE BOOL "Enable combined source compilation") +set(CUTLASS_UNITY_BUILD_ENABLED OFF CACHE BOOL "Enable combined source compilation") set(CUTLASS_UNITY_BUILD_BATCH_SIZE 16 CACHE STRING "Batch size for unified source files") function(cutlass_unify_source_files TARGET_ARGS_VAR) diff --git a/LICENSE.txt b/LICENSE.txt index 283345b5..64a49d68 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2017 - 2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017 - 2020, NVIDIA CORPORATION. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index dd1c4c65..c1507c03 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ ![ALT](/media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") -# CUTLASS 2.1 +# CUTLASS 2.2 -_CUTLASS 2.1 - April 2020_ +_CUTLASS 2.2 - June 2020_ CUTLASS is a collection of CUDA C++ template abstractions for implementing high-performance matrix-multiplication (GEMM) at all levels and scales within CUDA. @@ -17,14 +17,28 @@ and applications. To support a wide variety of applications, CUTLASS provides extensive support for mixed-precision computations, providing specialized data-movement and multiply-accumulate abstractions for half-precision floating -point (FP16), single-precision floating point (FP32), double-precision floating +point (FP16), BFloat16 (BF16), Tensor Float 32 (TF32), +single-precision floating point (FP32), double-precision floating point (FP64) types, integer data types (4b and 8b), and binary data types (1b). -Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations for + +Furthermore, CUTLASS demonstrates warp-synchronous matrix multiply operations targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta and Turing architectures. +NVIDIA's Volta, Turing, and Ampere architectures. See the [Quick Start Guide](/media/docs/quickstart.md) to get started quickly. +See the [functionality listing](media/docs/functionality.md) for the list of operations +supported at each level of the execution model hierarchy. 
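The BF16 and TF32 types mentioned here ship as numeric types in `cutlass/bfloat16.h` and `cutlass/tfloat32.h`, both new in this release. The host-side snippet below is a small illustration of rounding a `float` into them; the input value is arbitrary:

```
#include <iostream>

#include "cutlass/bfloat16.h"
#include "cutlass/tfloat32.h"
#include "cutlass/numeric_conversion.h"

int main() {
  // Round a float to the reduced-precision formats, then widen back for printing.
  cutlass::NumericConverter<cutlass::bfloat16_t, float> to_bf16;
  cutlass::NumericConverter<cutlass::tfloat32_t, float> to_tf32;

  cutlass::bfloat16_t b = to_bf16(0.333333f);   // 8-bit exponent, 7-bit mantissa
  cutlass::tfloat32_t t = to_tf32(0.333333f);   // 8-bit exponent, 10-bit mantissa

  std::cout << float(b) << " " << float(t) << std::endl;
  return 0;
}
```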
+
+# What's New in CUTLASS 2.2
+
+CUTLASS 2.2 is a significant update to CUTLASS adding:
+
+- Coverage of [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/)
+- Tensor Core-accelerated GEMMs targeting Tensor Float 32, BFloat16, and double-precision data types
+- Deep software pipelines using asynchronous copy
+- Intended to be compiled with [CUDA 11 Toolkit](https://developer.nvidia.com/cuda-toolkit)
+
 # What's New in CUTLASS 2.1

 CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
@@ -32,7 +46,6 @@ CUTLASS 2.1 is a minor update to CUTLASS 2.0 adding:
 - [Planar complex GEMM kernels](/examples/10_planar_complex/planar_complex.cu) targeting Volta and Turing Tensor Cores
 - BLAS-style API to launch kernels compiled into the [CUTLASS Library](/media/docs/quickstart.md#cutlass-library)
-
 # What's New in CUTLASS 2.0

 CUTLASS 2.0 is a substantial refactoring from the previous version, intended to offer:
@@ -43,9 +56,6 @@ CUTLASS 2.0 is a substantial refactoring from the previous version, intended to

 **See the [CHANGELOG](CHANGELOG.md) for more details.**

-See the [functionality listing](media/docs/functionality.md) for the list of operations
-supported at each level of the execution model hierarchy.
-
 # Performance
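The deep software pipelines mentioned above are built on asynchronous global-to-shared copies; CUTLASS wraps the `cp.async` instruction internally (see `include/cutlass/arch/memory_sm80.h` in the file list), but the idea can be sketched with the CUDA 11 pipeline primitives. The kernel below is only an illustration with made-up names and sizes; on pre-SM80 devices the copy falls back to a synchronous path:

```
#include <cuda_pipeline.h>

__global__ void stage_and_scale(float const *global_in, float *global_out) {
  __shared__ float tile[256];

  // Stage one element per thread from global to shared memory without passing
  // through registers; on SM80 this lowers to cp.async.
  __pipeline_memcpy_async(&tile[threadIdx.x], &global_in[threadIdx.x], sizeof(float));
  __pipeline_commit();          // close the current batch of asynchronous copies
  __pipeline_wait_prior(0);     // wait until that batch has landed in shared memory
  __syncthreads();

  global_out[threadIdx.x] = 2.0f * tile[threadIdx.x];
}
```

In a real multistage mainloop, several such batches are kept in flight so that global memory loads for later tiles overlap math on earlier ones.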

@@ -53,15 +63,15 @@ supported at each level of the execution model hierarchy.

 CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels,
 they exhibit performance comparable to cuBLAS for scalar GEMM
 computations. The above figure shows CUTLASS performance relative to cuBLAS
-for large matrix dimensions on an NVIDIA GeForce 2080 Ti and an NVIDIA TitanV
-using CUDA 10.2. Tensor Core operations are implemented using CUDA's
+for large matrix dimensions on an NVIDIA GeForce 2080 Ti, an NVIDIA A100, and an NVIDIA TitanV
+using CUDA 11.0 Toolkit. Tensor Core operations are implemented using CUDA's
 [mma instruction](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma).

 # Compatibility

 CUTLASS requires a C++11 host compiler and
-performs best when built with the [CUDA 10.2 Toolkit](https://developer.nvidia.com/cuda-toolkit).
-It is compatible with CUDA 9.2, CUDA 10.0, and CUDA 10.1.
+performs best when built with the [CUDA 11.0 Toolkit](https://developer.nvidia.com/cuda-toolkit).
+It is compatible with CUDA 9.2, CUDA 10.0, CUDA 10.1, and CUDA 10.2.

 We have tested the following environments.

@@ -70,27 +80,28 @@ We have tested the following environments.
 | Windows 10 | Microsoft Visual Studio 2015| |
 |             | Microsoft Visual Studio 2017| |
 | Ubuntu 16.04 | GCC 5.4.0 |
-| Ubuntu 18.04 | GCC 7.3.0 |
+| Ubuntu 18.04 | GCC 7.5.0 |

 Additionally, CUTLASS may be built with clang.
 See [these instructions](media/docs/quickstart.md#clang) for more details.

 CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on
-any Maxwell-, Pascal-, Volta-, or Turing- architecture NVIDIA GPU.
+any Maxwell-, Pascal-, Volta-, Turing-, or NVIDIA Ampere- architecture NVIDIA GPU.

-|**GPU**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
-|---|---|---|
-|NVIDIA GeForce 1080|9.2| |
-|NVIDIA TitanXP|9.2| |
-|NVIDIA Tesla P100|9.2| |
-|NVIDIA Tesla V100|9.2|10.1|
-|NVIDIA TitanV|9.2|10.1|
-|NVIDIA GeForce RTX 2080 TI, 2080, 2070|10.0|10.2|
-|NVIDIA Tesla T4|10.0|10.2|
+|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit**|**CUDA Toolkit Enabling Native Tensor Cores**|
+|---|---|---|---|
+|NVIDIA Tesla P100|6.0|9.2| |
+|NVIDIA GeForce 1080|6.1|9.2| |
+|NVIDIA TitanXP|6.1|9.2| |
+|NVIDIA Tesla V100|7.0|9.2|10.1|
+|NVIDIA TitanV|7.0|9.2|10.1|
+|NVIDIA GeForce RTX 2080 TI, 2080, 2070|7.5|10.0|10.2|
+|NVIDIA Tesla T4|7.5|10.0|10.2|
+|NVIDIA A100|8.0|11.0|11.0|

 # Documentation

-CUTLASS 2.1 is described in the following documents and the accompanying
+CUTLASS 2.2 is described in the following documents and the accompanying
 [Doxygen documentation](https://nvidia.github.io/cutlass).

 - [Quick Start Guide](/media/docs/quickstart.md) - build and run CUTLASS
@@ -124,7 +135,7 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
 ```

 Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
-for CUDA architecture versions 5.0, 6.0, 6.1, 7.0 and 7.5. To reduce compile time you can specify
+for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, and 8.0. To reduce compile time you can specify
 the architectures to build CUTLASS for by changing the CMake configuration setting `CUTLASS_NVCC_ARCHS`.
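Kernels built for a given entry in `CUTLASS_NVCC_ARCHS` will only run on devices whose compute capability matches the table above, so a host-side capability query is a useful companion. A small sketch using the CUDA runtime API; the helper name is illustrative:

```
#include <cstdio>
#include <cuda_runtime.h>

// Returns true if the current device can run kernels that use
// NVIDIA Ampere Tensor Core features (compute capability 8.0 or newer).
bool device_supports_sm80() {
  int device = 0;
  cudaDeviceProp props;
  if (cudaGetDevice(&device) != cudaSuccess ||
      cudaGetDeviceProperties(&props, device) != cudaSuccess) {
    return false;
  }
  printf("Detected compute capability %d.%d\n", props.major, props.minor);
  return props.major >= 8;
}
```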
@@ -210,6 +221,10 @@ examples/ 10_planar_complex/ # example demonstrating planar complex GEMM kernels 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + + 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel ``` ### Tools @@ -255,29 +270,32 @@ $ make cutlass_profiler -j Example command line for profiling SGEMM kernels is as follows: ``` -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096 +$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_simt_sgemm_128x128_nn + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 \ - --split_k_slices=1 --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 \ - --stages=2 --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 \ - --max_cc=1024 + cuBLAS: Passed - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - Runtime: 10.5424 ms - Memory: 4.63158 GiB/s + Bytes: 180355072 bytes + FLOPs: 115992428544 flops - Math: 13854.9 GFLOP/s + Runtime: 6.73655 ms + Memory: 24.934 GiB/s + + Math: 17218.4 GFLOP/s ``` [Further details about the CUTLASS Profiler are described here.](media/docs/profiler.md) diff --git a/cmake/nop.cu b/cmake/nop.cu index 571c6c7c..518a582b 100644 --- a/cmake/nop.cu +++ b/cmake/nop.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/cuBLAS.cmake b/cuBLAS.cmake index d7f330cf..4c73a1db 100644 --- a/cuBLAS.cmake +++ b/cuBLAS.cmake @@ -10,28 +10,35 @@ if((DEFINED CUTLASS_ENABLE_CUBLAS AND NOT CUTLASS_ENABLE_CUBLAS) OR message(STATUS "cuBLAS Disabled.") elseif(NOT TARGET cublas) - + find_path( - _CUBLAS_INCLUDE_DIR cublas.h - PATHS - ${CUDA_TOOLKIT_ROOT_DIR}/include - $ENV{CUBLAS_PATH}/include - $ENV{CUDA_PATH}/include - ${CUBLAS_PATH}/include - /usr/include) + _CUBLAS_INCLUDE_DIR + NAMES cublas.h + HINTS + ${CUBLAS_INCLUDE_PATH} + ENV CUBLAS_INCLUDE_PATH + ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + include + ) find_library( - _CUBLAS_LIBRARY cublas + _CUBLAS_LIBRARY + NAMES cublas HINTS - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - $ENV{CUBLAS_PATH}/lib64 - $ENV{CUBLAS_PATH}/lib/x64 - $ENV{CUDA_PATH}/lib64 - $ENV{CUDA_PATH}/lib/x64 - ${CUBLAS_PATH}/lib64 - ${CUBLAS_PATH}/lib/x64 - /usr/lib/x86_64-linux-gnu) + ${CUBLAS_LIBRARY_PATH} + ENV CUBLAS_LIBRARY_PATH + ${_CUBLAS_INCLUDE_DIR}/.. 
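The `12_gemm_bias_relu` example listed in the tree above obtains its fusion by replacing the default linear-combination epilogue with a ReLU variant. The instantiation below is a schematic; the FP16/SM75 configuration and tile shapes are illustrative choices, not copied from the example:

```
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/thread/linear_combination_relu.h"

// Epilogue that computes ReLU(alpha * accumulator + beta * source) per output element.
using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu<
    cutlass::half_t,                                   // output element type
    128 / cutlass::sizeof_bits<cutlass::half_t>::value,  // elements per vectorized access
    float,                                             // accumulator element type
    float>;                                            // element type for alpha/beta

using GemmBiasRelu = cutlass::gemm::device::Gemm<
    cutlass::half_t, cutlass::layout::ColumnMajor,     // A
    cutlass::half_t, cutlass::layout::ColumnMajor,     // B
    cutlass::half_t, cutlass::layout::ColumnMajor,     // C and D
    float,                                             // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm75,
    cutlass::gemm::GemmShape<128, 128, 32>,            // threadblock tile
    cutlass::gemm::GemmShape<64, 64, 32>,              // warp tile
    cutlass::gemm::GemmShape<16, 8, 8>,                // Tensor Core instruction shape
    EpilogueOp>;
```

The fused-GEMM example (`13_fused_two_gemms`) goes further and chains two such mainloops inside one kernel via the custom `b2b_gemm` components added under `examples/13_fused_two_gemms/`.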
+ ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + lib64 + lib/x64 + lib + ) if(_CUBLAS_INCLUDE_DIR AND _CUBLAS_LIBRARY) @@ -79,17 +86,20 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas) $) find_library( - _CUBLASLT_LIBRARY cublasLt + _CUBLASLT_LIBRARY + NAMES cublasLt HINTS - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - $ENV{CUBLAS_PATH}/lib64 - $ENV{CUBLAS_PATH}/lib/x64 - $ENV{CUDA_PATH}/lib64 - $ENV{CUDA_PATH}/lib/x64 - ${CUBLAS_PATH}/lib64 - ${CUBLAS_PATH}/lib/x64 - /usr/lib/x86_64-linux-gnu) + ${CUBLAS_LIBRARY_PATH} + ENV CUBLAS_LIBRARY_PATH + ${_CUBLAS_INCLUDE_DIR}/.. + ${CUBLAS_PATH} + ENV CUBLAS_PATH + ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES + lib64 + lib/x64 + lib + ) if(_CUBLASLT_LIBRARY AND NOT TARGET cublasLt) @@ -106,6 +116,8 @@ if(CUTLASS_ENABLE_CUBLAS AND NOT TARGET cublas) add_library(nvidia::cublasLt ALIAS cublasLt) + target_link_libraries(cublas INTERFACE cublasLt) + endif() endif() diff --git a/examples/00_basic_gemm/CMakeLists.txt b/examples/00_basic_gemm/CMakeLists.txt index 5b833b85..9ae257d9 100644 --- a/examples/00_basic_gemm/CMakeLists.txt +++ b/examples/00_basic_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/00_basic_gemm/basic_gemm.cu b/examples/00_basic_gemm/basic_gemm.cu index 41564632..bda012ab 100644 --- a/examples/00_basic_gemm/basic_gemm.cu +++ b/examples/00_basic_gemm/basic_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/CMakeLists.txt b/examples/01_cutlass_utilities/CMakeLists.txt index 2dfa083c..5f22b7b1 100644 --- a/examples/01_cutlass_utilities/CMakeLists.txt +++ b/examples/01_cutlass_utilities/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/01_cutlass_utilities/cutlass_utilities.cu b/examples/01_cutlass_utilities/cutlass_utilities.cu index 0b6aa386..d1eaa57f 100644 --- a/examples/01_cutlass_utilities/cutlass_utilities.cu +++ b/examples/01_cutlass_utilities/cutlass_utilities.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/CMakeLists.txt b/examples/02_dump_reg_shmem/CMakeLists.txt index 4e9af4fb..5e6112e0 100644 --- a/examples/02_dump_reg_shmem/CMakeLists.txt +++ b/examples/02_dump_reg_shmem/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/02_dump_reg_shmem/dump_reg_shmem.cu b/examples/02_dump_reg_shmem/dump_reg_shmem.cu index 39d58db8..ed712aa8 100644 --- a/examples/02_dump_reg_shmem/dump_reg_shmem.cu +++ b/examples/02_dump_reg_shmem/dump_reg_shmem.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/examples/03_visualize_layout/CMakeLists.txt b/examples/03_visualize_layout/CMakeLists.txt index 81211df9..5a08c0f8 100644 --- a/examples/03_visualize_layout/CMakeLists.txt +++ b/examples/03_visualize_layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/03_visualize_layout/options.h b/examples/03_visualize_layout/options.h index c72b1228..dd7de198 100644 --- a/examples/03_visualize_layout/options.h +++ b/examples/03_visualize_layout/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/register_layout.cu b/examples/03_visualize_layout/register_layout.cu index 655d1f37..0d2b25eb 100644 --- a/examples/03_visualize_layout/register_layout.cu +++ b/examples/03_visualize_layout/register_layout.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -34,6 +34,8 @@ #include "cutlass/layout/pitch_linear.h" #include "cutlass/layout/tensor_op_multiplicand_sm70.h" #include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + #include "visualize_layout.h" #include "register_layout.h" @@ -59,18 +61,40 @@ void RegisterLayouts(std::map // Integer matrix multiply.int4 8832 TN kblock128 {"TensorOpMultiplicand<4,128>", new VisualizeLayout>}, + // Integer matrix multiply.int4 16864 TN kblock256 + {"TensorOpMultiplicand<4,256>", + new VisualizeLayout>}, // Integer matrix multiply 8816 Interleaved-32 {"TensorOpMultiplicand<8,32>", new VisualizeLayout>}, // Integer matrix multiply 8816 TN kblock64 {"TensorOpMultiplicand<8,64>", new VisualizeLayout>}, + {"TensorOpMultiplicand<8,128>", + new VisualizeLayout>}, // Matrix Multiply 1688 TN kblock32 {"TensorOpMultiplicand<16,32>", new VisualizeLayout>}, // Matrix multiply 1688 NT {"TensorOpMultiplicand<16,64>", new VisualizeLayout>}, + // Matrix multiply 1688.TF32 TN kblock16 + {"TensorOpMultiplicand<32,16>", + new VisualizeLayout>}, + // Matrix multiply 1688.TF32 TN kblock32 + {"TensorOpMultiplicand<32,32>", + new VisualizeLayout>}, + // Matrix multiply 1688 NT + {"TensorOpMultiplicandCongruous<32,32>", + new VisualizeLayout< + cutlass::layout::TensorOpMultiplicandCongruous<32, 32>>}, + // Matrix multiply 884 NT + {"TensorOpMultiplicandCongruous<64,16>", + new VisualizeLayout< + cutlass::layout::TensorOpMultiplicandCongruous<64, 16>>}, + // Matrix multiply 884 TN + {"TensorOpMultiplicand64bCrosswise", + new VisualizeLayout}, {"TensorOpMultiplicandCongruous<128,4>", new VisualizeLayout< cutlass::layout::TensorOpMultiplicandCongruous<128, 4>>}, @@ -82,7 +106,7 @@ void RegisterLayouts(std::map cutlass::layout::VoltaTensorOpMultiplicandCongruous<16>>}, {"VoltaTensorOpMultiplicandCrosswise<16,32>", new VisualizeLayout< - cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>}, + cutlass::layout::VoltaTensorOpMultiplicandCrosswise<16, 32>>} }; for (auto layout : layout_pairs) { diff --git a/examples/03_visualize_layout/register_layout.h b/examples/03_visualize_layout/register_layout.h index fee911f7..1518e433 100644 --- a/examples/03_visualize_layout/register_layout.h +++ b/examples/03_visualize_layout/register_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/03_visualize_layout/visualize_layout.cpp b/examples/03_visualize_layout/visualize_layout.cpp index 8908d2c1..a0f27181 100644 --- a/examples/03_visualize_layout/visualize_layout.cpp +++ b/examples/03_visualize_layout/visualize_layout.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -65,14 +65,26 @@ void print_usage(std::ostream &out) { "--extent=64,64 --vectorize=32 --output-shape=256,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<4,128>\" " "--extent=128,32 --vectorize=32 --output-shape=256,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<4,256>\" " + "--extent=256,16 --vectorize=32 --output-shape=256,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<8,32>\" " "--extent=32,64 --vectorize=16 --output-shape=128,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<8,64>\" " "--extent=64,32 --vectorize=16 --output-shape=128,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<8,128>\" " + "--extent=128,16 --vectorize=16 --output-shape=128,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<16,32>\" " "--extent=32,32 --vectorize=8 --output-shape=64,4\n" << "$ 03_visualize_layout \"TensorOpMultiplicand<16,64>\" " "--extent=64,16 --vectorize=8 --output-shape=64,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<32,16>\" " + "--extent=16,32 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicand<32,32>\" " + "--extent=32,16 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<32,32>\" " + "--extent=32,16 --vectorize=4 --output-shape=32,4\n" + << "$ 03_visualize_layout \"TensorOpMultiplicandCongruous<64, 16>\" " + "--extent=16,16 --vectorize=2 --output-shape=16,4\n" << "$ 03_visualize_layout \"VoltaTensorOpMultiplicandCrosswise<16,32>\" " "--extent=32,64 --vectorize=4 --output-shape=64,4\n" << "$ 03_visualize_layout \"VotlaTensorOpMultiplicandCongruous<16>\" " diff --git a/examples/03_visualize_layout/visualize_layout.h b/examples/03_visualize_layout/visualize_layout.h index 031916c7..4093d277 100644 --- a/examples/03_visualize_layout/visualize_layout.h +++ b/examples/03_visualize_layout/visualize_layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/04_tile_iterator/CMakeLists.txt b/examples/04_tile_iterator/CMakeLists.txt index cef15624..cd32e228 100644 --- a/examples/04_tile_iterator/CMakeLists.txt +++ b/examples/04_tile_iterator/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/04_tile_iterator/tile_iterator.cu b/examples/04_tile_iterator/tile_iterator.cu index e6315760..5c56f33b 100644 --- a/examples/04_tile_iterator/tile_iterator.cu +++ b/examples/04_tile_iterator/tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/05_batched_gemm/CMakeLists.txt b/examples/05_batched_gemm/CMakeLists.txt index 6c9bf504..6cd0ca8d 100644 --- a/examples/05_batched_gemm/CMakeLists.txt +++ b/examples/05_batched_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/05_batched_gemm/batched_gemm.cu b/examples/05_batched_gemm/batched_gemm.cu index d1fecda6..a9d8a9c6 100644 --- a/examples/05_batched_gemm/batched_gemm.cu +++ b/examples/05_batched_gemm/batched_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/CMakeLists.txt b/examples/06_splitK_gemm/CMakeLists.txt index 750c6205..7b30ae16 100644 --- a/examples/06_splitK_gemm/CMakeLists.txt +++ b/examples/06_splitK_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/06_splitK_gemm/splitk_gemm.cu b/examples/06_splitK_gemm/splitk_gemm.cu index 5fb513cb..f0e1d578 100644 --- a/examples/06_splitK_gemm/splitk_gemm.cu +++ b/examples/06_splitK_gemm/splitk_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,7 +39,7 @@ inner product (1/16th of output), they accumulate to single output matrix. Writing a single high performance matrix multiplication kernel is hard but do-able. Whereas writing high performance kernels at scale which works for multiple problem sizes with good abstractions is -really hard. CUTLASS solves this problem by providing simplified abstractions (knobs) to compose +really hard. CUTLASS solves this problem by providing simplified abstractions to compose multiple sections of gemm kernel. When used properly, the kernels can hit peak performance of GPU easily. @@ -144,7 +144,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? // This code section describes ? 
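+// A brief elaboration, with names following this example: the LinearCombination
+// epilogue declared next applies a per-element linear scaling to the
+// accumulators before they are stored,
+//
+//     d_ij = alpha * sum_k(a_ik * b_kj) + beta * c_ij
+//
+// where the first template argument is the output element type and the second
+// is the number of elements per vectorized access (typically
+// 128 / cutlass::sizeof_bits<ElementOutput>::value). With GemmSplitKParallel,
+// the 16 partial products described at the top of this file are folded into a
+// single output matrix by a separate reduction pass. As a rough sketch of how
+// these pieces are consumed later in run() (tensor and scalar names follow the
+// conventions of these examples):
+//
+//   int split_k_slices = 16;
+//   typename Gemm::Arguments arguments{problem_size,
+//                                      tensor_a.device_ref(), tensor_b.device_ref(),
+//                                      tensor_c.device_ref(), tensor_d.device_ref(),
+//                                      {alpha, beta}, split_k_slices};
+//   size_t workspace_size = Gemm::get_workspace_size(arguments);
+//   cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+//   Gemm gemm_op;
+//   CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get()));
+//   CUTLASS_CHECK(gemm_op());   // partial GEMMs plus the parallel reduction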
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -172,17 +172,7 @@ using Gemm = cutlass::gemm::device::GemmSplitKParallel; -int main() { - - // - // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. - // - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. - // - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; - } +int run() { cudaDeviceProp props; @@ -316,11 +306,30 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + + // + // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + // + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero, so this test passes when built with older CUDA Toolkits. Its action are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/07_volta_tensorop_gemm/CMakeLists.txt b/examples/07_volta_tensorop_gemm/CMakeLists.txt index 56dfce9e..82e81722 100644 --- a/examples/07_volta_tensorop_gemm/CMakeLists.txt +++ b/examples/07_volta_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu index 447cc1cc..208c4f64 100644 --- a/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu +++ b/examples/07_volta_tensorop_gemm/volta_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -156,7 +156,7 @@ using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 4>; // <- MMA Op tile M = 8, N = 8, K = 4 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? // This code section describes ? 
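+// To make the tile hierarchy above concrete: each 64x64x32 warp tile is covered
+// by the Volta 8x8x4 mma.sync instruction stepped (64/8) x (64/8) = 64 times for
+// every K-step of 4, with 32/4 = 8 such K-steps per warp tile. The threadblock
+// swizzle marked "??" above is essentially an identity mapping:
+// GemmIdentityThreadblockSwizzle<> assigns each threadblock index directly to an
+// output tile coordinate without reordering.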
using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -188,15 +188,7 @@ using Gemm = cutlass::gemm::device::Gemm; -int main() { - - // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. - // - // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. - if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { - std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; - } +int run() { cudaDeviceProp props; @@ -223,7 +215,7 @@ int main() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c( problem_size.mn()); // <- Create matrix C with dimensions M x N cutlass::HostTensor tensor_d( @@ -326,12 +318,28 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); - return 0; + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + + // Volta Tensor Core operations exposed with mma.sync are first available in CUDA 10.1. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { + std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; + + // Returning zero when built on older Toolkits so tests pass. The actions of this SDK example are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index 9e011a1e..b4e4fe82 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu index 3440d82f..d7ba8331 100644 --- a/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu +++ b/examples/08_turing_tensorop_gemm/turing_tensorop_gemm.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -150,12 +150,12 @@ using SmArch = cutlass::arch::Sm75; using ShapeMMAThreadBlock = cutlass::gemm::GemmShape<128, 256, 64>; // <- threadblock tile M = 128, N = 256, K = 64 // This code section describes tile size a warp will compute -using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 16 +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 64>; // <- warp tile M = 64, N = 64, K = 64 // This code section describes the size of MMA op using ShapeMMAOp = cutlass::gemm::GemmShape<8, 8, 16>; // <- MMA Op tile M = 8, N = 8, K = 16 // This code section describes how threadblocks are scheduled on GPU -using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle; // <- ?? +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? // This code section describes the epilogue part of the kernel using EpilogueOp = cutlass::epilogue::thread::LinearCombination< @@ -186,7 +186,7 @@ using Gemm = cutlass::gemm::device::Gemm; -int main() { +int run() { // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available // in CUDA 10.2. @@ -222,7 +222,7 @@ int main() { cutlass::HostTensor tensor_a( problem_size.mk()); // <- Create matrix A with dimensions M x K cutlass::HostTensor tensor_b( - problem_size.nk()); // <- Create matrix B with dimensions N x K + problem_size.kn()); // <- Create matrix B with dimensions K x N cutlass::HostTensor tensor_c( problem_size.mn()); // <- Create matrix C with dimensions M x N cutlass::HostTensor tensor_d( @@ -325,12 +325,28 @@ int main() { tensor_ref_d.sync_host(); // Check if output from CUTLASS kernel and reference kernel are equal or not - std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), - tensor_ref_d.host_view()) - ? "Passed" - : "Failed") - << std::endl; + bool passed = cutlass::reference::host::TensorEquals( + tensor_d.host_view(), + tensor_ref_d.host_view()); - CUTLASS_CHECK(status); - return 0; + std::cout << (passed ? "Passed" : "Failed") << std::endl; + + return (passed ? 0 : -1); } + +int main() { + // Turing Tensor Core operations exposed with mma.sync and ldmatrix are first available + // in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.2 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes when built on older Toolkits. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/10_planar_complex/planar_complex.cu b/examples/10_planar_complex/planar_complex.cu index 7fc92870..b7318b99 100644 --- a/examples/10_planar_complex/planar_complex.cu +++ b/examples/10_planar_complex/planar_complex.cu @@ -500,7 +500,9 @@ int main(int argc, char const **args) { if (props.major < 7) { std::cerr << "Volta Tensor Core operations must be run on a machine with compute capability at least 70." << std::endl; - return -1; + + // Returning zero so this test passes on older architectures even though its actions are no-op. 
+ return 0; } else if (props.major == 7 && props.minor <= 2) { // @@ -508,7 +510,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; } } else if (props.major == 7 && props.minor >= 5) { @@ -517,7 +521,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this test passes on older Toolkits even though its actions are no-op. + return 0; } } diff --git a/examples/11_planar_complex_array/planar_complex_array.cu b/examples/11_planar_complex_array/planar_complex_array.cu index 3003a900..6a027053 100644 --- a/examples/11_planar_complex_array/planar_complex_array.cu +++ b/examples/11_planar_complex_array/planar_complex_array.cu @@ -560,7 +560,9 @@ int main(int argc, char const **args) { if (props.major < 7) { std::cerr << "Tensor Core operations must be run on a machine with compute capability at least 70." << std::endl; - return -1; + + // Returning zero so this passes on older architectures. Its actions are no-op. + return 0; } else if (props.major == 7 && props.minor <= 2) { // @@ -568,7 +570,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 1))) { std::cerr << "Volta Tensor Core operations must be compiled with CUDA 10.1 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; } } else if (props.major == 7 && props.minor >= 5) { @@ -577,7 +581,9 @@ int main(int argc, char const **args) { // if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; - return -1; + + // Returning zero so this passes on older Toolkits. Its actions are no-op. + return 0; } } diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt new file mode 100644 index 00000000..fb78d77f --- /dev/null +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -0,0 +1,27 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 12_gemm_bias_relu + gemm_bias_relu.cu + ) + diff --git a/examples/12_gemm_bias_relu/gemm_bias_relu.cu b/examples/12_gemm_bias_relu/gemm_bias_relu.cu new file mode 100644 index 00000000..7faaa98a --- /dev/null +++ b/examples/12_gemm_bias_relu/gemm_bias_relu.cu @@ -0,0 +1,282 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +*/ + +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" +#include "helper.h" + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. 
+using ElementAccumulator = float; // <- data type of accumulator +using ElementComputeEpilogue = ElementAccumulator; // <- data type of epilogue operations +using ElementInputA = cutlass::half_t; // <- data type of elements in input matrix A +using ElementInputB = cutlass::half_t; // <- data type of elements in input matrix B +using ElementOutput = float; // <- data type of elements in output matrix D + +// The code section below describes matrix layout of input and output matrices. Column Major for +// Matrix A, Row Major for Matrix B and Row Major for Matrix C +using LayoutInputA = cutlass::layout::ColumnMajor; +using LayoutInputB = cutlass::layout::ColumnMajor; +using LayoutOutput = cutlass::layout::RowMajor; + +// This code section describes whether you want to use tensor cores or regular SIMT cores on GPU SM +using MMAOp = cutlass::arch::OpClassTensorOp; + +// This code section describes CUDA SM architecture number +using SmArch = cutlass::arch::Sm75; + +// This code section describes the tile size a thread block will compute +using ShapeMMAThreadBlock = + cutlass::gemm::GemmShape<128, 128, 32>; // <- threadblock tile M = 128, N = 128, K = 32 +// This code section describes tile size a warp will compute +using ShapeMMAWarp = cutlass::gemm::GemmShape<64, 64, 32>; // <- warp tile M = 64, N = 64, K = 32 +// This code section describes the size of MMA op +using ShapeMMAOp = cutlass::gemm::GemmShape<16, 8, 8>; // <- MMA Op tile M = 8, N = 8, K = 4 + +// This code section describes how threadblocks are scheduled on GPU +using SwizzleThreadBlock = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>; // <- ?? + +// Define the epilogue operation as LinearCombinationRelu. This is approximately equal to +// +// d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * c_ij ) +// +using EpilogueOp = cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- this is the number of elements per + // vectorized memory access. For half + // precision, it's 8 elements. This becomes + // the vector width of math instructions in + // epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + +// Number of pipelines you want to use +constexpr int NumStages = 2; + +using Gemm = cutlass::gemm::device::Gemm; + +int run() { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + // Returning zero so this test passes on older Toolkits. Its actions are no-op. 
+ return 0; + } + + const int length_m = 5120; + const int length_n = 4096; + const int length_k = 4096; + + // Create a tuple of problem size for matrix multiplication + cutlass::gemm::GemmCoord problem_size(length_m, length_n, length_k); + + // Initialize tensors using CUTLASS helper functions + cutlass::HostTensor tensor_a( + problem_size.mk()); // <- Create matrix A with dimensions M x K + cutlass::HostTensor tensor_b( + problem_size.nk()); // <- Create matrix B with dimensions N x K + + cutlass::HostTensor tensor_c_bias( + {problem_size.m(), 1}); // <- Create matrix C with dimensions M x 1 + + cutlass::HostTensor tensor_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // CUTLASS kernel + cutlass::HostTensor tensor_ref_d( + problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from + // reference kernel + + // Fill input and output matrices on host using CUTLASS helper functions + cutlass::reference::host::TensorFillRandomUniform( + tensor_a.host_view(), + 1, + ElementInputA(4), + ElementInputA(-4), + 0); // <- Fill matrix A on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_b.host_view(), + 1, + ElementInputB(4), + ElementInputB(-4), + 0); // <- Fill matrix B on host with uniform-distribution random data + cutlass::reference::host::TensorFillRandomUniform( + tensor_c_bias.host_view(), + 1, + ElementOutput(4), + ElementOutput(-4), + 0); // <- Fill matrix C on host with uniform-distribution random data + cutlass::reference::host::TensorFill( + tensor_d.host_view()); // <- fill matrix D on host with zeros + cutlass::reference::host::TensorFill( + tensor_ref_d.host_view()); // <- fill matrix D for reference on host with zeros + + // Copy data from host to GPU + tensor_a.sync_device(); + tensor_b.sync_device(); + tensor_c_bias.sync_device(); + tensor_d.sync_device(); + tensor_ref_d.sync_device(); + + // Initialize alpha and beta for dot product computation + ElementComputeEpilogue alpha = ElementComputeEpilogue(1); + ElementComputeEpilogue beta = ElementComputeEpilogue(0); + + // Split K dimension into 1 partitions + int split_k_slices = 1; + + // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch + // instantiated CUTLASS kernel + typename Gemm::Arguments arguments{ + problem_size, // <- problem size of matrix multiplication + tensor_a.device_ref(), // <- reference to matrix A on device + tensor_b.device_ref(), // <- reference to matrix B on device + + {tensor_c_bias.device_data(), 0}, // <- the C matrix is treated as the bias vector. We can enable the GEMM + // to project away the N dimension by setting the stride to zero. 
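+      // In other words, a stride of 0 makes the source C operand constant along
+      // the N dimension, so every output column adds the same per-row bias and
+      // the kernel (with the LinearCombinationRelu epilogue declared above)
+      // effectively computes
+      //
+      //     d_ij = max(0, alpha * sum_k(a_ik * b_kj) + beta * bias_i)
+      //
+      // which is exactly what the host-side loop near the end of this example
+      // recomputes for the comparison.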
+ + tensor_d.device_ref(), // <- reference to matrix D on device + {alpha, beta}, // <- tuple of alpha and beta + split_k_slices}; // <- k-dimension split factor + + // Using the arguments, query for extra workspace required for matrix multiplication computation + size_t workspace_size = Gemm::get_workspace_size(arguments); + + // Allocate workspace memory + cutlass::device_memory::allocation workspace(workspace_size); + + // Instantiate CUTLASS kernel depending on templates + Gemm gemm_op; + + // Initialize CUTLASS kernel with arguments and workspace pointer + cutlass::Status status = gemm_op.initialize(arguments, workspace.get()); + CUTLASS_CHECK(status); + + // Launch initialized CUTLASS kernel + status = gemm_op(); + CUTLASS_CHECK(status); + + // + // Create instantiation for device reference gemm kernel + // + + cutlass::reference::device::Gemm + gemm_device_reference; + + // Launch device reference to compute strictly the product A * B + gemm_device_reference( + problem_size, + alpha, + tensor_a.device_ref(), + tensor_b.device_ref(), + 0, + tensor_c_bias.device_ref(), + tensor_ref_d.device_ref()); + + // Wait for kernels to finish + cudaDeviceSynchronize(); + + // Copy output data from CUTLASS and reference kernel to host for comparison + tensor_d.sync_host(); + tensor_ref_d.sync_host(); + + // Compute bias + relu in host code + for (int i = 0; i < problem_size.m(); ++i) { + for (int j = 0; j < problem_size.n(); ++j) { + tensor_ref_d.at({i, j}) = std::max( + ElementOutput(0), + ElementOutput(tensor_ref_d.at({i, j}) + beta * tensor_c_bias.at({i, 0})) + ); + } + } + + // Check if output from CUTLASS kernel and reference kernel are equal or not + std::cout << (cutlass::reference::host::TensorEquals(tensor_d.host_view(), + tensor_ref_d.host_view()) + ? "Passed" + : "Failed") + << std::endl; + + CUTLASS_CHECK(status); + return 0; +} + +int main() { + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/13_fused_two_gemms/CMakeLists.txt b/examples/13_fused_two_gemms/CMakeLists.txt new file mode 100644 index 00000000..ba51537c --- /dev/null +++ b/examples/13_fused_two_gemms/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without modification, are permitted +# provided that the following conditions are met: +# * Redistributions of source code must retain the above copyright notice, this list of +# conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright notice, this list of +# conditions and the following disclaimer in the documentation and/or other materials +# provided with the distribution. +# * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used +# to endorse or promote products derived from this software without specific prior written +# permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +cutlass_example_add_executable( + 13_fused_two_gemms + fused_gemm.cu + ) + +target_include_directories( + 13_fused_two_gemms + PRIVATE + . + ) + diff --git a/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h new file mode 100644 index 00000000..10a0d4bf --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h @@ -0,0 +1,190 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +void run_nonfused_gemm_f16() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape1 = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Gemm0 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back FP16 TN GEMMs...\n"; + bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_f16() { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + using ElementCompute = cutlass::half_t; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 32>; + using WarpShape1 = 
cutlass::gemm::GemmShape<32, 128, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back FP16 TN GEMMs...\n"; + bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_gemm_run.h b/examples/13_fused_two_gemms/b2b_gemm_run.h new file mode 100644 index 00000000..053064d7 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_run.h @@ -0,0 +1,608 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include +#include +#include + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/device/gemm.h" +#include "cutlass/util/reference/device/tensor_relu.h" + +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct B2bNonFusedGemmRun +{ + + using Gemm0 = Gemm0_; + using Gemm1 = Gemm1_; + using ElementAccumulator = typename Gemm0::ElementAccumulator; + using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bNonFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm0::ElementA, + typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename 
Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_D0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm0::Arguments arguments_0{ + problem_size_0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_D0.device_ref(), + {alpha0, beta0} + }; + + typename Gemm1::Arguments arguments_1{ + problem_size_1, + tensor_D0.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha1, beta1} + }; + + + Gemm0 gemm_op_0; + Gemm1 gemm_op_1; + + cutlass::Status status = gemm_op_0.initialize(arguments_0); + + CUTLASS_CHECK(status); + + status = gemm_op_1.initialize(arguments_1); + + CUTLASS_CHECK(status); + // + // Run the GEMM + // + + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = gemm_op_0(); + + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + for(int i = 0; i < 100; i++) { + + status = gemm_op_1(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float gemm0Time, gemm1Time, totalTime; + cudaEventElapsedTime(&gemm0Time, start, stop1); + cudaEventElapsedTime(&gemm1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; + std::cout << "total time " << totalTime / 100.0 << " ms\n"; + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename Gemm0::ElementA, typename Gemm0::LayoutA, + typename Gemm0::ElementB, typename Gemm0::LayoutB, + typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm0::Operator> + reference_gemm_0; + + cutlass::reference::device::Gemm< + typename Gemm1::ElementA, typename Gemm1::LayoutA, + typename Gemm1::ElementB, typename Gemm1::LayoutB, + typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm1::Operator> + reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + 
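+    // Taken together, the two reference GEMMs recompute the same back-to-back
+    // chain on the device:
+    //
+    //     D0_ref = relu(alpha0 * A0 * B0 + beta0 * C0)
+    //     D1_ref = relu(alpha1 * D0_ref * B1 + beta1 * C1)
+    //
+    // (ReLU applied only when `relu` is set). D1_ref is then compared for exact
+    // element-wise equality against tensor_D1 produced by the two timed CUTLASS
+    // launches above.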
if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + // Wait for kernels to finish + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } +}; + +template +struct B2bFusedGemmRun +{ + + using B2bGemm = B2bGemm_; + using ElementAccumulator = typename B2bGemm::ElementAccumulator; + using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> 
tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + +// cutlass::HostTensor< +// typename B2bGemm::ElementC, +// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_C0.sync_device(); + tensor_B1.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename B2bGemm::Arguments arguments{ + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + tensor_C0.device_ref(), + tensor_B1.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + }; + + B2bGemm b2b_gemm_op; + + cutlass::Status status = b2b_gemm_op.initialize(arguments); + + CUTLASS_CHECK(status); + + // + // Run the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = b2b_gemm_op(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "time " << gemmTime / 100.0 << " ms\n"; + + //tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, + ElementAccumulator, typename B2bGemm::Operator> + reference_gemm_0, reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + 
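+    // Note the contrast with the non-fused path: this fused run never allocates
+    // a tensor_D0 (its declaration earlier in this function is commented out),
+    // because the B2bGemm kernel keeps the first GEMM's output on-chip and feeds
+    // it directly into the second GEMM instead of writing the intermediate
+    // result to global memory. Only the reference computation materializes an
+    // explicit reference_D0.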
cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nC0 =\n" << tensor_C0.host_view() +// << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h new file mode 100644 index 00000000..1c3f15c2 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h @@ -0,0 +1,190 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +#pragma once + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "device/b2b_gemm.h" +#include "b2b_interleaved_gemm_run.h" + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +void run_nonfused_gemm_s8() { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 32, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<64, 64, 64>; + using WarpShape1 = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using Gemm0 = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + WarpShape0, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + using Gemm1 = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape1, + WarpShape1, + InstructionShape, + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bInterleavedNonFusedGemmRun nonFusedGemm; + + std::cout << "Running Non-fused back-to-back INT8 NT interleaved GEMMs...\n"; + bool pass = nonFusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(pass) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; +} + +void run_fused_gemm_s8() { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + cutlass::gemm::GemmCoord problem_size_0(128*1600, 64, 576); + cutlass::gemm::GemmCoord problem_size_1(128*1600, 128, 64); + ElementCompute alpha0 = ElementCompute(2); + ElementCompute beta0 = ElementCompute(0); + ElementCompute alpha1 = ElementCompute(2); + ElementCompute beta1 = ElementCompute(1); + + using ThreadblockShape0 = cutlass::gemm::GemmShape<128, 64, 64>; + using WarpShape0 = cutlass::gemm::GemmShape<32, 64, 64>; + using ThreadblockShape1 = cutlass::gemm::GemmShape<128, 128, 64>; + using 
WarpShape1 = cutlass::gemm::GemmShape<32, 128, 64>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + + using EpilogueOutputOp0 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + InstructionShape::kM * InstructionShape::kN / 32, + ElementAccumulator, + ElementCompute + >; + + using EpilogueOutputOp1 = + cutlass::epilogue::thread::LinearCombinationRelu< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >; + + + + using B2bGemm = cutlass::gemm::device::B2bGemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>, + 2 + >; + + B2bInterleavedFusedGemmRun fusedGemm; + + std::cout << "Running Fused back-to-back INT8 NT interleaved GEMMs...\n"; + bool passed = fusedGemm.run(problem_size_0, problem_size_1, alpha0, beta0, alpha1, beta1); + if(passed) + std::cout << "Pass\n"; + else + std::cout << "Fail\n"; + +} +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) diff --git a/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h new file mode 100644 index 00000000..906cabb4 --- /dev/null +++ b/examples/13_fused_two_gemms/b2b_interleaved_gemm_run.h @@ -0,0 +1,633 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ + +#pragma once + +#include +#include +#include + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/host_reorder.h" +#include "cutlass/util/reference/device/gemm.h" +#include "helper.h" + +#define CHECK_GT(val1, val2) \ + if((val1) <= (val2)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_GT failed\n"; +#define CHECK_TRUE(val) \ + if(!(val)) \ + std::cerr << __FILE__ << " " << __LINE__ << ": CHECK_TRUE failed\n"; + +template +struct B2bInterleavedNonFusedGemmRun +{ + + using Gemm0 = Gemm0_; + using Gemm1 = Gemm1_; + using ElementAccumulator = typename Gemm0::ElementAccumulator; + using ElementCompute = typename Gemm0::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bInterleavedNonFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm0::ElementA, + typename Gemm0::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementB, + typename Gemm0::LayoutB> tensor_B0_reordered(problem_size_0.kn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_C0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm0::ElementC, + typename Gemm0::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + typename Gemm1::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementB, + 
typename Gemm1::LayoutB> tensor_B1_reordered(problem_size_1.kn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename Gemm1::ElementC, + typename Gemm1::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + //Reorder B0 and B1 + cutlass::reorder_column( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0); + cutlass::reorder_column( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1); + + cutlass::reference::host::TensorFill( + tensor_D0.host_view()); + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + tensor_D0.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm0::Arguments arguments_0{ + problem_size_0, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_D0.device_ref(), + {alpha0, beta0} + }; + + typename Gemm1::Arguments arguments_1{ + problem_size_1, + tensor_D0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha1, beta1} + }; + + + Gemm0 gemm_op_0; + Gemm1 gemm_op_1; + + cutlass::Status status = gemm_op_0.initialize(arguments_0); + + CUTLASS_CHECK(status); + + status = gemm_op_1.initialize(arguments_1); + + CUTLASS_CHECK(status); + // + // Run the GEMM + // + cudaEvent_t start, stop1, stop2; + cudaEventCreate(&start); + cudaEventCreate(&stop1); + cudaEventCreate(&stop2); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = gemm_op_0(); + + CUTLASS_CHECK(status); + } + cudaEventRecord(stop1); + + for(int i = 0; i < 100; i++) { + status = gemm_op_1(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop2); + cudaDeviceSynchronize(); + float gemm0Time, gemm1Time, totalTime; + cudaEventElapsedTime(&gemm0Time, start, stop1); + cudaEventElapsedTime(&gemm1Time, stop1, stop2); + cudaEventElapsedTime(&totalTime, start, stop2); + std::cout << "gemm 0 time " << gemm0Time / 100.0 << " ms\n"; + std::cout << "gemm 1 time " << gemm1Time / 100.0 << " ms\n"; + std::cout << "total time " << totalTime / 100.0 << " ms\n"; + + tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename Gemm0::ElementA, typename Gemm0::LayoutA, + typename Gemm0::ElementB, typename Gemm0::LayoutB, + typename Gemm0::ElementC, typename Gemm0::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm0::Operator> + reference_gemm_0; + + cutlass::reference::device::Gemm< + typename Gemm1::ElementA, typename Gemm1::LayoutA, + typename Gemm1::ElementB, 
typename Gemm1::LayoutB, + typename Gemm1::ElementC, typename Gemm1::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm1::Operator> + reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + tensor_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_interleaved_nonfused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() + << "\nC0 =\n" << tensor_C0.host_view() + << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } +}; + +template +struct B2bInterleavedFusedGemmRun +{ + + using B2bGemm = B2bGemm_; + using ElementAccumulator = typename B2bGemm::ElementAccumulator; + using ElementCompute = typename B2bGemm::B2bGemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + B2bInterleavedFusedGemmRun( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + std::cerr << "Not implemented\n"; + return false; + } + + return true; + } + + + + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size_0, + cutlass::gemm::GemmCoord problem_size_1, + ElementCompute alpha0 = 
ElementCompute(1), + ElementCompute beta0 = ElementCompute(0), + ElementCompute alpha1 = ElementCompute(1), + ElementCompute beta1 = ElementCompute(0), + bool relu = true) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename B2bGemm::ElementA, + typename B2bGemm::LayoutA> tensor_A0(problem_size_0.mk()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B0_reordered(problem_size_0.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C0(problem_size_0.mn()); + +// cutlass::HostTensor< +// typename B2bGemm::ElementC, +// typename B2bGemm::LayoutC> tensor_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D0(problem_size_0.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementB, + typename B2bGemm::LayoutB> tensor_B1_reordered(problem_size_1.kn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_C1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> tensor_D1(problem_size_1.mn()); + + cutlass::HostTensor< + typename B2bGemm::ElementC, + typename B2bGemm::LayoutC> reference_D1(problem_size_1.mn()); + + + CHECK_TRUE(initialize_tensor(tensor_A0.host_view(), init_A, seed + 2019)); + CHECK_TRUE(initialize_tensor(tensor_B0.host_view(), init_B, seed + 2018)); + CHECK_TRUE(initialize_tensor(tensor_C0.host_view(), init_C, seed + 2017)); + CHECK_TRUE(initialize_tensor(tensor_B1.host_view(), init_B, seed + 2016)); + CHECK_TRUE(initialize_tensor(tensor_C1.host_view(), init_C, seed + 2015)); + + //Reorder B0 + cutlass::reorder_column( + tensor_B0_reordered.host_ref(), tensor_B0.host_ref(), problem_size_0); + cutlass::reorder_column( + tensor_B1_reordered.host_ref(), tensor_B1.host_ref(), problem_size_1); + + cutlass::reference::host::TensorFill( + tensor_D1.host_view()); + cutlass::reference::host::TensorFill( + reference_D0.host_view()); + cutlass::reference::host::TensorFill( + reference_D1.host_view()); + + tensor_A0.sync_device(); + tensor_B0.sync_device(); + tensor_B0_reordered.sync_device(); + tensor_C0.sync_device(); + //tensor_D0.sync_device(); + tensor_B1.sync_device(); + tensor_B1_reordered.sync_device(); + tensor_C1.sync_device(); + tensor_D1.sync_device(); + reference_D0.sync_device(); + reference_D1.sync_device(); + + // + // Initialize the GEMM operator + // + + typename B2bGemm::Arguments arguments{ + problem_size_0, + problem_size_1, + tensor_A0.device_ref(), + tensor_B0_reordered.device_ref(), + tensor_C0.device_ref(), + tensor_B1_reordered.device_ref(), + tensor_C1.device_ref(), + tensor_D1.device_ref(), + {alpha0, beta0}, + {alpha1, beta1}, + 1, /*threadblock_swizzle_k_tile*/ + }; + + B2bGemm b2b_gemm_op; + + cutlass::Status status = b2b_gemm_op.initialize(arguments); + + CUTLASS_CHECK(status); + + // + // Run the GEMM + // + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + for(int i = 0; i < 100; i++) { + status = b2b_gemm_op(); + + CUTLASS_CHECK(status); + } + + cudaEventRecord(stop); + cudaDeviceSynchronize(); + float gemmTime; + cudaEventElapsedTime(&gemmTime, start, stop); + std::cout << "time " << gemmTime / 
100.0 << " ms\n"; + + //tensor_D0.sync_host(); + tensor_D1.sync_host(); + + // + // Verify + // + cutlass::reference::device::Gemm< + typename B2bGemm::ElementA, typename B2bGemm::LayoutA, + typename B2bGemm::ElementB, typename B2bGemm::LayoutB, + typename B2bGemm::ElementC, typename B2bGemm::LayoutC, ElementCompute, + ElementAccumulator, typename B2bGemm::Operator> + reference_gemm_0, reference_gemm_1; + + reference_gemm_0( + problem_size_0, + alpha0, + tensor_A0.device_ref(), + tensor_B0.device_ref(), + beta0, + tensor_C0.device_ref(), + reference_D0.device_ref() + ); + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D0.device_view()); + } + + reference_gemm_1( + problem_size_1, + alpha1, + reference_D0.device_ref(), + tensor_B1.device_ref(), + beta1, + tensor_C1.device_ref(), + reference_D1.device_ref() + ); + + + if(relu) { + cutlass::reference::device::TensorReLu(reference_D1.device_view()); + } + + cudaDeviceSynchronize(); + reference_D0.sync_host(); + reference_D1.sync_host(); + + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D0.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(tensor_D1.host_view()), 0); + CHECK_GT(cutlass::reference::host::TensorNorm(reference_D1.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D1.host_view(), + tensor_D1.host_view()); + + CHECK_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_B2bGemm_device_interleaved_fused.txt"; + std::cerr << "Dumping results in " << fname.str() << "\n"; + + std::ofstream file(fname.str()); + + file + << "A0 =\n" << tensor_A0.host_view() + << "\nB0 =\n" << tensor_B0.host_view() + << "\nB0_reordered =\n" << tensor_B0_reordered.host_view() + << "\nC0 =\n" << tensor_C0.host_view() +// << "\nD0 =\n" << tensor_D0.host_view() + << "\nB1 =\n" << tensor_B1.host_view() + << "\nB1_reordered =\n" << tensor_B1_reordered.host_view() + << "\nC1 =\n" << tensor_C1.host_view() + << "\n\nReference =\n" << reference_D1.host_view() + << "\nComputed =\n" << tensor_D1.host_view(); + } + + return passed; + } + +}; + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/device/b2b_gemm.h b/examples/13_fused_two_gemms/device/b2b_gemm.h new file mode 100644 index 00000000..3f161435 --- /dev/null +++ b/examples/13_fused_two_gemms/device/b2b_gemm.h @@ -0,0 +1,439 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" + +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/epilogue/thread/linear_combination_relu.h" + +#include "kernel/b2b_gemm.h" +#include "kernel/default_b2b_gemm.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator_ = ElementC_, + /// Operator class tag + typename OperatorClass_ = arch::OpClassSimt, + /// Tag indicating architecture to tune for + typename ArchTag_ = arch::Sm70, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + 
ElementAccumulator_>::EpilogueOutputOp, + /// Epilogue output operator + typename EpilogueOutputOp1_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, + /// Number of stages used in the pipelined mainloop + int Stages = + DefaultGemmConfiguration::kStages, + /// Access granularity of A matrix in units of elements + int AlignmentA = + DefaultGemmConfiguration::kAlignmentA, + /// Access granularity of B matrix in units of elements + int AlignmentB = + DefaultGemmConfiguration::kAlignmentB, + /// If true, kernel supports split-K with serial reduction + bool SplitKSerial = false, + /// Operation performed by GEMM + typename Operator_ = typename DefaultGemmConfiguration< + OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, + ElementAccumulator_>::Operator, + /// Whether Beta is zero or not + bool IsBetaZero = false> +class B2bGemm { + public: + + using ElementA = ElementA_; + using LayoutA = LayoutA_; + using TensorRefA = TensorRef; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + using TensorRefB = TensorRef; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + using TensorRefC = TensorRef; + using TensorRefD = TensorRef; + using ElementAccumulator = ElementAccumulator_; + using OperatorClass = OperatorClass_; + using ArchTag = ArchTag_; + using ThreadblockShape0 = ThreadblockShape0_; + using ThreadblockShape1 = ThreadblockShape1_; + using WarpShape0 = WarpShape0_; + using WarpShape1 = WarpShape1_; + using InstructionShape = InstructionShape_; + using EpilogueOutputOp0 = EpilogueOutputOp0_; + using EpilogueOutputOp1 = EpilogueOutputOp1_; + using ThreadblockSwizzle = ThreadblockSwizzle_; + using Operator = Operator_; + static int const kStages = Stages; + static int const kAlignmentA = AlignmentA; + static int const kAlignmentB = AlignmentB; + static int const kAlignmentC = EpilogueOutputOp1::kCount; + static bool const kSplitKSerial = SplitKSerial; + static bool const kIsBetaZero = IsBetaZero; + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + /// Define the kernel + using B2bGemmKernel = typename kernel::DefaultB2bGemm< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + kStages, + kSplitKSerial, + Operator, + kIsBetaZero + >::B2bGemmKernel; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + GemmCoord problem_size_0; + GemmCoord problem_size_1; + TensorRef ref_A0; + TensorRef ref_B0; + TensorRef ref_C0; + TensorRef ref_B1; + TensorRef ref_C1; + TensorRef ref_D1; + typename EpilogueOutputOp0::Params epilogue0; + typename EpilogueOutputOp1::Params epilogue1; + int split_k_slices; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments(): problem_size_0(0, 0, 0), problem_size_1(0, 0, 0), split_k_slices(1) { + + } + + /// Constructs an Arguments structure + CUTLASS_HOST_DEVICE + Arguments( + GemmCoord problem_size_0_, + GemmCoord problem_size_1_, + TensorRef ref_A0_, + TensorRef ref_B0_, + TensorRef ref_C0_, + TensorRef ref_B1_, + TensorRef ref_C1_, + TensorRef 
ref_D1_, + typename EpilogueOutputOp0::Params epilogue0_ = + typename EpilogueOutputOp0::Params(), + typename EpilogueOutputOp1::Params epilogue1_ = + typename EpilogueOutputOp1::Params(), + int split_k_slices_ = 1 + ): + problem_size_0(problem_size_0_), + problem_size_1(problem_size_1_), + ref_A0(ref_A0_), + ref_B0(ref_B0_), + ref_C0(ref_C0_), + ref_B1(ref_B1_), + ref_C1(ref_C1_), + ref_D1(ref_D1_), + epilogue0(epilogue0_), + epilogue1(epilogue1_), + split_k_slices(split_k_slices_) { + + } + }; + +private: + + /// Kernel parameters object + typename B2bGemmKernel::Params params_; + +public: + + /// Constructs the GEMM. + B2bGemm() { } + + /// Determines whether the GEMM can execute the given problem. + static Status can_implement(Arguments const &args) { + + if (!kSplitKSerial && args.split_k_slices > 1) { + return Status::kErrorInvalidProblem; + } + + Status status = B2bGemmKernel::can_implement( + args.problem_size_0, + args.problem_size_1, + args.ref_A0.non_const_ref(), + args.ref_B0.non_const_ref(), + args.ref_C0.non_const_ref(), + args.ref_B1.non_const_ref(), + args.ref_C1.non_const_ref(), + args.ref_D1 + ); + + if (status != Status::kSuccess) { + return status; + } + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + + size_t bytes = 0; + + // Determine grid shape + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord tiled_shape = threadblock_swizzle.get_tiled_shape( + args.problem_size_0, + {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK}, + args.split_k_slices); + + if (kSplitKSerial && args.split_k_slices > 1) { + + + bytes += sizeof(int) * size_t(tiled_shape.m()) * size_t(tiled_shape.n()); + } + + return bytes; + } + + /// Initializes GEMM state from arguments. 
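+ ///
+ /// A typical call sequence, mirroring the host-side harnesses in this example
+ /// (the tensor_* objects and scalars are placeholders for user-provided storage):
+ ///
+ ///   B2bGemm::Arguments args{
+ ///     problem_size_0, problem_size_1,
+ ///     tensor_A0.device_ref(), tensor_B0.device_ref(), tensor_C0.device_ref(),
+ ///     tensor_B1.device_ref(), tensor_C1.device_ref(), tensor_D1.device_ref(),
+ ///     {alpha0, beta0}, {alpha1, beta1}};
+ ///
+ ///   B2bGemm b2b_gemm_op;
+ ///   cutlass::Status status = b2b_gemm_op.initialize(args);
+ ///   CUTLASS_CHECK(status);
+ ///   status = b2b_gemm_op();
+ ///   CUTLASS_CHECK(status);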
+ Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) {
+
+ // Determine grid shape
+ ThreadblockSwizzle threadblock_swizzle;
+
+ cutlass::gemm::GemmCoord grid_shape = threadblock_swizzle.get_tiled_shape(
+ args.problem_size_0,
+ {ThreadblockShape0::kM, ThreadblockShape0::kN, ThreadblockShape0::kK},
+ args.split_k_slices);
+// cutlass::gemm::GemmCoord grid_shape_1 = threadblock_swizzle.get_tiled_shape(
+// args.problem_size_1,
+// {ThreadblockShape1::kM, ThreadblockShape1::kN, ThreadblockShape1::kK},
+// args.split_k_slices);
+
+ if (kSplitKSerial) {
+ if (args.split_k_slices > 1) {
+ if (!workspace) {
+ return Status::kErrorWorkspaceNull;
+ }
+
+ size_t bytes = get_workspace_size(args);
+
+ cudaError_t result = cudaMemsetAsync(workspace, 0, bytes, stream);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+ }
+ }
+ else {
+
+ if (args.split_k_slices > 1) {
+ return Status::kErrorInvalidProblem;
+ }
+ }
+
+ // Initialize the Params structure
+ params_ = typename B2bGemmKernel::Params{
+ args.problem_size_0,
+ args.problem_size_1,
+ grid_shape,
+ args.ref_A0.non_const_ref(),
+ args.ref_B0.non_const_ref(),
+ args.ref_C0.non_const_ref(),
+ args.ref_B1.non_const_ref(),
+ args.ref_C1.non_const_ref(),
+ args.ref_D1,
+ args.epilogue0,
+ args.epilogue1,
+ static_cast<int *>(workspace),
+ };
+
+ return Status::kSuccess;
+ }
+
+ /// Lightweight update given a subset of arguments
+ Status update(Arguments const &args, void *workspace = nullptr) {
+
+ if (kSplitKSerial && args.split_k_slices > 1) {
+ if (!workspace) {
+ return Status::kErrorWorkspaceNull;
+ }
+ }
+
+ params_.ref_A0.reset(args.ref_A0.non_const_ref().data());
+ params_.ref_B0.reset(args.ref_B0.non_const_ref().data());
+ params_.ref_C0.reset(args.ref_C0.non_const_ref().data());
+ params_.ref_B1.reset(args.ref_B1.non_const_ref().data());
+ params_.ref_C1.reset(args.ref_C1.non_const_ref().data());
+ params_.ref_D1.reset(args.ref_D1.data());
+ params_.output_op_0 = args.epilogue0;
+ params_.output_op_1 = args.epilogue1;
+ params_.semaphore = static_cast<int *>(workspace);
+
+ return Status::kSuccess;
+ }
+
+ /// Runs the kernel using initialized state.
+ Status run(cudaStream_t stream = nullptr) {
+
+ ThreadblockSwizzle threadblock_swizzle;
+
+ dim3 grid = threadblock_swizzle.get_grid_shape(params_.grid_tiled_shape);
+ dim3 block(B2bGemmKernel::kThreadCount, 1, 1);
+
+ cudaError_t result;
+
+ int smem_size = int(sizeof(typename B2bGemmKernel::SharedStorage));
+ if (smem_size >= (48 << 10)) {
+ result = cudaFuncSetAttribute(Kernel<B2bGemmKernel>,
+ cudaFuncAttributeMaxDynamicSharedMemorySize,
+ smem_size);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+
+ result = cudaFuncSetAttribute(
+ Kernel<B2bGemmKernel>,
+ cudaFuncAttributePreferredSharedMemoryCarveout, 100);
+
+ if (result != cudaSuccess) {
+ return Status::kErrorInternal;
+ }
+ }
+
+ cutlass::Kernel<B2bGemmKernel><<<grid, block, smem_size, stream>>>(params_);
+
+ result = cudaGetLastError();
+
+ return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal;
+ }
+
+ /// Runs the kernel using initialized state.
+ Status operator()(cudaStream_t stream = nullptr) {
+ return run(stream);
+ }
+
+ /// Runs the kernel using initialized state. 
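+ /// This overload is a convenience entry point: it initializes state from the
+ /// given arguments and optional workspace, then launches the kernel on the
+ /// given stream, returning the first non-success status encountered.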
+ Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/13_fused_two_gemms/fused_gemm.cu b/examples/13_fused_two_gemms/fused_gemm.cu new file mode 100644 index 00000000..8f5d4f2c --- /dev/null +++ b/examples/13_fused_two_gemms/fused_gemm.cu @@ -0,0 +1,74 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/** +*/ + +#include "b2b_gemm_f16t_f16n_f16t_tensor_op_f16_sm75.h" +#include "b2b_gemm_s8n_s8t_s8n_tensor_op_s32_sm75.h" + +int run() { + + cudaDeviceProp props; + + cudaError_t error = cudaGetDeviceProperties(&props, 0); + if (error != cudaSuccess) { + std::cerr << "cudaGetDeviceProperties() returned an error: " << cudaGetErrorString(error) << std::endl; + return -1; + } + + if (!(props.major * 10 + props.minor >= 75)) { + std::cerr << "Turing Tensor Ops must be run on a machine with compute capability at least 75." + << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + run_nonfused_gemm_f16(); + run_fused_gemm_f16(); + run_nonfused_gemm_s8(); + run_fused_gemm_s8(); +#endif + + return 0; +} + +int main() { + // Turing Tensor Core operations exposed with mma.sync are first available in CUDA 10.2. + // + // CUTLASS must be compiled with CUDA 10.1 Toolkit to run these examples. 
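+ //
+ // The guard below inspects __CUDACC_VER_MAJOR__ / __CUDACC_VER_MINOR__ and only
+ // calls run() when compiling with CUDA 10.2 or newer; on older toolkits the
+ // example prints a diagnostic and exits with code 0 so the test is a no-op.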
+ if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { + std::cerr << "Turing Tensor Core operations must be compiled with CUDA 10.2 Toolkit or later." << std::endl; + + // Returning zero so this test passes on older Toolkits. Its actions are no-op. + return 0; + } + else { + return run(); + } +} + diff --git a/examples/13_fused_two_gemms/kernel/b2b_gemm.h b/examples/13_fused_two_gemms/kernel/b2b_gemm.h new file mode 100644 index 00000000..d106fa46 --- /dev/null +++ b/examples/13_fused_two_gemms/kernel/b2b_gemm.h @@ -0,0 +1,407 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/semaphore.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename B2bMma_, ///! Threadblock-scoped matrix multiply-accumulate + typename Epilogue_, ///! Epilogue + typename ThreadblockSwizzle_, ///! Threadblock swizzling function + bool SplitKSerial ///! If true, code supporting split-K via serial reduction is enabled. 
+> +struct B2bGemm { + + using B2bMma = B2bMma_; + using Epilogue = Epilogue_; + using OutputOp0 = typename B2bMma::OutputOp; + using OutputOp1 = typename Epilogue::OutputOp; + using ThreadblockSwizzle = ThreadblockSwizzle_; + static bool const kSplitKSerial = SplitKSerial; + + /// Warp count (concept: GemmShape) + using WarpCount0 = typename B2bMma::WarpCount0; + static int const kThreadCount = 32 * WarpCount0::kCount; + + /// Parameters structure + struct Params { + cutlass::gemm::GemmCoord problem_size_0; + cutlass::gemm::GemmCoord problem_size_1; + cutlass::gemm::GemmCoord grid_tiled_shape; + typename B2bMma::IteratorA0::Params params_A0; + typename B2bMma::IteratorA0::TensorRef ref_A0; + typename B2bMma::IteratorB0::Params params_B0; + typename B2bMma::IteratorB0::TensorRef ref_B0; + typename Epilogue::OutputTileIterator::Params params_C0; + typename Epilogue::OutputTileIterator::TensorRef ref_C0; + typename B2bMma::IteratorB1::Params params_B1; + typename B2bMma::IteratorB1::TensorRef ref_B1; + typename Epilogue::OutputTileIterator::Params params_C1; + typename Epilogue::OutputTileIterator::TensorRef ref_C1; + typename Epilogue::OutputTileIterator::Params params_D1; + typename Epilogue::OutputTileIterator::TensorRef ref_D1; + typename OutputOp0::Params output_op_0; + typename OutputOp1::Params output_op_1; + int *semaphore; + int gemm_k_iterations_0; + int gemm_k_size_0; + int gemm_k_iterations_1; + int gemm_k_size_1; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): semaphore(0), gemm_k_iterations_0(0), gemm_k_size_0(0), + gemm_k_iterations_1(0), gemm_k_size_1(0) { } + + CUTLASS_HOST_DEVICE + Params( + cutlass::gemm::GemmCoord const & problem_size_0, + cutlass::gemm::GemmCoord const & problem_size_1, + cutlass::gemm::GemmCoord const & grid_tiled_shape, + typename B2bMma::IteratorA0::TensorRef ref_A0, + typename B2bMma::IteratorB0::TensorRef ref_B0, + typename Epilogue::OutputTileIterator::TensorRef ref_C0, + typename B2bMma::IteratorB1::TensorRef ref_B1, + typename Epilogue::OutputTileIterator::TensorRef ref_C1, + typename Epilogue::OutputTileIterator::TensorRef ref_D1, + typename OutputOp0::Params output_op_0 = typename OutputOp0::Params(), + typename OutputOp1::Params output_op_1 = typename OutputOp1::Params(), + int *workspace = nullptr + ): + problem_size_0(problem_size_0), + problem_size_1(problem_size_1), + grid_tiled_shape(grid_tiled_shape), + params_A0(ref_A0.layout()), + ref_A0(ref_A0), + params_B0(ref_B0.layout()), + ref_B0(ref_B0), + params_C0(ref_C0.layout()), + ref_C0(ref_C0), + params_B1(ref_B1.layout()), + ref_B1(ref_B1), + params_C1(ref_C1.layout()), + ref_C1(ref_C1), + params_D1(ref_D1.layout()), + ref_D1(ref_D1), + output_op_0(output_op_0), + output_op_1(output_op_1) { + + int total_gemm_k_iterations_0 = (problem_size_0.k() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK; + int gemm_k_iterations_0 = (total_gemm_k_iterations_0 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k(); + gemm_k_size_0 = gemm_k_iterations_0 * B2bMma::Shape0::kK; + int total_gemm_k_iterations_1 = (problem_size_1.k() + B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK; + int gemm_k_iterations_1 = (total_gemm_k_iterations_1 + grid_tiled_shape.k() - 1) / grid_tiled_shape.k(); + gemm_k_size_1 = gemm_k_iterations_1 * B2bMma::Shape1::kK; + + semaphore = workspace; + } + }; + + /// Shared memory storage structure + union SharedStorage { + typename B2bMma::B2bMmaSharedStorage main_loop; + typename Epilogue::SharedStorage epilogue; + }; + + // + // Methods + // + + CUTLASS_HOST_DEVICE + B2bGemm() 
{ } + + /// Determines whether kernel satisfies alignment + static Status can_implement( + cutlass::gemm::GemmCoord const & problem_size_0, + cutlass::gemm::GemmCoord const & problem_size_1, + typename B2bMma::IteratorA0::TensorRef ref_A0, + typename B2bMma::IteratorB0::TensorRef ref_B0, + typename Epilogue::OutputTileIterator::TensorRef ref_C0, + typename B2bMma::IteratorB1::TensorRef ref_B1, + typename Epilogue::OutputTileIterator::TensorRef ref_C1, + typename Epilogue::OutputTileIterator::TensorRef ref_D1) { + + static int const kAlignmentA = B2bMma::IteratorA0::AccessType::kElements; + static int const kAlignmentB = B2bMma::IteratorB0::AccessType::kElements; + static int const kAlignmentC = Epilogue::OutputTileIterator::kElementsPerAccess; + + if (!TensorRef_aligned(ref_A0, kAlignmentA)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_B0, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_C0, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_B1, kAlignmentB)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_C1, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if (!TensorRef_aligned(ref_D1, kAlignmentC)) { + return Status::kErrorMisalignedOperand; + } + + if ((problem_size_0.m() % kAlignmentA) || (problem_size_0.k() % kAlignmentA) || + (problem_size_0.n() % kAlignmentB) || (problem_size_0.k() % kAlignmentB) || + (problem_size_0.m() % kAlignmentC) || (problem_size_0.n() % kAlignmentC) || + (problem_size_1.m() % kAlignmentA) || (problem_size_1.k() % kAlignmentA) || + (problem_size_1.n() % kAlignmentB) || (problem_size_1.k() % kAlignmentB) || + (problem_size_1.m() % kAlignmentC) || (problem_size_1.n() % kAlignmentC)) { + + return Status::kErrorMisalignedOperand; + } + + return Status::kSuccess; + } + + /// Executes one GEMM + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) { + + // Compute threadblock location + ThreadblockSwizzle threadblock_swizzle; + + cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + + // Compute initial location in logical coordinates + cutlass::MatrixCoord tb_offset_A0{ + threadblock_tile_offset.m() * B2bMma::Shape0::kM, + threadblock_tile_offset.k() * params.gemm_k_size_0, + }; + + cutlass::MatrixCoord tb_offset_B0{ + threadblock_tile_offset.k() * params.gemm_k_size_0, + threadblock_tile_offset.n() * B2bMma::Shape0::kN + }; + + cutlass::MatrixCoord tb_offset_B1{ + threadblock_tile_offset.k() * params.gemm_k_size_1, + threadblock_tile_offset.n() * B2bMma::Shape1::kN + }; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k_0 = min( + params.problem_size_0.k(), + (threadblock_tile_offset.k() + 1) * params.gemm_k_size_0); + + // Compute threadblock-scoped matrix multiply-add + int gemm_k_iterations_0 = (problem_size_k_0 - tb_offset_A0.column() + B2bMma::Shape0::kK - 1) / B2bMma::Shape0::kK; + + // Problem size is a function of threadblock index in the K dimension + int problem_size_k_1 = min( + params.problem_size_1.k(), + (threadblock_tile_offset.k() + 1) * params.gemm_k_size_1); + + // Compute threadblock-scoped matrix multiply-add +// int gemm_k_iterations_1 = (problem_size_k_1 - tb_offset_B1.row() + 
B2bMma::Shape1::kK - 1) / B2bMma::Shape1::kK; + + + // Compute position within threadblock + int thread_idx = threadIdx.x; + + // Construct iterators to A and B operands + typename B2bMma::IteratorA0 iterator_A0( + params.params_A0, + params.ref_A0.data(), + {params.problem_size_0.m(), problem_size_k_0}, + thread_idx, + tb_offset_A0); + + typename B2bMma::IteratorB0 iterator_B0( + params.params_B0, + params.ref_B0.data(), + {problem_size_k_0, params.problem_size_0.n()}, + thread_idx, + tb_offset_B0); + + typename B2bMma::IteratorB1 iterator_B1( + params.params_B1, + params.ref_B1.data(), + {problem_size_k_1, params.problem_size_1.n()}, + thread_idx, + tb_offset_B1); + + + // Broadcast the warp_id computed by lane 0 to ensure dependent code + // is compiled as warp-uniform. + int warp_idx = __shfl_sync(0x1f, threadIdx.x / 32, 0); + int lane_idx = threadIdx.x % 32; + + // + // Main loop + // + + OutputOp0 output_op_0(params.output_op_0); + + // Construct thread-scoped matrix multiply + B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx); + + typename B2bMma::FragmentC0 src_accum; + typename B2bMma::FragmentC1 accumulators; + + src_accum.clear(); + accumulators.clear(); + + if (!kSplitKSerial || gemm_k_iterations_0 > 0) { + // Compute threadblock-scoped matrix multiply-add + b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1, src_accum, output_op_0); + } + + // + // Epilogue + // + + OutputOp1 output_op_1(params.output_op_1); + + // + // Masked tile iterators constructed from members + // + + threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + + //assume identity swizzle + MatrixCoord threadblock_offset( + threadblock_tile_offset.m() * B2bMma::Shape1::kM, + threadblock_tile_offset.n() * B2bMma::Shape1::kN + ); + + int block_idx = threadblock_tile_offset.m() + threadblock_tile_offset.n() * params.grid_tiled_shape.m(); + + // Construct the semaphore. + Semaphore semaphore(params.semaphore + block_idx, thread_idx); + + // If performing a reduction via split-K, fetch the initial synchronization + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // Fetch the synchronization lock initially but do not block. + semaphore.fetch(); + + // Indicate which position in a serial reduction the output operator is currently updating + output_op_1.set_k_partition(threadblock_tile_offset.k()); + } + + // Tile iterator loading from source tensor. + typename Epilogue::OutputTileIterator iterator_C1( + params.params_C1, + params.ref_C1.data(), + params.problem_size_1.mn(), + thread_idx, + threadblock_offset + ); + + // Tile iterator writing to destination tensor. + typename Epilogue::OutputTileIterator iterator_D1( + params.params_D1, + params.ref_D1.data(), + params.problem_size_1.mn(), + thread_idx, + threadblock_offset + ); + + Epilogue epilogue( + shared_storage.epilogue, + thread_idx, + warp_idx, + lane_idx); + + // Wait on the semaphore - this latency may have been covered by iterator construction + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + // For subsequent threadblocks, the source matrix is held in the 'D' tensor. + if (threadblock_tile_offset.k()) { + iterator_C1 = iterator_D1; + } + + semaphore.wait(threadblock_tile_offset.k()); + + __threadfence(); + } + + // Execute the epilogue operator to update the destination tensor. 
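+ // output_op_1 is the second GEMM's elementwise output functor (in this example
+ // a LinearCombinationRelu computing D1 = max(alpha1 * accum + beta1 * C1, 0));
+ // the epilogue reads the source tile through iterator_C1 and writes the final
+ // tile through iterator_D1.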
+ epilogue(output_op_1, iterator_D1, accumulators, iterator_C1); + + // + // Release the semaphore + // + + if (kSplitKSerial && params.grid_tiled_shape.k() > 1) { + + int lock = 0; + if (params.grid_tiled_shape.k() == threadblock_tile_offset.k() + 1) { + + // The final threadblock resets the semaphore for subsequent grids. + lock = 0; + } + else { + // Otherwise, the semaphore is incremented + lock = threadblock_tile_offset.k() + 1; + } + + __threadfence(); + semaphore.release(lock); + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + diff --git a/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h b/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h new file mode 100644 index 00000000..45b2d545 --- /dev/null +++ b/examples/13_fused_two_gemms/kernel/default_b2b_gemm.h @@ -0,0 +1,296 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + *modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, + *this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + *notice, this list of conditions and the following disclaimer in the + *documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its + *contributors may be used to endorse or promote products derived from this + *software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + *DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, + *INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + *DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + *OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TOR (INCLUDING + *NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + *EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief + Default kernel-level GEMM definitions combine threadblock-scoped matrix multiply-add with + the appropriate threadblock-scoped epilogue. + + Note, CUTLASS epilogues universally target row-major outputs. Column-major outputs are + accommodated by exchanging A and B operands and assuming transposed layouts. Partial + specializations here choose 'device::GemmTransposed' to implement this functionality. 
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/epilogue/threadblock/epilogue.h" +#include "cutlass/epilogue/thread/linear_combination.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/kernel/gemm_pipelined.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/threadblock/default_mma_core_simt.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h" +#include "cutlass/epilogue/threadblock/default_epilogue_simt.h" + +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +#include "kernel/b2b_gemm.h" +#include "threadblock/default_b2b_mma.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Beta is zero or not + bool IsBetaZero = false +> +struct DefaultB2bGemm; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Turing Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile 
size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator +> +struct DefaultB2bGemm< + ElementA, LayoutA, kAlignmentA, + ElementB, LayoutB, kAlignmentB, + ElementC, layout::RowMajor, + ElementAccumulator, + arch::OpClassTensorOp, + arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + EpilogueOutputOp0, + EpilogueOutputOp1, + ThreadblockSwizzle, + 2, + SplitKSerial, + Operator +> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + layout::RowMajor, + arch::OpClassTensorOp, + arch::Sm75, + ThreadblockShape0, + ThreadblockShape1, + WarpShape0, + WarpShape1, + InstructionShape, + 2, + Operator, + EpilogueOutputOp0 + >::ThreadblockB2bMma; + + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape1, + typename B2bMma::Operator1, + kPartitionsK1, + EpilogueOutputOp1, + EpilogueOutputOp1::kCount + >::Epilogue; + + /// Define the kernel-level GEMM operator. 
+ using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>; +}; + + +/// Partial specialization for Turing IMMA Interleaved layout +template < + /// Element type for A matrix operand + typename ElementA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp0, + /// Epilogue output operator + typename EpilogueOutputOp1, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of Interleaved k + int InterleavedK, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Is Beta zero or not + bool IsBetaZero> +struct DefaultB2bGemm<ElementA, layout::ColumnMajorInterleaved<InterleavedK>, + kAlignmentA, ElementB, + layout::RowMajorInterleaved<InterleavedK>, kAlignmentB, + ElementC, layout::ColumnMajorInterleaved<InterleavedK>, + int32_t, arch::OpClassTensorOp, arch::Sm75, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, EpilogueOutputOp0, EpilogueOutputOp1, + ThreadblockSwizzle, 2, SplitKSerial, Operator, IsBetaZero> { + using LayoutA = layout::ColumnMajorInterleaved<InterleavedK>; + using LayoutB = layout::RowMajorInterleaved<InterleavedK>; + using LayoutC = layout::ColumnMajorInterleaved<InterleavedK>; + + using ElementAccumulator = int32_t; + + /// Define the threadblock-scoped matrix multiply-accumulate + using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, ElementAccumulator, LayoutC, + arch::OpClassTensorOp, arch::Sm75, ThreadblockShape0, ThreadblockShape1, + WarpShape0, WarpShape1, InstructionShape, 2, Operator, EpilogueOutputOp0, true>::ThreadblockB2bMma; + + static const int kPartitionsK1 = ThreadblockShape1::kK / WarpShape1::kK; + + /// Define the epilogue for the 2nd Gemm + using Epilogue = typename cutlass::epilogue::threadblock:: + DefaultInterleavedEpilogueTensorOp< + ThreadblockShape1, typename B2bMma::Operator1, kPartitionsK1, EpilogueOutputOp1, + 64 / sizeof_bits<ElementC>::value, InterleavedK, + IsBetaZero>::Epilogue; + + /// Define the kernel-level GEMM operator. + using B2bGemmKernel = kernel::B2bGemm<B2bMma, Epilogue, ThreadblockSwizzle, SplitKSerial>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass diff --git a/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h b/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h new file mode 100644 index 00000000..01cca8b7 --- /dev/null +++ b/examples/13_fused_two_gemms/threadblock/b2b_mma_base.h @@ -0,0 +1,230 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy1_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class B2bMmaBase { + public: + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape0 = Shape0_; + using Shape1 = Shape1_; + + ///< Policy describing tuning details + using Policy0 = Policy0_; + using Policy1 = Policy1_; + + // + // Dependent types + // + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + using Operator1 = typename Policy1::Operator; + + /// Shape describing the overall GEMM computed from shared memory + /// by each warp. 
+ using WarpGemm0 = typename Policy0::Operator::Shape; + using WarpGemm1 = typename Policy1::Operator::Shape; + + /// Shape describing the number of warps filling the CTA + using WarpCount0 = GemmShape<Shape0::kM / WarpGemm0::kM, Shape0::kN / WarpGemm0::kN, Shape0::kK / WarpGemm0::kK>; + using WarpCount1 = GemmShape<Shape1::kM / WarpGemm1::kM, Shape1::kN / WarpGemm1::kN, Shape1::kK / WarpGemm1::kK>; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations0 = + (WarpGemm0::kK / Operator0::Policy::MmaShape::kK); + static int const kWarpGemmIterations1 = + (WarpGemm1::kK / Operator1::Policy::MmaShape::kK); + + /// Number of stages + static int const kStages = Stages; + + // + // Nested structs + // + + /// Shared storage object needed by threadblock-scoped GEMM + template< + typename Shape_, + typename Policy_ + > + class SharedStorage { + public: + // + // Type definitions + // + using Shape = Shape_; + using Policy = Policy_; + using Operator = typename Policy::Operator; + + /// Tensor reference to the A operand + using TensorRefA = TensorRef<typename Operator::ElementA, typename Operator::LayoutA>; + + /// Tensor reference to the B operand + using TensorRefB = TensorRef<typename Operator::ElementB, typename Operator::LayoutB>; + + + /// Shape of the A matrix operand in shared memory + using ShapeA = MatrixShape<Shape::kM + Policy::SmemPaddingA::kRow, Shape::kK * kStages + Policy::SmemPaddingA::kColumn>; + + /// Shape of the B matrix operand in shared memory + using ShapeB = + MatrixShape<Shape::kK * kStages + Policy::SmemPaddingB::kRow, Shape::kN + Policy::SmemPaddingB::kColumn>; + + public: + // + // Data members + // + + /// Buffer for A operand + AlignedBuffer<typename Operator::ElementA, ShapeA::kCount> operand_A; + + /// Buffer for B operand + AlignedBuffer<typename Operator::ElementB, ShapeB::kCount> operand_B; + + public: + + // + // Methods + // + + /// Returns a layout object for the A matrix + CUTLASS_DEVICE + static typename Operator::LayoutA LayoutA() { + return Operator::LayoutA::packed({ShapeA::kRow, ShapeA::kColumn}); + } + + /// Returns a layout object for the B matrix + CUTLASS_HOST_DEVICE + static typename Operator::LayoutB LayoutB() { + return Operator::LayoutB::packed({ShapeB::kRow, ShapeB::kColumn}); + } + + /// Returns a TensorRef to the A operand + CUTLASS_HOST_DEVICE + TensorRefA operand_A_ref() { + return TensorRefA{operand_A.data(), LayoutA()}; + } + + /// Returns a TensorRef to the B operand + CUTLASS_HOST_DEVICE + TensorRefB operand_B_ref() { + return TensorRefB{operand_B.data(), LayoutB()}; + } + }; + + using SharedStorage0 = SharedStorage<Shape0, Policy0>; + using SharedStorage1 = SharedStorage<Shape1, Policy1>; + union B2bMmaSharedStorage { + SharedStorage0 sharedStorage0; + SharedStorage1 sharedStorage1; + }; + + + protected: + + // + // Data members + // + + /// Iterator to load a warp-scoped tile of A0 operand from shared memory + typename Operator0::IteratorA warp_tile_iterator_A0_; + + /// Iterator to load a warp-scoped tile of B0 operand from shared memory + typename Operator0::IteratorB warp_tile_iterator_B0_; + + /// Iterator to load a warp-scoped tile of B1 operand from shared memory + typename Operator1::IteratorB warp_tile_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bMmaBase( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + B2bMmaSharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + warp_tile_iterator_A0_(shared_storage.sharedStorage0.operand_A_ref(), lane_idx), + warp_tile_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), lane_idx), + warp_tile_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), lane_idx) { + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff
--git a/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h b/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h new file mode 100644 index 00000000..ca89cf0b --- /dev/null +++ b/examples/13_fused_two_gemms/threadblock/b2b_mma_pipelined.h @@ -0,0 +1,509 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped Back-to-back fused GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/aligned_buffer.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////////////////////// +template +struct chk_val { + static_assert(a==0, "check value"); +}; + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math instructions. 
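/*
  Illustrative note (not part of the original patch): B2bMmaPipelined fuses two GEMMs at the threadblock scope. The first GEMM's accumulators never leave registers; they are re-read as the A operand of the second GEMM through FragmentIteratorA1_, so only the B1 operand is staged through shared memory. A rough sketch of the dataflow, and of how the kernel above invokes it:

    gmem A0, B0 --> smem --> warp MMA 0 --> accum0 (registers)
    accum0 --(output_op_0 applied by FragmentIteratorA1_)--> warp fragments A1
    gmem B1 --> smem --> warp MMA 1 (A1, B1) --> accum (FragmentC1)

    B2bMma b2bMma(shared_storage.main_loop, thread_idx, warp_idx, lane_idx);
    b2bMma(gemm_k_iterations_0, accumulators, iterator_A0, iterator_B0, iterator_B1,
           src_accum, output_op_0);
*/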
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape0_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorA0_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA0_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB0_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB0_, + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape1_, + /// Iterates over the intermediate accumulator tile + // (concept::MmaTensorOpFragmentIterator) + typename FragmentIteratorA1_, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | MaskedTileIterator) + typename IteratorB1_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB1_, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Output operator for 1st Gemm(concept: epilogue::thread::LinearCombinationClamp, etc...) + typename OutputOp_, + /// Policy describing tuning details (concept: MmaPipelinedPolicy) + typename Policy0_, + /// Policy describing tuning details (concept: MmaPipelinedPolicy) + typename Policy1_, + /// Transformation applied to A0 operand + typename TransformA0_ = NumericArrayConverter< + typename SmemIteratorA0_::Element, + typename IteratorA0_::Element, + IteratorA0_::Fragment::kElements>, + /// + /// Transformation applied to B0 operand + typename TransformB0_ = NumericArrayConverter< + typename SmemIteratorB0_::Element, + typename IteratorB0_::Element, + IteratorB0_::Fragment::kElements>, + /// + /// Transformation applied to B1 operand + typename TransformB1_ = NumericArrayConverter< + typename SmemIteratorB1_::Element, + typename IteratorB1_::Element, + IteratorB1_::Fragment::kElements>, + /// Used for partial specialization + typename Enable = bool +> +class B2bMmaPipelined : public B2bMmaBase { +public: + + ///< Base class + using Base = B2bMmaBase; + + using Shape0 = Shape0_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using IteratorA0 = IteratorA0_; ///< Iterates over tiles of A operand in global memory + using IteratorB0 = IteratorB0_; ///< Iterates over tiles of B operand in global memory + using Policy0 = Policy0_; ///< Policy describing tuning details + + using SmemIteratorA0 = SmemIteratorA0_; + using SmemIteratorB0 = SmemIteratorB0_; + + using Shape1 = Shape1_; ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using FragmentIteratorA1 = FragmentIteratorA1_; ///< Iterates over intermediate accumulator tile + using IteratorB1 = IteratorB1_; ///< Iterates over tiles of B operand in global memory + using Policy1 = Policy1_; ///< Policy describing tuning details + + using SmemIteratorB1 = SmemIteratorB1_; + + + using ElementC = ElementC_; ///< Data type of accumulator matrix + using LayoutC = LayoutC_; ///< Layout of accumulator matrix + + using OutputOp = OutputOp_; ///< Epilogue after 1st Gemm + + using TransformA0 = TransformA0_; + using TransformB0 = TransformB0_; + using TransformB1 = TransformB1_; + + // + // Dependent 
types + // + + /// Fragment of operand A loaded from global memory + using FragmentA0 = typename IteratorA0::Fragment; + + /// Fragment of operand B loaded from global memory + using FragmentB0 = typename IteratorB0::Fragment; + + /// Fragment of accumulator tile + using FragmentC0 = typename Policy0::Operator::FragmentC; + + /// Warp-level Mma + using Operator0 = typename Policy0::Operator; + + /// Fragment of operand B loaded from global memory + using FragmentB1 = typename IteratorB1::Fragment; + + /// Fragment of accumulator tile + using FragmentC1 = typename Policy1::Operator::FragmentC; + + /// Warp-level Mma + using Operator1 = typename Policy1::Operator; + + /// Obtain the arch tag from the warp-level operator + using ArchTag = typename Policy0::Operator::ArchTag; + + /// Complex transform on A0 operand + static ComplexTransform const kTransformA0 = Operator0::kTransformA; + + /// Complex transform on B0 operand + static ComplexTransform const kTransformB0 = Operator0::kTransformB; + + /// Complex transform on B1 operand + static ComplexTransform const kTransformB1 = Operator1::kTransformB; + + // staticaly assert kStages for MmaPipelined is two (Double-buffered pipeline) + static_assert((Base::kStages==2), "MmaPipelined requires kStages set to value 2"); + +private: + + using WarpFragmentA0 = typename Operator0::FragmentA; + using WarpFragmentB0 = typename Operator0::FragmentB; + /// Warp Fragment of operand A1 loaded from accmulator tile + using WarpFragmentA1 = typename FragmentIteratorA1::Fragment; + using WarpFragmentB1 = typename Operator1::FragmentB; + +protected: + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA0 smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B0 operand to shared memory + SmemIteratorB0 smem_iterator_B0_; + + /// Iterator to write threadblock-scoped tile of B1 operand to shared memory + SmemIteratorB1 smem_iterator_B1_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + B2bMmaPipelined( + typename Base::B2bMmaSharedStorage &shared_storage, ///< Shared storage needed for internal use by threadblock-scoped GEMM + int thread_idx, ///< ID within the threadblock + int warp_idx, ///< ID of warp + int lane_idx ///< ID of each thread within a warp + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.sharedStorage0.operand_A_ref(), thread_idx), + smem_iterator_B0_(shared_storage.sharedStorage0.operand_B_ref(), thread_idx), + smem_iterator_B1_(shared_storage.sharedStorage1.operand_B_ref(), thread_idx) { + + + // Compute warp location within threadblock tile by mapping the warp_id to three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + //These should stay the same across different GEMM layers + int warp_idx_mn = warp_idx % (Base::WarpCount0::kM * Base::WarpCount0::kN); + int warp_idx_k = warp_idx / (Base::WarpCount0::kM * Base::WarpCount0::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount0::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount0::kM; + + //These may change across different GEMM layers + int tile_offset_k_0 = Base::kWarpGemmIterations0 * warp_idx_k; + int tile_offset_k_1 = Base::kWarpGemmIterations1 * warp_idx_k; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A0_.add_tile_offset({warp_idx_m, 
tile_offset_k_0}); + this->warp_tile_iterator_B0_.add_tile_offset({tile_offset_k_0, warp_idx_n}); + this->warp_tile_iterator_B1_.add_tile_offset({tile_offset_k_1, warp_idx_n}); + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + int gemm_k_iterations_0, ///< number of iterations of the mainloop + FragmentC1 &accum, ///< destination accumulator tile + IteratorA0 iterator_A, ///< iterator over A operand in global memory + IteratorB0 iterator_B0, ///< iterator over B0 operand in global memory + IteratorB1 iterator_B1, ///< iterator over B1 operand in global memory + FragmentC0 const &src_accum, ///< source accumualtor tile + OutputOp output_op_0, ///< epilogue operation after 1st Gemm + TransformA0 transform_A0 = TransformA0(), ///< transformation applied to A0 fragment + TransformB0 transform_B0 = TransformB0(), ///< transformation applied to B0 fragment + TransformB1 transform_B1 = TransformB1()) { ///< transformation applied to B1 fragment + + // + // Prologue + // + + // Perform accumulation in the 'd' output operand + FragmentC0 accum0 = src_accum; + + FragmentA0 tb_frag_A; + FragmentB0 tb_frag_B0; + + tb_frag_A.clear(); + tb_frag_B0.clear(); + + // The last kblock is loaded in the prolog + iterator_A.load(tb_frag_A); + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + + this->smem_iterator_A_.store(tb_frag_A); + this->smem_iterator_B0_.store(tb_frag_B0); + + ++this->smem_iterator_A_; + ++this->smem_iterator_B0_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA0 warp_frag_A0[2]; + WarpFragmentB0 warp_frag_B0[2]; + + this->warp_tile_iterator_A0_.set_kgroup_index(0); + this->warp_tile_iterator_B0_.set_kgroup_index(0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[0]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[0]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + Operator0 warp_mma0; + + int smem_write_stage_idx = 1; + + // Avoid reading out of bounds + if (gemm_k_iterations_0 <= 1) { + iterator_A.clear_mask(); + iterator_B0.clear_mask(); + } + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). + iterator_A.load(tb_frag_A); + + // + // Mainloop + // + + // Note: The main loop does not support Base::WarpGemmIterations == 2. + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations_0 > 0; --gemm_k_iterations_0) { + + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations0; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. + + if (warp_mma_k == Base::kWarpGemmIterations0 - 1) { + + // Write fragments to shared memory + this->smem_iterator_A_.store(tb_frag_A); + + this->smem_iterator_B0_.store(tb_frag_B0); + + __syncthreads(); + + // Issue loads during the first warp-level matrix multiply-add *AFTER* issuing + // shared memory loads (which have the tighest latency requirement). 
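      /*
        Illustrative note (not part of the original patch): this mainloop is the classic two-stage (double-buffered) CUTLASS pipeline. In steady state, each warp_mma_k step overlaps, roughly:

          - warp_mma0 on the register fragments loaded during step warp_mma_k - 1
            (register buffers warp_frag_A0/B0[warp_mma_k % 2]),
          - ld.shared of the fragments for step warp_mma_k + 1 into the other register buffer,
          - on warp_mma_k == 0, the global loads of the next threadblock tile (tb_frag_A / tb_frag_B0),
            which are written to the alternate shared-memory stage on the last k-group;
            smem_write_stage_idx toggles between the two stages.
      */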
+ iterator_A.load(tb_frag_A); + + ++this->smem_iterator_B0_; + ++this->smem_iterator_A_; + + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B0_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_A0_.add_tile_offset( + {0, -Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0}); + this->warp_tile_iterator_B0_.add_tile_offset( + {-Base::kStages * Policy0::kPartitionsK * Base::kWarpGemmIterations0, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_A0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + this->warp_tile_iterator_B0_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations0); + + this->warp_tile_iterator_A0_.load(warp_frag_A0[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B0_.load(warp_frag_B0[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A0_; + ++this->warp_tile_iterator_B0_; + + if (warp_mma_k == 0) { + + iterator_B0.load(tb_frag_B0); + + ++iterator_A; + ++iterator_B0; + + // Avoid reading out of bounds if this was the last loop iteration + if (gemm_k_iterations_0 <= 2) { + iterator_A.clear_mask(); + iterator_B0.clear_mask(); + } + } + + warp_mma0(accum0, warp_frag_A0[warp_mma_k % 2], warp_frag_B0[warp_mma_k % 2], accum0); + } + } + + //2nd Gemm + + /// Iterator to load a warp-scoped tile of A1 operand from intermediate accumulator tile + FragmentIteratorA1 warp_tile_iterator_A1_(accum0); + + // + // Prologue + // + + FragmentB1 tb_frag_B1; + + tb_frag_B1.clear(); + + // The last kblock is loaded in the prolog + iterator_B1.load(tb_frag_B1); + + ++iterator_B1; + + this->smem_iterator_B1_.store(tb_frag_B1); + + ++this->smem_iterator_B1_; + + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math instructions + WarpFragmentA1 warp_frag_A1[2]; + WarpFragmentB1 warp_frag_B1[2]; + + //warp_tile_iterator_A1_.set_kgroup_index(0); + this->warp_tile_iterator_B1_.set_kgroup_index(0); + + warp_tile_iterator_A1_.load(warp_frag_A1[0], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[0]); + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + Operator1 warp_mma1; + + smem_write_stage_idx = 1; + + int gemm_k_iterations_1 = FragmentIteratorA1::Policy::kIterations / Base::kWarpGemmIterations1; + + // Avoid reading out of bounds + if (gemm_k_iterations_1 <= 1) { + iterator_B1.clear_mask(); + } + + // + // Mainloop + // + + // Note: The main loop does not support Base::WarpGemmIterations == 2. + CUTLASS_PRAGMA_UNROLL + for (; gemm_k_iterations_1 > 0; --gemm_k_iterations_1) { + + // + // Loop over GEMM K dimension + // + + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations1; ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if this is the last group + // as the case may be. 
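      /*
        Illustrative note (not part of the original patch): the second GEMM needs no global or shared-memory traffic for its A operand; its K extent is the N extent of the accumulator tile produced by the first GEMM, which is why gemm_k_iterations_1 above is derived from FragmentIteratorA1::Policy::kIterations rather than from a problem size. As a purely hypothetical example, with kIterations == 8 and Base::kWarpGemmIterations1 == 2, the surrounding mainloop would run 4 outer iterations of 2 warp-level k-groups each, all fed from register-resident accumulators.
      */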
+ + if (warp_mma_k == Base::kWarpGemmIterations1 - 1) { + + // Write fragments to shared memory + + this->smem_iterator_B1_.store(tb_frag_B1); + + __syncthreads(); + ++smem_iterator_B1_; + + // Add negative offsets to return iterators to the 'start' of the circular buffer in shared memory + if (smem_write_stage_idx == 1) { + smem_iterator_B1_.add_tile_offset({-Base::kStages, 0}); + } + else { + this->warp_tile_iterator_B1_.add_tile_offset( + {-Base::kStages * Policy1::kPartitionsK * + Base::kWarpGemmIterations1, + 0}); + } + + smem_write_stage_idx ^= 1; + } + + this->warp_tile_iterator_B1_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations1); + + warp_tile_iterator_A1_.load(warp_frag_A1[(warp_mma_k + 1) % 2], output_op_0); + this->warp_tile_iterator_B1_.load(warp_frag_B1[(warp_mma_k + 1) % 2]); + + + ++warp_tile_iterator_A1_; + ++this->warp_tile_iterator_B1_; + + if (warp_mma_k == 0) { + + iterator_B1.load(tb_frag_B1); + ++iterator_B1; + + + // Avoid reading out of bounds if this was the last loop iteration + if (gemm_k_iterations_1 <= 2) { + iterator_B1.clear_mask(); + } + } + + warp_mma1(accum, warp_frag_A1[warp_mma_k % 2], warp_frag_B1[warp_mma_k % 2], accum); + } + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h new file mode 100644 index 00000000..cd1403c7 --- /dev/null +++ b/examples/13_fused_two_gemms/threadblock/default_b2b_mma.h @@ -0,0 +1,289 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a pipelined GEMM kernel. Does not compute batching or support split-K. 
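    (Illustrative note, not part of the original patch.) The DefaultB2bMma traits defined below stitch two DefaultMmaCore configurations, the global-memory tile iterators, and the accumulator fragment iterator into a single B2bMmaPipelined threadblock mainloop. A minimal sketch of how a kernel-level component consumes it, with every shape and type name taken as a placeholder:

      using B2bMma = typename cutlass::gemm::threadblock::DefaultB2bMma<
        ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB,
        ElementAccumulator, layout::RowMajor,
        arch::OpClassTensorOp, arch::Sm75,
        ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1,
        InstructionShape, 2, Operator, EpilogueOutputOp0>::ThreadblockB2bMma;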
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" + +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h" + +#include "threadblock/b2b_mma_pipelined.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false> +struct DefaultB2bMma; + +//////////////////////////////////////////////////////////////////////////////// +/// Specialization for row-major output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp> +struct DefaultB2bMma<ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, OperatorClass, ArchTag, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, 2, Operator, EpilogueOutputOp, false> { + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, + OperatorClass, 2, Operator>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, + OperatorClass, 2, Operator>; + + // Define iterators over tiles from the A operand + using IteratorA0 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape<ThreadblockShape0::kM, ThreadblockShape0::kK>, + ElementA, LayoutA, 1, typename MmaCore0::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB0 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape<ThreadblockShape0::kK, ThreadblockShape0::kN>, + ElementB, LayoutB, 0, typename MmaCore0::IteratorThreadMapB, kAlignmentB>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::ColumnMajor; + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape<MmaCore1::WarpShape::kM, MmaCore1::InstructionShape::kK>, //warp shape + cutlass::MatrixShape<MmaCore0::WarpShape::kM, MmaCore0::WarpShape::kN>, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, InstructionShape, EpilogueOutputOp, true>; + + // Define iterators over tiles from the B operand + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape<ThreadblockShape1::kK, ThreadblockShape1::kN>, + ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + IteratorB0, typename MmaCore0::SmemIteratorB, + typename MmaCore1::Shape, FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, + ElementAccumulator, layout::RowMajor, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>; + +}; +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column-major-interleaved output
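/*
  Illustrative note (not part of the original patch): the specialization below targets integer tensor-op GEMMs whose output is packed as layout::ColumnMajorInterleaved<InterleavedK> (with the A and B operands typically interleaved as well, as in the Turing IMMA path of default_b2b_gemm.h). Its static_asserts pin the iterator alignment to one 128-bit access per thread; for example, assuming int8_t operands, 128 / sizeof_bits<int8_t>::value == 128 / 8 == 16, so kAlignmentA and kAlignmentB would both have to be 16 elements.
*/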
+template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape0, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape1, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape0, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape1, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Operation performed by GEMM + typename Operator, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Number of Interleaved K + int InterleavedK> +struct DefaultB2bMma, OperatorClass, ArchTag, + ThreadblockShape0, ThreadblockShape1, WarpShape0, WarpShape1, + InstructionShape, 2, Operator, EpilogueOutputOp, true> { + // Define the MmaCore components + using MmaCore0 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape0, WarpShape0, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, 2, Operator, + true>; + using MmaCore1 = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape1, WarpShape1, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, 2, Operator, + true>; + + static_assert(kAlignmentA == 128 / sizeof_bits::value, + "Alignment must match thread data map's vector length"); + + static_assert(kAlignmentB ==128 / sizeof_bits::value, + "Alignment must match thread data map's vector length"); + + // Define iterators over tiles from the A operand + using IteratorA0 = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementA, + LayoutA, 1, typename MmaCore0::IteratorThreadMapA>; + + // Define iterators over tiles from the B operand + using IteratorB0 = cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, ElementB, + LayoutB, 0, typename MmaCore0::IteratorThreadMapB>; + + // Use fragment iterator for A operand + using AccumulatorLayout = cutlass::layout::RowMajor; //AccumulatorsInRowMajor = true + using FragmentIteratorA1 = + cutlass::gemm::warp::MmaTensorOpFragmentIterator< + cutlass::MatrixShape, //warp shape + cutlass::MatrixShape, //accumulator shape + MmaCore1::Shape::kK, //kBlocksColumn + ElementAccumulator, ElementA, AccumulatorLayout, + InstructionShape, EpilogueOutputOp, true /*only handle beta=0 for 1st Gemm epilogue*/>; + + // Define iterators over tiles from the B operand + using IteratorB1 = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, typename MmaCore1::IteratorThreadMapB>; + + + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockB2bMma = cutlass::gemm::threadblock::B2bMmaPipelined< + typename MmaCore0::Shape, IteratorA0, typename MmaCore0::SmemIteratorA, + IteratorB0, typename MmaCore0::SmemIteratorB, + typename MmaCore1::Shape, 
FragmentIteratorA1, + IteratorB1, typename MmaCore1::SmemIteratorB, + ElementAccumulator, layout::ColumnMajorInterleaved, + EpilogueOutputOp, + typename MmaCore0::MmaPolicy, typename MmaCore1::MmaPolicy>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d5c503e9..3da7ae45 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -60,6 +60,8 @@ foreach(EXAMPLE 08_turing_tensorop_gemm 10_planar_complex 11_planar_complex_array + 12_gemm_bias_relu + 13_fused_two_gemms ) add_subdirectory(${EXAMPLE}) diff --git a/include/cutlass/aligned_buffer.h b/include/cutlass/aligned_buffer.h index 3232ef87..8b3bb071 100644 --- a/include/cutlass/aligned_buffer.h +++ b/include/cutlass/aligned_buffer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/arch.h b/include/cutlass/arch/arch.h index b38a347a..faf01cc6 100644 --- a/include/cutlass/arch/arch.h +++ b/include/cutlass/arch/arch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,6 +52,10 @@ struct Sm72 { struct Sm75 { static int const kMinComputeCapability = 75; }; +struct Sm80 { + static int const kMinComputeCapability = 80; +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace arch diff --git a/include/cutlass/arch/cache_operation.h b/include/cutlass/arch/cache_operation.h new file mode 100644 index 00000000..646b51de --- /dev/null +++ b/include/cutlass/arch/cache_operation.h @@ -0,0 +1,60 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Directives related to cache operations +*/ +#pragma once + +#include "cutlass/cutlass.h" + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Controls PTX cache operations +struct CacheOperation { + enum Kind { + /// Cache at all levels - accessed again + Always, + /// Cache at global level + Global, + /// Streaming - likely to be accessed once + Streaming, + /// Indicates the line will not be used again + LastUse, + /// Don't cache, and fetch again + Volatile, + /// Write back at all coherent levels + WriteBack, + /// Write through to system memory + WriteThrough + }; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index fc939053..48ef02cd 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -28,13 +28,271 @@ #pragma once +#include "cutlass/cutlass.h" + namespace cutlass { namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes + > +struct global_load; ///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %11;\n" + " mov.b32 %2, %12;\n" + " mov.b32 %3, %13;\n" + " mov.b32 %4, %14;\n" + " mov.b32 %5, %15;\n" + " mov.b32 %6, %16;\n" + " mov.b32 %7, %17;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%18];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w), + "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y), + "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16)); + } +}; + + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %7;\n" + " mov.b32 %2, %8;\n" + " mov.b32 %3, %9;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint2 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %5;\n" + " @p ld.global.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + unsigned &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), "r"(data)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint16_t &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b16 %0, %3;\n" + " @p ld.global.u16 %0, [%1];\n" + "}\n" + : "=h"(data) + : "l"(ptr), "r"((int)pred_guard), "h"(data)); + } +}; + +template +struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + if (pred_guard) D = 
*(reinterpret_cast(ptr)); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes + > +struct global_store; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint4 const *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), "l"(((uint8_t *)ptr) + 16), + "r"(data[1].x), "r"(data[1].y), "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint4 const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint2 const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint32_t const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + uint16_t const &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + CUTLASS_DEVICE + global_store(AccessType const &D, void *ptr, bool pred_guard) { + if (pred_guard) *(reinterpret_cast(ptr)) = D; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace arch } // namespace cutlass @@ -42,4 +300,6 @@ namespace arch { ///////////////////////////////////////////////////////////////////////////////////////////////// #include "memory_sm75.h" +#include "memory_sm80.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/memory_sm75.h b/include/cutlass/arch/memory_sm75.h index 195f8abf..3fd121b9 100644 --- a/include/cutlass/arch/memory_sm75.h +++ b/include/cutlass/arch/memory_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -50,20 +50,20 @@ inline __device__ void ldsm(Array & D, void const* ptr); // ///////////////////////////////////////////////////////////////////////////////////////////////// -#if ! defined(CUDA_LDMATRIX_SUPPORTED) - #define CUDA_LDMATRIX_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 2)) +#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || (__CUDACC_VER_MAJOR__ >= 11) + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) +#define CUDA_LDMATRIX_ACTIVATED 1 #endif -#if ! defined(CUDA_LDMATRIX_ENABLED) - #define CUDA_LDMATRIX_ENABLED CUDA_LDMATRIX_SUPPORTED -#endif - -#if CUDA_LDMATRIX_ENABLED && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) - #define CUDA_LDMATRIX_ACTIVATED 1 +#define CUDA_LDMATRIX_SUPPORTED 1 #endif ///////////////////////////////////////////////////////////////////////////////////////////////// - +/* +#if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) && (__CUDACC_VER_MAJOR__ > 10) + #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED 1 +#endif #if ! defined(CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED) #define CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED ((__CUDACC_VER_MAJOR__ == 10) && (__CUDACC_VER_MINOR__ >= 1)) #endif @@ -71,8 +71,9 @@ inline __device__ void ldsm(Array & D, void const* ptr); #if ! defined(CUDA_NVVM_GET_SMEM_POINTER_ENABLED) #define CUDA_NVVM_GET_SMEM_POINTER_ENABLED CUDA_NVVM_GET_SMEM_POINTER_SUPPORTED #endif +*/ -#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED +#if (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) extern "C" { // // This NVVM intrinsic is subject to change in future versions of CUDA. @@ -85,19 +86,49 @@ inline __device__ void ldsm(Array & D, void const* ptr); ///////////////////////////////////////////////////////////////////////////////////////////////// -#if CUDA_NVVM_GET_SMEM_POINTER_ENABLED +/// CUTLASS helper to get SMEM pointer +inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { + +// We prefer to use the new CVTA intrinsics if they are available, otherwise we will fall back to +// the previous internal intrinsics if they are available. +#if (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ >= 11) + // + // This NVVM intrinsic converts an address in shared memory to a plain + // unsigned integer. This is necessary to pass to shared memory instructions + // in inline PTX. + // + // In CUDA 11 and beyond, this replaces __nvvm_get_smem_pointer() [only available in 10.2]. 
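  //
  // A minimal usage sketch (the buffer and variable names below are illustrative placeholders,
  // not part of this header): the returned 32-bit value is the shared-memory window address
  // expected by PTX instructions such as ldmatrix, st.shared, and cp.async.
  //
  //   __shared__ int buffer[32];
  //   unsigned smem_addr = cutlass::arch::cutlass_get_smem_pointer(&buffer[threadIdx.x]);
  //   asm volatile("st.shared.u32 [%0], %1;\n" : : "r"(smem_addr), "r"(threadIdx.x));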
+ // + //__device__ size_t __cvta_generic_to_shared(void* ptr); /// CUTLASS helper to get SMEM pointer - inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { - return __nvvm_get_smem_pointer(const_cast(ptr)); - } + return static_cast(__cvta_generic_to_shared(ptr)); - /// CUTLASS helper to get SMEM pointer - inline __device__ unsigned cutlass_get_smem_pointer(void *ptr) { - return __nvvm_get_smem_pointer(ptr); - } +#elif (defined(__CUDA_ARCH__) && __CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) + return __nvvm_get_smem_pointer(ptr); + +#elif defined(__CUDA_ARCH__) + + uint32_t smem_ptr; + + asm( + "{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n" + : "=r"(smem_ptr) : "l"(ptr)); + + return smem_ptr; + +#else + + return 0; #endif +} + +/// CUTLASS helper to get SMEM pointer +inline __device__ unsigned cutlass_get_smem_pointer(void const *ptr) { + return cutlass_get_smem_pointer(const_cast(ptr)); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// template <> @@ -235,5 +266,6 @@ inline __device__ void ldsm( } ///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace arch } // namespace cutlass diff --git a/include/cutlass/arch/memory_sm80.h b/include/cutlass/arch/memory_sm80.h new file mode 100644 index 00000000..04c56876 --- /dev/null +++ b/include/cutlass/arch/memory_sm80.h @@ -0,0 +1,238 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Architecture-specific operators on memory added for SM80 +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/cache_operation.h" + +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + #define CUDA_CP_ASYNC_ACTIVATED 1 +#else + #define CUDA_CP_ASYNC_ACTIVATED 0 +#endif + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Initiates an asynchronous copy from global memory to shared memory. +/// +/// LDGSTS +/// +template < + /// Size of the access in bytes + int SizeInBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always> +struct cp_async; + +/// Initiates an asynchronous copy from global memory to shared memory. Rather than predicate +/// the entire transfer, zeros are written to SMEM if the guard predicate is false. +/// +/// LDGSTS +/// +template < + /// Size of the access in bytes + int SizeInBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always> +struct cp_async_zfill; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async { + /// Copy + CUTLASS_DEVICE + cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.ca.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + #endif + } +}; + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async_zfill { + /// Copy with zero fill + CUTLASS_DEVICE + cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard) { + #if CUDA_CP_ASYNC_ACTIVATED + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + int src_in_bytes = (pred_guard ? 
SizeInBytes : 0); + + asm volatile( + "cp.async.ca.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr), + "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + else { + AccessType zeros; + zeros.clear(); + *static_cast(smem_ptr) = zeros; + } + #endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async { + /// Copy + CUTLASS_DEVICE + cp_async(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + static_assert(SizeInBytes == 16, + "cp.async only supports CacheOperation::Global when access size is 16B."); + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %0, 0;\n" + " @p cp.async.cg.shared.global [%1], [%2], %3;\n" + "}\n" ::"r"((int)pred_guard), + "r"(smem_int_ptr), "l"(global_ptr), "n"(SizeInBytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + #endif + } +}; + +/// Partial specialization +template < + /// Size of the access in bytes + int SizeInBytes> +struct cp_async_zfill { + /// Copy with zero fill + CUTLASS_DEVICE + cp_async_zfill(void *smem_ptr, void const *global_ptr, bool pred_guard = true) { + #if CUDA_CP_ASYNC_ACTIVATED + + static_assert(SizeInBytes == 16, + "cp.async only supports CacheOperation::Global when access size is 16B."); + + unsigned smem_int_ptr = cutlass_get_smem_pointer(smem_ptr); + int src_in_bytes = (pred_guard ? SizeInBytes : 0); + + asm volatile( + "cp.async.cg.shared.global [%0], [%1], %2, %3;\n" ::"r"(smem_int_ptr), + "l"(global_ptr), "n"(SizeInBytes), "r"(src_in_bytes)); + + #else + using AccessType = Array; + + if (pred_guard) { + *static_cast(smem_ptr) = *static_cast(global_ptr); + } + else { + AccessType zeros; + zeros.clear(); + *static_cast(smem_ptr) = zeros; + } + #endif + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Establishes an ordering w.r.t previously issued cp.async instructions. Does not block. +CUTLASS_DEVICE +void cp_async_fence() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.commit_group;\n" ::); + #endif +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Blocks until all but previous cp.async.commit_group operations have committed. +template +CUTLASS_DEVICE void cp_async_wait() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.wait_group %0;\n" ::"n"(N)); + #endif +} + +/// Blocks until all previous cp.async.commit_group operations have committed. 
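/*
  A minimal pipelining sketch, assuming a hypothetical multi-stage shared-memory buffer:
  `kStages`, `stage_smem`, `stage_gmem`, and `guard` below are placeholders, and the 16B access
  size matches the CacheOperation::Global constraint asserted above.

    // Issue asynchronous 16B global->shared copies for the current stage.
    cutlass::arch::cp_async<16, cutlass::arch::CacheOperation::Global>(stage_smem, stage_gmem, guard);

    // Close the group of cp.async operations issued for this stage.
    cutlass::arch::cp_async_fence();

    // Before consuming a stage, allow at most (kStages - 2) groups to remain in flight.
    cutlass::arch::cp_async_wait<kStages - 2>();
    __syncthreads();
*/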
+template <> +CUTLASS_DEVICE void cp_async_wait<0>() { + #if CUDA_CP_ASYNC_ACTIVATED + asm volatile("cp.async.wait_all;\n" ::); + #endif +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/arch/mma.h b/include/cutlass/arch/mma.h index e59b710f..74c24695 100644 --- a/include/cutlass/arch/mma.h +++ b/include/cutlass/arch/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -51,11 +51,26 @@ struct OpMultiplyAddSaturate; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Tag indicating the input is converted to a narrower type (BF16) +struct OpMultiplyAddFastBF16; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tag indicating the input is converted to a narrower type (F16) +struct OpMultiplyAddFastF16; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Tag indicating the complex multiply-add operation struct OpMultiplyAddComplex; ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Tag indicating the gaussian complex multiply-add operation +struct OpMultiplyAddGaussianComplex; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Tag indicating the inner product is defined by (XOR, POPC) struct OpXorPopc; diff --git a/include/cutlass/arch/mma_sm50.h b/include/cutlass/arch/mma_sm50.h index 8698a8b3..fce521dc 100644 --- a/include/cutlass/arch/mma_sm50.h +++ b/include/cutlass/arch/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm60.h b/include/cutlass/arch/mma_sm60.h index 6e513ced..ab0481ae 100644 --- a/include/cutlass/arch/mma_sm60.h +++ b/include/cutlass/arch/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm61.h b/include/cutlass/arch/mma_sm61.h index 68a1b145..9ec8857e 100644 --- a/include/cutlass/arch/mma_sm61.h +++ b/include/cutlass/arch/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm70.h b/include/cutlass/arch/mma_sm70.h index 57b50e00..b03ce2c1 100644 --- a/include/cutlass/arch/mma_sm70.h +++ b/include/cutlass/arch/mma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm75.h b/include/cutlass/arch/mma_sm75.h index fb8a3dc5..ef65f20b 100644 --- a/include/cutlass/arch/mma_sm75.h +++ b/include/cutlass/arch/mma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/mma_sm80.h b/include/cutlass/arch/mma_sm80.h new file mode 100644 index 00000000..445ec388 --- /dev/null +++ b/include/cutlass/arch/mma_sm80.h @@ -0,0 +1,2091 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Matrix multiply +*/ + +#pragma once + +#if defined(__CUDACC_RTC__) +#include +#else +#include +#endif + +#include "mma.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/numeric_types.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if ((__CUDACC_VER_MAJOR__ > 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 0)) + +#define CUTLASS_ARCH_MMA_SM80_SUPPORTED 1 + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) +#define CUTLASS_ARCH_MMA_SM80_ENABLED +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace arch { + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float BF16, FP32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation - F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 8>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm( + "mma.sync.aligned.m16n8k8.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1684 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 4>, + 32, + tfloat32_t, + layout::RowMajor, + tfloat32_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 4>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k4.row.col.f32.tf32.tf32.f32 {%0,%1,%2,%3}, {%4,%5}, {%6}, {%7,%8,%9,%10};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), 
"=f"(D[3]) + : + "r"(A[0]), "r"(A[1]), + "r"(B[0]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 1688 - Float TF32 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = tf32 * tf32 + F32 +template <> +struct Mma, 32, tfloat32_t, layout::RowMajor, + tfloat32_t, layout::ColumnMajor, float, layout::RowMajor, + OpMultiplyAdd> { + using Shape = gemm::GemmShape<16, 8, 8>; + + using ElementA = tfloat32_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = tfloat32_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + float const *C = reinterpret_cast(&c); + float *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F16 = F16 * F16 + F16 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + half_t, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = half_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m16n8k16.row.col.f16.f16.f16.f16 {%0,%1}, {%2,%3,%4,%5}, {%6,%7}, {%8,%9};\n" + : "=r"(D[0]), "=r"(D[1]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), + "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]) + ); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = bf16 * bf16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + bfloat16_t, + layout::RowMajor, + bfloat16_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = bfloat16_t; + using LayoutA = layout::RowMajor; + using 
FragmentA = Array; + + using ElementB = bfloat16_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.bf16.bf16.f32 " + "{%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F32 = F16 * F16 + F32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 16>, + 32, + half_t, + layout::RowMajor, + half_t, + layout::ColumnMajor, + float, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 16>; + + using ElementA = half_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = half_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = float; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + uint32_t const *C = reinterpret_cast(&c); + uint32_t *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, {%8,%9}, " + "{%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 884 - F64 +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: F64 = F64 * F64 + F64 +template <> +struct Mma< + gemm::GemmShape<8,8,4>, + 32, + double, + layout::RowMajor, + double, + layout::ColumnMajor, + double, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<8,8,4>; + + using ElementA = double; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = double; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm80; + + CUTLASS_HOST_DEVICE + void operator()(FragmentC &d, FragmentA const &a, FragmentB const &b, + FragmentC const &c) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint64_t const & A = reinterpret_cast(a); + uint64_t const & B = reinterpret_cast(b); + + uint64_t const *C = 
reinterpret_cast(&c); + uint64_t *D = reinterpret_cast(&d); + + asm volatile("mma.sync.aligned.m8n8k4.row.col.f64.f64.f64.f64 {%0,%1}, {%2}, {%3}, {%4,%5};\n" + : "=l"(D[0]), "=l"(D[1]) + : "l"(A), "l"(B), "l"(C[0]), "l"(C[1])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 - S8 input, S32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " + "{%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " + "{%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using 
LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " + "{%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5}, {%6}, " + "{%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + + +#else + assert(0); +#endif + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16816 - S8 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), 
"r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,16>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,16>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = 
reinterpret_cast(&a); + uint32_t const &B = reinterpret_cast(b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k16.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, {%4,%5}, " + "{%6}, {%7,%8,%9,%10};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(B), "r"(C[0]), "r"(C[1]), "r"(C[2]), + "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16832 - S8 input, S32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> 
{ + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16832 - S8 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + int8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const * A = reinterpret_cast(&a); 
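    // Per-thread register view of the fragments: A (16 x int8) spans four 32-bit registers,
    // B (8 x int8) spans two, and the int32 accumulators C/D span four registers each.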
+ uint32_t const * B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * S8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + uint8_t, + layout::RowMajor, + int8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = int8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.s8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + int8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = int8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.s8.u8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U8 * U8 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,32>, + 32, + uint8_t, + layout::RowMajor, + uint8_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16,8,32>; + + using ElementA = uint8_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + 
using ElementB = uint8_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k32.row.col.s32.u8.u8.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16864 - S4 input, S32 accumulation +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const 
*C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32 {%0,%1,%2,%3}, {%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 16864 - S4 input, S32 accumulation - SATURATE +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = S4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + 
cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const * A = reinterpret_cast(&a); + uint32_t const * B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * S4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::int4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::int4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.s4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = S4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::int4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::int4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + 
uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.s4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = U4 * U4 + S32 +template <> +struct Mma< + gemm::GemmShape<16, 8, 64>, + 32, + cutlass::uint4b_t, + layout::RowMajor, + cutlass::uint4b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAddSaturate> { + + using Shape = gemm::GemmShape<16, 8, 64>; + + using ElementA = cutlass::uint4b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint4b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAddSaturate; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k64.row.col.s32.u4.u4.s32.satfinite {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, {%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +/// Matrix multiply-add operation: S32 = B1 & B1 + S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpMultiplyAdd> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int32_t; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpMultiplyAdd; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.and.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// +// Matrix Multiply 168256 - B1 input, S32 accumulation - XOR,POPC +// +//////////////////////////////////////////////////////////////////////////////// + +/// Matrix multiply-add operation: S32 = B1 & B1 
+ S32 +template <> +struct Mma< + gemm::GemmShape<16,8,256>, + 32, + cutlass::uint1b_t, + layout::RowMajor, + cutlass::uint1b_t, + layout::ColumnMajor, + int, + layout::RowMajor, + OpXorPopc> { + + using Shape = gemm::GemmShape<16,8,256>; + + using ElementA = cutlass::uint1b_t; + using LayoutA = layout::RowMajor; + using FragmentA = Array; + + using ElementB = cutlass::uint1b_t; + using LayoutB = layout::ColumnMajor; + using FragmentB = Array; + + using ElementC = int; + using LayoutC = layout::RowMajor; + using FragmentC = Array; + + using Operator = OpXorPopc; + using ArchTag = arch::Sm80; + + /// Computes multiply-add + CUTLASS_HOST_DEVICE + void operator()( + FragmentC &d, + FragmentA const &a, + FragmentB const &b, + FragmentC const &c + ) const { + +#if defined(CUTLASS_ARCH_MMA_SM80_ENABLED) + + uint32_t const *A = reinterpret_cast(&a); + uint32_t const *B = reinterpret_cast(&b); + + int const *C = reinterpret_cast(&c); + int *D = reinterpret_cast(&d); + + asm volatile( + "mma.sync.aligned.m16n8k256.row.col.s32.b1.b1.s32.xor.popc {%0,%1,%2,%3}, " + "{%4,%5,%6,%7}, " + "{%8,%9}, {%10,%11,%12,%13};\n" + : "=r"(D[0]), "=r"(D[1]), "=r"(D[2]), "=r"(D[3]) + : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), + "r"(C[0]), "r"(C[1]), "r"(C[2]), "r"(C[3])); + +#else + assert(0); +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace arch +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/arch/simd.h b/include/cutlass/arch/simd.h index 75b38001..4520acc9 100644 --- a/include/cutlass/arch/simd.h +++ b/include/cutlass/arch/simd.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm60.h b/include/cutlass/arch/simd_sm60.h index cd0babd5..36030a36 100644 --- a/include/cutlass/arch/simd_sm60.h +++ b/include/cutlass/arch/simd_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/simd_sm61.h b/include/cutlass/arch/simd_sm61.h index e8d5c889..94f1c617 100644 --- a/include/cutlass/arch/simd_sm61.h +++ b/include/cutlass/arch/simd_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma.h b/include/cutlass/arch/wmma.h index 9843e134..88968abd 100644 --- a/include/cutlass/arch/wmma.h +++ b/include/cutlass/arch/wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm70.h b/include/cutlass/arch/wmma_sm70.h index 6c989c9a..94eeb93d 100644 --- a/include/cutlass/arch/wmma_sm70.h +++ b/include/cutlass/arch/wmma_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm72.h b/include/cutlass/arch/wmma_sm72.h index 477a72c3..1b8cc116 100644 --- a/include/cutlass/arch/wmma_sm72.h +++ b/include/cutlass/arch/wmma_sm72.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/arch/wmma_sm75.h b/include/cutlass/arch/wmma_sm75.h index 2985be58..f630712f 100644 --- a/include/cutlass/arch/wmma_sm75.h +++ b/include/cutlass/arch/wmma_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -120,8 +120,7 @@ struct Wmma< //////////////////////////////////////////////////////////////////////////////// // // WMMA template structure defines nvcuda::wmma::fragments and static assert for -// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1) -// (nvcuda::wmma targetting SASS instruction BMMA) +// wmma native instruction sizes supported for cutlass::uint1b_t (experimental::b1). // //////////////////////////////////////////////////////////////////////////////// template < diff --git a/include/cutlass/array.h b/include/cutlass/array.h index be14a879..0018b76f 100644 --- a/include/cutlass/array.h +++ b/include/cutlass/array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
 * * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
@@ -167,7 +167,7 @@ public:
   class const_iterator {
 
     /// Pointer to object
-    T *ptr_;
+    const T *ptr_;
 
   public:
 
diff --git a/include/cutlass/array_subbyte.h b/include/cutlass/array_subbyte.h
index b340c890..78081fac 100644
--- a/include/cutlass/array_subbyte.h
+++ b/include/cutlass/array_subbyte.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/include/cutlass/bfloat16.h b/include/cutlass/bfloat16.h
new file mode 100644
index 00000000..c3bd1782
--- /dev/null
+++ b/include/cutlass/bfloat16.h
@@ -0,0 +1,461 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*!
+  \file
+  \brief Defines a proxy class for storing non-standard 16-bit floating point values with
+         8 bits of exponent and 7 bits of mantissa.
+*/
+#pragma once
+
+#if !defined(__CUDACC_RTC__)
+#include <cmath>
+#include <limits>
+#include <cstdint>
+#endif
+
+#include "cutlass/cutlass.h"
+
+namespace cutlass {
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Floating-point type with 8 bits of exponent and 7 bits of mantissa.
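+///
+/// The value is stored as the upper 16 bits of an IEEE-754 binary32 encoding: 1 sign bit,
+/// 8 exponent bits, and 7 mantissa bits. Illustrative conversions (both values are exactly
+/// representable, so no rounding occurs):
+///
+///   float  1.0f  (bits 0x3f800000)  ->  bfloat16_t raw bits 0x3f80
+///   float -2.5f  (bits 0xc0200000)  ->  bfloat16_t raw bits 0xc020
+///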
+struct alignas(2) bfloat16_t {
+
+  //
+  // Data members
+  //
+
+  /// Storage type
+  uint16_t storage;
+
+  //
+  // Methods
+  //
+
+  /// Constructs from an unsigned short
+  CUTLASS_HOST_DEVICE
+  static bfloat16_t bitcast(uint16_t x) {
+    bfloat16_t h;
+    h.storage = x;
+    return h;
+  }
+
+  /// Default constructor
+  CUTLASS_HOST_DEVICE
+  bfloat16_t() { }
+
+  /// Floating-point conversion - round toward nearest
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(float x) {
+
+    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDACC_VER_MAJOR__ >= 11)
+
+    asm("cvt.rn.bf16.f32 %0, %1;\n" : "=h"(storage) : "f"(x));
+
+    #else
+    uint32_t bits = reinterpret_cast<uint32_t const &>(x);
+
+    if ((bits & 0x7f800000) != 0x7f800000) {
+
+      bool mantissa_bit = ((bits & (1 << 16)) != 0);
+      bool round_bit = ((bits & (1 << 15)) != 0);
+      bool sticky_bit = ((bits & ((1 << 15) - 1)) != 0);
+
+      if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) {
+        bits += uint32_t(1 << 16);
+      }
+    }
+    else if (bits & ~0xff800000) {
+      bits = 0x7fffffff;
+    }
+
+    storage = uint16_t((bits >> 16) & 0xffff);
+    #endif
+  }
+
+  /// Floating-point conversion - round toward nearest
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(double x): bfloat16_t(float(x)) {
+
+  }
+
+  /// Integer conversion - round toward nearest
+  CUTLASS_HOST_DEVICE
+  explicit bfloat16_t(int x) {
+    float flt = static_cast<float>(x);
+    storage = uint16_t(reinterpret_cast<uint32_t const &>(flt) >> 16);
+  }
+
+  /// Converts to float
+  CUTLASS_HOST_DEVICE
+  operator float() const {
+    unsigned bits = (unsigned(storage) << 16);
+    return reinterpret_cast<float const &>(bits);
+  }
+
+  /// Converts to double
+  CUTLASS_HOST_DEVICE
+  operator double() const {
+    return double(float(*this));
+  }
+
+  /// Converts to int
+  CUTLASS_HOST_DEVICE
+  explicit operator int() const {
+    return int(float(*this));
+  }
+
+  /// Casts to bool
+  CUTLASS_HOST_DEVICE
+  operator bool() const {
+    return (float(*this) != 0.0f);
+  }
+
+  /// Obtains raw bits
+  CUTLASS_HOST_DEVICE
+  uint16_t raw() const {
+    return storage;
+  }
+
+  /// Returns the sign bit
+  CUTLASS_HOST_DEVICE
+  bool signbit() const {
+    return ((raw() & 0x8000) != 0);
+  }
+
+  /// Returns the biased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent_biased() const {
+    return int((raw() >> 7) & 0x0ff);
+  }
+
+  /// Returns the unbiased exponent
+  CUTLASS_HOST_DEVICE
+  int exponent() const {
+    return exponent_biased() - 127;
+  }
+
+  /// Returns the mantissa
+  CUTLASS_HOST_DEVICE
+  int mantissa() const {
+    return int(raw() & 0x7f);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_HOST_DEVICE
+bool signbit(cutlass::bfloat16_t const& h) {
+  return h.signbit();
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::bfloat16_t abs(cutlass::bfloat16_t const& h) {
+  return cutlass::bfloat16_t::bitcast(h.raw() & 0x7fffffff);
+}
+
+CUTLASS_HOST_DEVICE
+bool isnan(cutlass::bfloat16_t const& h) {
+  return (h.exponent_biased() == 0x0ff) && h.mantissa();
+}
+
+CUTLASS_HOST_DEVICE
+bool isfinite(cutlass::bfloat16_t const& h) {
+  return (h.exponent_biased() != 0x0ff);
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::bfloat16_t nan_bf16(const char*) {
+  // NVIDIA canonical NaN
+  return cutlass::bfloat16_t::bitcast(0x7fff);
+}
+
+CUTLASS_HOST_DEVICE
+bool isinf(cutlass::bfloat16_t const& h) {
+  return (h.exponent_biased() == 0x0ff) && !h.mantissa();
+}
+
+CUTLASS_HOST_DEVICE
+bool isnormal(cutlass::bfloat16_t const& h) {
+  return h.exponent_biased() && h.exponent_biased() != 0x0ff;
+}
+
+CUTLASS_HOST_DEVICE
+int fpclassify(cutlass::bfloat16_t const& h) {
+  int exp = h.exponent_biased();
+  int mantissa = h.mantissa();
+  if (exp == 0x0ff) {
+    if (mantissa) {
+      return FP_NAN;
+    }
+    else {
+      return FP_INFINITE;
+    }
+  }
+  else if (!exp) {
+    if (mantissa) {
+      return FP_SUBNORMAL;
+    }
+    else {
+      return FP_ZERO;
+    }
+  }
+  return FP_NORMAL;
+}
+
+CUTLASS_HOST_DEVICE
+cutlass::bfloat16_t sqrt(cutlass::bfloat16_t const& h) {
+#if defined(__CUDACC_RTC__)
+  return cutlass::bfloat16_t(sqrtf(float(h)));
+#else
+  return cutlass::bfloat16_t(std::sqrt(float(h)));
+#endif
+}
+
+CUTLASS_HOST_DEVICE
+bfloat16_t copysign(bfloat16_t const& a, bfloat16_t const& b) {
+
+  uint16_t a_mag = (reinterpret_cast<uint16_t const &>(a) & 0x7fff);
+  uint16_t b_sign = (reinterpret_cast<uint16_t const &>(b) & 0x8000);
+  uint16_t result = (a_mag | b_sign);
+
+  return reinterpret_cast<bfloat16_t const &>(result);
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace cutlass
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Standard Library operations and definitions
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace std {
+
+#if !defined(__CUDACC_RTC__)
+/// Numeric limits
+template <>
+struct numeric_limits<cutlass::bfloat16_t> {
+  static bool const is_specialized = true;
+  static bool const is_signed = true;
+  static bool const is_integer = false;
+  static bool const is_exact = false;
+  static bool const has_infinity = true;
+  static bool const has_quiet_NaN = true;
+  static bool const has_signaling_NaN = false;
+  static std::float_denorm_style const has_denorm = std::denorm_present;
+  static bool const has_denorm_loss = true;
+  static std::float_round_style const round_style = std::round_to_nearest;
+  static bool const is_iec559 = false;
+  static bool const is_bounded = true;
+  static bool const is_modulo = false;
+  static int const digits = 7;
+
+  /// Least positive value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t min() { return cutlass::bfloat16_t::bitcast(0x01); }
+
+  /// Minimum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t lowest() { return cutlass::bfloat16_t::bitcast(0xff7f); }
+
+  /// Maximum finite value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t max() { return cutlass::bfloat16_t::bitcast(0x7f7f); }
+
+  /// Returns the machine epsilon
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t epsilon() { return cutlass::bfloat16_t::bitcast(0x1000); }
+
+  /// Returns the maximum rounding error
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t round_error() { return cutlass::bfloat16_t(0.5f); }
+
+  /// Returns positive infinity
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t infinity() { return cutlass::bfloat16_t::bitcast(0x7f80); }
+
+  /// Returns a quiet NaN
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t quiet_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns a signaling NaN
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t signaling_NaN() { return cutlass::bfloat16_t::bitcast(0x7fff); }
+
+  /// Returns the smallest positive subnormal value
+  CUTLASS_HOST_DEVICE
+  static cutlass::bfloat16_t denorm_min() { return cutlass::bfloat16_t::bitcast(0x1); }
+};
+#endif
+
+} // namespace std
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+//
+// Arithmetic operators
+//
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+
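+// The arithmetic operators below widen their operands to float, compute in float, and round
+// each result back to bfloat16_t, so every intermediate result of a compound expression is
+// rounded to bfloat16 precision; comparisons are likewise performed in float. A small usage
+// sketch (illustrative only):
+//
+//   cutlass::bfloat16_t a(1.5f);
+//   cutlass::bfloat16_t b(2.25f);
+//   cutlass::bfloat16_t c = a * b + a;   // each sub-expression rounds through bfloat16_t
+//   float f = float(c);                  // widen back to float explicitly
+//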
+/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool operator==(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) == float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator!=(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) != float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) < float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<=(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) <= float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) > float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>=(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return float(lhs) >= float(rhs); +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator+(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return bfloat16_t(float(lhs) + float(rhs)); +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator-(bfloat16_t const& lhs) { + return bfloat16_t(-float(lhs)); +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator-(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return bfloat16_t(float(lhs) - float(rhs)); +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator*(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return bfloat16_t(float(lhs) * float(rhs)); +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator/(bfloat16_t const& lhs, bfloat16_t const& rhs) { + return bfloat16_t(float(lhs) / float(rhs)); +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator+=(bfloat16_t & lhs, bfloat16_t const& rhs) { + lhs = bfloat16_t(float(lhs) + float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator-=(bfloat16_t & lhs, bfloat16_t const& rhs) { + lhs = bfloat16_t(float(lhs) - float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator*=(bfloat16_t & lhs, bfloat16_t const& rhs) { + lhs = bfloat16_t(float(lhs) * float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator/=(bfloat16_t & lhs, bfloat16_t const& rhs) { + lhs = bfloat16_t(float(lhs) / float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator++(bfloat16_t & lhs) { + float tmp(lhs); + ++tmp; + lhs = bfloat16_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t& operator--(bfloat16_t & lhs) { + float tmp(lhs); + --tmp; + lhs = bfloat16_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator++(bfloat16_t & lhs, int) { + bfloat16_t ret(lhs); + float tmp(lhs); + tmp++; + lhs = bfloat16_t(tmp); + return ret; +} + +CUTLASS_HOST_DEVICE +bfloat16_t operator--(bfloat16_t & lhs, int) { + bfloat16_t ret(lhs); + float tmp(lhs); + tmp--; + lhs = bfloat16_t(tmp); + return ret; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// User-defined literals +// + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t operator "" _bf16(long double x) { + return cutlass::bfloat16_t(float(x)); +} + +CUTLASS_HOST_DEVICE +cutlass::bfloat16_t operator "" _bf16(unsigned long long int x) { + return cutlass::bfloat16_t(int(x)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/complex.h b/include/cutlass/complex.h index 20c4a64a..6f7d73bb 100644 --- a/include/cutlass/complex.h +++ b/include/cutlass/complex.h @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -35,6 +35,9 @@ #include "cutlass/half.h" #include "cutlass/real.h" +#include "cutlass/bfloat16.h" +#include "cutlass/tfloat32.h" + #if !defined(__CUDACC_RTC__) #include #endif @@ -370,6 +373,15 @@ template CUTLASS_HOST_DEVICE complex conj(complex const &z) { return complex(real(z), -imag(z)); } +/// Indentity transform for non-complex types +template +CUTLASS_HOST_DEVICE T conj(T const &z) { + static_assert( !std::is_same::value && + !std::is_same::value && + !std::is_same>::value && + !std::is_same>::value, "May not be a complex data type"); + return z; +} /// Projects the complex number z onto the Riemann sphere template @@ -429,6 +441,7 @@ template struct RealType< complex > { using Type = T; +CUTLASS_HOST_DEVICE static complex from_real(double x) { return complex(static_cast(x)); } diff --git a/include/cutlass/coord.h b/include/cutlass/coord.h index e2615755..82613c24 100644 --- a/include/cutlass/coord.h +++ b/include/cutlass/coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -360,6 +360,29 @@ public: namespace cutlass { + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Index s, Coord coord) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + +/// Scalar multiplication +template +CUTLASS_HOST_DEVICE +Coord operator*(Coord coord, Index s) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Rank; ++i) { + coord[i] *= s; + } + return coord; +} + /// Scalar division template CUTLASS_HOST_DEVICE @@ -419,3 +442,4 @@ Coord<4> make_Coord(int _0, int _1, int _2, int _3) { //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass + diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index d9dc7890..a87ecfa7 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,9 +33,14 @@ #include "cutlass/coord.h" #include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/gemm/gemm.h" namespace cutlass { +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass namespace // /////////////////////////////////////////////////////////////////////////////////////////////////// template @@ -47,8 +52,6 @@ std::ostream& operator<<(std::ostream& out, Coord const& coord) { return out; } -/////////////////////////////////////////////////////////////////////////////////////////////////// - inline std::istream & operator>>(std::istream &stream, half_t &x) { float tmp; @@ -62,6 +65,16 @@ std::ostream & operator<<(std::ostream &out, half_t const &x) { return out << float(x); } +inline +std::ostream & operator<<(std::ostream &out, bfloat16_t const &x) { + return out << float(x); +} + +inline +std::ostream & operator<<(std::ostream &out, tfloat32_t const &x) { + return out << float(x); +} + /////////////////////////////////////////////////////////////////////////////////////////////////// /// Helper to enable formatted printing of CUTLASS scalar types to an ostream @@ -98,7 +111,54 @@ inline std::ostream &operator<<(std::ostream &out, ScalarIO const &scal return out << unsigned(scalar.value); } + +/// Default printing to ostream for MatrixShape +template +inline +std::ostream & operator<<(std::ostream &out, cutlass::MatrixShape const &matrix_shape) { + out << "cutlass::MatrixShape::(kRow, kColumn) {" + << cutlass::MatrixShape::kRow <<"," + << cutlass::MatrixShape::kColumn <<"}"; + return out; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::gemm namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace gemm { + +/// Default printing to ostream for GemmShape +template +inline +std::ostream & operator<<(std::ostream &out, cutlass::gemm::GemmShape const &gemm_shape) { + out << "cutlass::GemmShape::(kM, kN, kK) {" + << cutlass::gemm::GemmShape::kM <<"," + << cutlass::gemm::GemmShape::kN <<"," + << cutlass::gemm::GemmShape::kK << "}"; + return out; +} + +} //namespace gemm +/////////////////////////////////////////////////////////////////////////////////////////////////// + + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// stream operators for cutlass::layout namespace // +/////////////////////////////////////////////////////////////////////////////////////////////////// +namespace layout { + +/// Default printing to ostream for PitchLinearShape +template < int Contiguous, int Strided> +inline +std::ostream & operator<<(std::ostream &out, cutlass::layout::PitchLinearShape const &pitch_linear_shape) { + out << "cutlass::layout::PitchLinearShape::(kContiguous, kStrided) {" + << cutlass::layout::PitchLinearShape::kContiguous <<"," + << cutlass::layout::PitchLinearShape::kStrided <<"}"; + return out; +} + +} //namespace layout /////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass - +/////////////////////////////////////////////////////////////////////////////////////////////////// diff --git 
a/include/cutlass/cutlass.h b/include/cutlass/cutlass.h index b5a0e5f4..860dc3e5 100644 --- a/include/cutlass/cutlass.h +++ b/include/cutlass/cutlass.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -55,6 +55,8 @@ enum class Status { kErrorNotSupported, ///< Operation is not supported on current device. kErrorWorkspaceNull, ///< The given workspace is null when it is required to be non-null. kErrorInternal, ///< An error within CUTLASS occurred. + kErrorArchMismatch, ///< CUTLASS runs on a device that it was not compiled for. + kErrorInsufficientDriver, ///< CUTLASS runs with a driver that is too old. kInvalid ///< Status is unspecified. }; @@ -78,6 +80,10 @@ static char const* cutlassGetStatusString(cutlass::Status status) { return "Error Workspace Null"; case cutlass::Status::kErrorInternal: return "Error Internal"; + case cutlass::Status::kErrorInsufficientDriver: + return "Error Insufficient Driver"; + case cutlass::Status::kErrorArchMismatch: + return "Erroor Architecture Mismatch"; case cutlass::Status::kInvalid: break; } diff --git a/include/cutlass/device_kernel.h b/include/cutlass/device_kernel.h index 4a992bb3..f5166ab1 100644 --- a/include/cutlass/device_kernel.h +++ b/include/cutlass/device_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h new file mode 100644 index 00000000..c0f42146 --- /dev/null +++ b/include/cutlass/epilogue/thread/activation.h @@ -0,0 +1,119 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+  \brief This extends the contents of cutlass/functional.h with frequently used activation functions.
+
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+#include "cutlass/numeric_types.h"
+
+#include "cutlass/complex.h"
+
+#include "cutlass/array.h"
+#include "cutlass/half.h"
+#include "cutlass/functional.h"
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// ReLu operator - propagates NaNs
+template <typename T>
+struct ReLu {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const & threshold, T value) const {
+    if (value < threshold) {
+      value = threshold;
+    }
+    return value;
+  }
+};
+
+template <typename T, int N>
+struct ReLu<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(T const & threshold, Array<T, N> const &frag) const {
+    Array<T, N> result;
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < N; ++i) {
+      T value = frag[i];
+      if (value < threshold) {
+        value = threshold;
+      }
+      result[i] = value;
+    }
+    return result;
+  }
+};
+
+// Sigmoid operator
+template <typename T>
+struct Sigmoid {
+  CUTLASS_HOST_DEVICE
+  T operator()(T const &scalar) const {
+    return T(1) / (T(1) + exp(-scalar));
+  }
+};
+
+template <>
+struct Sigmoid<float> {
+  CUTLASS_HOST_DEVICE
+  float operator()(float const &scalar) const {
+    return 1.0f / (1.0f + expf(-scalar));
+  }
+};
+
+template <typename T, int N>
+struct Sigmoid<Array<T, N>> {
+  CUTLASS_HOST_DEVICE
+  Array<T, N> operator()(Array<T, N> const &rhs) const {
+    Array<T, N> y;
+    Sigmoid<T> sigmoid_op;
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < int(rhs.size()); ++i) {
+      y[i] = sigmoid_op(rhs[i]);
+    }
+
+    return y;
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace thread
+} // namespace epilogue
+} // namespace cutlass
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
diff --git a/include/cutlass/epilogue/thread/conversion_op.h b/include/cutlass/epilogue/thread/conversion_op.h
index 32b885bc..ad17d414 100644
--- a/include/cutlass/epilogue/thread/conversion_op.h
+++ b/include/cutlass/epilogue/thread/conversion_op.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -101,7 +101,7 @@ public: CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, + FragmentOutput const &source = FragmentOutput(), ElementCompute uniform = ElementCompute(0)) const { // Convert to destination numeric type diff --git a/include/cutlass/epilogue/thread/linear_combination.h b/include/cutlass/epilogue/thread/linear_combination.h index dd8236b3..8b5f6ead 100644 --- a/include/cutlass/epilogue/thread/linear_combination.h +++ b/include/cutlass/epilogue/thread/linear_combination.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -165,6 +165,28 @@ public: return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + multiplies mul_accumulator; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_clamp.h b/include/cutlass/epilogue/thread/linear_combination_clamp.h index 9fe4b2b3..25611bd3 100644 --- a/include/cutlass/epilogue/thread/linear_combination_clamp.h +++ b/include/cutlass/epilogue/thread/linear_combination_clamp.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -178,6 +178,40 @@ public: return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_accumulator; + + minimum min_accumulator; + maximum max_accumulator; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + /// Clamping constant value + ElementCompute const kClamp = + ElementCompute((1U << (sizeof_bits::value - 1)) - 1); + + intermediate = max_accumulator(intermediate, -kClamp - ElementCompute(1)); + intermediate = min_accumulator(intermediate, kClamp); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -278,7 +312,7 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( @@ -316,6 +350,37 @@ public: return destination_converter(scaled_accumulator); } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Compute linear scaling in floating point + ComputeFragment intermediate; + + multiplies mul_add_accumulator; + + // Float min-max + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = static_cast(intermediate[i]); + } + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(scaled_accumulator); + } }; #endif // Conditional guards to enable partial specialization for packed integers @@ -410,7 +475,7 @@ class FastLinearCombinationClamp { beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()(FragmentAccumulator const &accumulator, @@ -453,6 +518,41 @@ class FastLinearCombinationClamp { return destination_converter(intermediate); } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()(FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + FastNumericArrayConverter + accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Compute linear scaling in floating point + ComputeFragment intermediate; + + multiplies mul_accumulator; + + minimum min_accumulator; + maximum max_accumulator; + + // Float min-max + intermediate = mul_accumulator(alpha_, converted_accumulator); + + /// Clamping constant value + 
ElementCompute const kClamp = + ElementCompute(1 << (sizeof_bits::value - 1)); + + intermediate = max_accumulator(intermediate, -kClamp); + intermediate = min_accumulator(intermediate, kClamp - ElementCompute(1)); + + // Convert to destination numeric type + FastNumericArrayConverter + destination_converter; + + return destination_converter(intermediate); + } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h index bfe6be78..3934af10 100644 --- a/include/cutlass/epilogue/thread/linear_combination_planar_complex.h +++ b/include/cutlass/epilogue/thread/linear_combination_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -185,6 +185,39 @@ public: destination_converter(intermediate.real), destination_converter(intermediate.imag)); } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator( + accumulator_converter(accumulator.real), + accumulator_converter(accumulator.imag)); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies > mul_op; + multiply_add > mul_add_op; + + // complex multiply-add: I = alpha * AB + I + intermediate.real = mul_add_op(alpha_.real(), converted_accumulator.real); + intermediate.imag = mul_add_op(alpha_.real(), converted_accumulator.imag); + + intermediate.real = mul_add_op(-alpha_.imag(), converted_accumulator.imag, intermediate.real); + intermediate.imag = mul_add_op( alpha_.imag(), converted_accumulator.real, intermediate.imag); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return FragmentOutput( + destination_converter(intermediate.real), + destination_converter(intermediate.imag)); + } }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/thread/linear_combination_relu.h b/include/cutlass/epilogue/thread/linear_combination_relu.h index 9afeb3eb..7a2fa9e8 100644 --- a/include/cutlass/epilogue/thread/linear_combination_relu.h +++ b/include/cutlass/epilogue/thread/linear_combination_relu.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -23,8 +23,7 @@ * **************************************************************************************************/ /*! \file - \brief Functor performing linear combination operations used by epilogues. Values are clamped before - converting to the output element type. 
+ \brief Functor performing linear combination with a maximum operation used by epilogues. */ #pragma once @@ -34,6 +33,7 @@ #include "cutlass/array.h" #include "cutlass/functional.h" #include "cutlass/numeric_conversion.h" +#include "cutlass/epilogue/thread/activation.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -43,8 +43,7 @@ namespace thread { ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Applies a linear combination operator to an array of elements then clamps the output before -/// converting to the output element type. +/// Applies a linear combination operator to an array of elements. /// /// D = alpha * accumulator + beta * source + uniform /// @@ -75,10 +74,10 @@ public: ElementCompute alpha; ///< scales accumulators ElementCompute beta; ///< scales source tensor - ElementCompute threshold; ///< Relu threshold + ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - + ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -87,16 +86,17 @@ public: Params(): alpha(ElementCompute(1)), beta(ElementCompute(0)), - threshold(ElementCompute(0)), + threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr) { } + beta_ptr(nullptr), + threshold_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, - ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { } @@ -104,8 +104,8 @@ public: Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute threshold = ElementCompute(0) - ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + ElementCompute const *threshold_ptr = nullptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { } }; @@ -128,7 +128,7 @@ public: alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = params.threshold; + threshold_ = (params.threshold_ptr ? 
*params.threshold_ptr : params.threshold); } /// Returns true if source is needed @@ -144,13 +144,12 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, - ElementCompute uniform = ElementCompute(0)) const { + FragmentOutput const &source) const { // Convert source to interal compute numeric type NumericArrayConverter source_converter; @@ -160,18 +159,44 @@ public: ComputeFragment converted_accumulator = accumulator_converter(accumulator); // Perform binary operations - ComputeFragment intermediate; multiplies mul_add_source; multiply_add mul_add_accumulator; - - maximum max_accumulator; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - intermediate = max_accumulator(intermediate, threshold_); + + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_accumulator; + ReLu relu; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Compute threshold optionally + intermediate = relu(threshold_, intermediate); // Convert to destination numeric type NumericArrayConverter destination_converter; @@ -180,24 +205,24 @@ public: } }; + ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers -#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 720) && ((__CUDACC_VER_MAJOR__ > 10) || ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) -/// Applies a linear combination operator to an array of elements then clamps the output before -/// converting to the output element type. +/// Applies a linear combination operator to an array of elements. 
/// /// D = alpha * accumulator + beta * source + uniform /// +/// Special handling for int types + template < typename ElementOutput_, ///< Data type used to load and store tensors int Count, ///< Number of elements computed per operation FloatRoundStyle Round > -class LinearCombinationRelu { +class LinearCombinationRelu { public: using ElementOutput = ElementOutput_; @@ -217,10 +242,10 @@ public: ElementCompute alpha; ///< scales accumulators ElementCompute beta; ///< scales source tensor - ElementCompute threshold; ///< Relu threshold + ElementCompute threshold; ///< minimum value that is output ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory - + ElementCompute const *threshold_ptr; ///< pointer to threshold scalar - if not null, loads from memory // // Methods // @@ -229,16 +254,17 @@ public: Params(): alpha(ElementCompute(1)), beta(ElementCompute(0)), - threshold(ElementCompute(0)), + threshold(ElementCompute(0)), alpha_ptr(nullptr), - beta_ptr(nullptr) { } + beta_ptr(nullptr), + threshold_ptr(nullptr) { } CUTLASS_HOST_DEVICE Params( ElementCompute alpha, ElementCompute beta, - ElementCompute threshold = ElementCompute(0) - ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr) { + ElementCompute threshold = ElementCompute(0) + ): alpha(alpha), beta(beta), threshold(threshold), alpha_ptr(nullptr), beta_ptr(nullptr), threshold_ptr(nullptr) { } @@ -246,8 +272,8 @@ public: Params( ElementCompute const *alpha_ptr, ElementCompute const *beta_ptr, - ElementCompute threshold = ElementCompute(0) - ): alpha(0), beta(0), threshold(threshold), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + ElementCompute const *threshold_ptr = nullptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr), threshold_ptr(threshold_ptr) { } }; @@ -270,7 +296,7 @@ public: alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); beta_ = (params.beta_ptr ? *params.beta_ptr : params.beta); - threshold_ = params.threshold; + threshold_ = (params.threshold_ptr ? 
*params.threshold_ptr : params.threshold); } /// Returns true if source is needed @@ -286,13 +312,12 @@ public: beta_ = ElementCompute(1); } } - + /// Computes linear scaling: D = alpha * accumulator + beta * source CUTLASS_HOST_DEVICE FragmentOutput operator()( FragmentAccumulator const &accumulator, - FragmentOutput const &source, - ElementCompute uniform = ElementCompute(0)) const { + FragmentOutput const &source) const { // Convert source to interal compute numeric type NumericArrayConverter source_converter; @@ -302,21 +327,16 @@ public: ComputeFragment converted_accumulator = accumulator_converter(accumulator); // Perform binary operations - ComputeFragment intermediate; multiplies mul_add_source; multiply_add mul_add_accumulator; - - maximum max_accumulator; + ReLu relu; intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X - - // Clamp to theshold - intermediate = max_accumulator(intermediate, threshold_); - // Convert back to accumulator data type + // Convert floats back to INT FragmentAccumulator scaled_accumulator; CUTLASS_PRAGMA_UNROLL @@ -324,8 +344,46 @@ public: scaled_accumulator[i] = static_cast(intermediate[i]); } - // Convert to destination numeric type and pack - NumericArrayConverter destination_converter; + // Compute threshold optionally + scaled_accumulator = relu(threshold_, scaled_accumulator); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(scaled_accumulator); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + ComputeFragment intermediate; + + multiplies mul_accumulator; + ReLu relu; + + intermediate = mul_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + // Convert floats back to INT + FragmentAccumulator scaled_accumulator; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kCount; ++i) { + scaled_accumulator[i] = static_cast(intermediate[i]); + } + + // Compute threshold optionally + scaled_accumulator = relu(threshold_, scaled_accumulator); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; return destination_converter(scaled_accumulator); } @@ -338,3 +396,6 @@ public: } // namespace thread } // namespace epilogue } // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/epilogue/thread/linear_combination_sigmoid.h b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h new file mode 100644 index 00000000..3a65c49a --- /dev/null +++ b/include/cutlass/epilogue/thread/linear_combination_sigmoid.h @@ -0,0 +1,206 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Functor performing linear combination operations used by epilogues. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/functional.h" +#include "cutlass/numeric_conversion.h" + +#include "cutlass/epilogue/thread/activation.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace thread { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Applies a linear combination operator to an array of elements. 
+/// +/// D = alpha * accumulator + beta * source + uniform +/// +template < + typename ElementOutput_, ///< Data type used to load and store tensors + int Count, ///< Number of elements computed per operation + typename ElementAccumulator_ = ElementOutput_, ///< Accumulator data type + typename ElementCompute_ = ElementOutput_, ///< Data type used to compute linear combination + FloatRoundStyle Round = FloatRoundStyle::round_to_nearest +> +class LinearCombinationSigmoid { +public: + + using ElementOutput = ElementOutput_; + using ElementAccumulator = ElementAccumulator_; + using ElementCompute = ElementCompute_; + + static int const kCount = Count; + + using FragmentOutput = Array; + using FragmentAccumulator = Array; + using ComputeFragment = Array; + + static FloatRoundStyle const kRound = Round; + + /// Host-constructable parameters structure + struct Params { + + ElementCompute alpha; ///< scales accumulators + ElementCompute beta; ///< scales source tensor + ElementCompute const *alpha_ptr; ///< pointer to accumulator scalar - if not null, loads it from memory + ElementCompute const *beta_ptr; ///< pointer to source scalar - if not null, loads it from memory + + // + // Methods + // + + CUTLASS_HOST_DEVICE + Params(): + alpha(ElementCompute(1)), + beta(ElementCompute(0)), + alpha_ptr(nullptr), + beta_ptr(nullptr) { } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute alpha, + ElementCompute beta + ): alpha(alpha), beta(beta), alpha_ptr(nullptr), beta_ptr(nullptr) { + + } + + CUTLASS_HOST_DEVICE + Params( + ElementCompute const *alpha_ptr, + ElementCompute const *beta_ptr + ): alpha(0), beta(0), alpha_ptr(alpha_ptr), beta_ptr(beta_ptr) { + + } + }; + +private: + + // + // Data members + // + + ElementCompute alpha_; + ElementCompute beta_; + +public: + + /// Constructs the function object, possibly loading from pointers in host memory + CUTLASS_HOST_DEVICE + LinearCombinationSigmoid(Params const ¶ms) { + + alpha_ = (params.alpha_ptr ? *params.alpha_ptr : params.alpha); + beta_ = (params.beta_ptr ? 
*params.beta_ptr : params.beta); + } + + /// Returns true if source is needed + CUTLASS_HOST_DEVICE + bool is_source_needed() const { + return beta_ != ElementCompute(0); + } + + /// Functionally required for serial reduction in the epilogue + CUTLASS_HOST_DEVICE + void set_k_partition(int k_partition) { + if (k_partition) { + beta_ = ElementCompute(1); + } + } + + /// Computes linear scaling: D = alpha * accumulator + beta * source + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator, + FragmentOutput const &source) const { + + // Convert source to interal compute numeric type + NumericArrayConverter source_converter; + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_source = source_converter(source); + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_source; + multiply_add mul_add_accumulator; + Sigmoid sigmoid; + + intermediate = mul_add_source(beta_, converted_source); // X = beta * C + uniform + intermediate = mul_add_accumulator(alpha_, converted_accumulator, intermediate); // D = alpha * Accum + X + + intermediate = sigmoid(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } + + /// Computes linear scaling: D = alpha * accumulator + CUTLASS_HOST_DEVICE + FragmentOutput operator()( + FragmentAccumulator const &accumulator) const { + + // Convert source to interal compute numeric type + NumericArrayConverter accumulator_converter; + + ComputeFragment converted_accumulator = accumulator_converter(accumulator); + + // Perform binary operations + + ComputeFragment intermediate; + + multiplies mul_add_accumulator; + Sigmoid sigmoid; + + intermediate = mul_add_accumulator(alpha_, converted_accumulator); // D = alpha * Accum + + intermediate = sigmoid(intermediate); + + // Convert to destination numeric type + NumericArrayConverter destination_converter; + + return destination_converter(intermediate); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace thread +} // namespace epilogue +} // namespace cutlass diff --git a/include/cutlass/epilogue/thread/reduction_op.h b/include/cutlass/epilogue/thread/reduction_op.h index b33332e9..0331f0fa 100644 --- a/include/cutlass/epilogue/thread/reduction_op.h +++ b/include/cutlass/epilogue/thread/reduction_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h index c3c40bab..67fccf05 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,6 +45,7 @@ #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" #include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h" #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" @@ -76,6 +77,7 @@ template < /// Elements accessed by inner-most loop of AccumulatorFragmentIterator::load() int ElementsPerAccess, /// Multiply-add operator + /// Selects between (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) typename Operator_ = arch::OpMultiplyAddComplex> struct DefaultEpilogueComplexTensorOp { @@ -146,6 +148,91 @@ struct DefaultEpilogueComplexTensorOp { >; }; +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization and defines sensible defaults for epilogues for complex*complex case +// 3 real-valued mma operations (Gaussian Complex) +// A = (ar + j ai), B = (br +j bi), D = AB +// P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) +// D = dr + j di = (P1 - P3) + j (P1 + P2) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + typename Shape_, + typename WarpMmaTensorOp_, + int PartitionsK, + typename OutputOp_, + int ElementsPerAccess +> +struct DefaultEpilogueComplexTensorOp { + + using Shape = Shape_; + using WarpMmaTensorOp = WarpMmaTensorOp_; + static int const kPartitionsK = PartitionsK; + using OutputOp = OutputOp_; + static int const kElementsPerAccess = ElementsPerAccess; + using Operator = arch::OpMultiplyAddGaussianComplex; + + using ElementOutput = typename OutputOp::ElementOutput; + using LayoutC = typename WarpMmaTensorOp::LayoutC; + using ElementAccumulator = typename WarpMmaTensorOp::ElementC; + + // + // Thread map + // + + using OutputTileThreadMap = typename cutlass::epilogue::threadblock::DefaultThreadMapTensorOp< + Shape, + typename WarpMmaTensorOp::Shape, + kPartitionsK, + ElementOutput, + kElementsPerAccess + >::Type; + + using OutputTileIterator = cutlass::epilogue::threadblock::PredicatedTileIterator< + OutputTileThreadMap, + ElementOutput + >; + + using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorGaussianComplexTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC + >; + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + ElementAccumulator, + LayoutC + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + typename OutputTileThreadMap::CompactedThreadMap, + ElementAccumulator + >; + + /// Hard-coded padding elements added + using Padding = cutlass::MatrixShape<0, 0>; + + // + // Define the epilogue + // + using Epilogue = cutlass::epilogue::threadblock::Epilogue< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputTileIterator, + AccumulatorFragmentIterator, + WarpTileIterator, + SharedLoadIterator, + OutputOp, + Padding + >; +}; + 
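
Editor's note (not part of the patch): the specialization above selects arch::OpMultiplyAddGaussianComplex, whose comment gives the three real multiplications P1 = (ar + ai) * br, P2 = -ar * (br - bi), P3 = ai * (br + bi) with D = (P1 - P3) + j (P1 + P2). The host-side sketch below is only an illustration of that identity; gaussian_multiply is a hypothetical name and none of this is CUTLASS API.

#include <cassert>
#include <complex>

// Computes a complex product with 3 real multiplies, following the P1/P2/P3
// decomposition quoted in the specialization's comment.
std::complex<float> gaussian_multiply(std::complex<float> a, std::complex<float> b) {
  float ar = a.real(), ai = a.imag();
  float br = b.real(), bi = b.imag();
  float p1 = (ar + ai) * br;    // P1 = (ar + ai) * br
  float p2 = -ar * (br - bi);   // P2 = -ar * (br - bi)
  float p3 = ai * (br + bi);    // P3 = ai * (br + bi)
  return {p1 - p3, p1 + p2};    // D = (P1 - P3) + j (P1 + P2)
}

int main() {
  // Sanity check against the direct 4-multiply complex product.
  std::complex<float> a{2.0f, -3.0f}, b{0.5f, 4.0f};
  assert(std::abs(gaussian_multiply(a, b) - a * b) < 1e-5f);
  return 0;
}

The trade of one multiplication for extra additions is what lets the Gaussian-complex path build a complex GEMM from three real-valued mma operations instead of four, which is why it gets its own fragment iterator and epilogue specialization here.
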
//////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h index 081bcbac..bb2fdb6b 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -148,6 +148,44 @@ struct DefaultEpiloguePlanarComplex< ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Defines sensible defaults for epilogues. +template < + typename ThreadblockShape_, + typename WarpMmaOperator_, + int PartitionsK, + typename OutputOp_, + int ElementsPerAccess +> +struct DefaultEpiloguePlanarComplex< + ThreadblockShape_, + WarpMmaOperator_, + arch::OpClassTensorOp, + arch::Sm80, + PartitionsK, + OutputOp_, + ElementsPerAccess> { + + using RealEpilogue = DefaultEpilogueTensorOp< + ThreadblockShape_, + WarpMmaOperator_, + PartitionsK, + OutputOp_, + ElementsPerAccess + >; + + using Epilogue = EpiloguePlanarComplex< + ThreadblockShape_, + WarpMmaOperator_, + PartitionsK, + typename RealEpilogue::OutputTileIterator, + typename RealEpilogue::AccumulatorFragmentIterator, + typename RealEpilogue::WarpTileIterator, + typename RealEpilogue::SharedLoadIterator, + OutputOp_, + typename RealEpilogue::Padding + >; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Defines sensible defaults for epilogues. diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h index d39ad1d9..00bf26d3 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_simt.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h index 5afb1f22..51ebab37 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,16 +39,20 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" #include "cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h" #include "cutlass/epilogue/warp/fragment_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h" #include "cutlass/epilogue/warp/tile_iterator_tensor_op.h" +#include "cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h" #include "cutlass/epilogue/threadblock/default_thread_map_tensor_op.h" #include "cutlass/epilogue/threadblock/predicated_tile_iterator.h" #include "cutlass/epilogue/threadblock/shared_load_iterator.h" +#include "cutlass/epilogue/threadblock/shared_load_iterator_mixed.h" #include "cutlass/epilogue/threadblock/epilogue.h" #include "cutlass/epilogue/threadblock/interleaved_epilogue.h" @@ -61,6 +65,177 @@ namespace threadblock { //////////////////////////////////////////////////////////////////////////////// +namespace detail { + +template < + typename ElementOutput, + typename ElementAccumulator, + int ElementsPerAccess, + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< + WarpShape, + InstructionShape, + ElementAccumulator, + layout::RowMajor + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< + ThreadMap, + ElementAccumulator + >; +}; + +/// Partial specialization for half <= float x 8 epilogues avoids shared memory bank conflicts. +template < + typename ThreadblockShape, + typename WarpShape, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + half_t, + float, + 8, + ThreadblockShape, + WarpShape, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + WarpShape, + InstructionShape, + float, + 32, + 16, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + float, + 32, + 16, + 8, + 8 + >; +}; + +/// Partial specialization for int8_t x 16 <= int32_t x 16 epilogues avoids shared memory bank conflicts. +template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 16, + gemm::GemmShape<128, 128, K>, + gemm::GemmShape<64, 64, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<64, 64, K>, + InstructionShape, + int32_t, + 32, + 8, + 16, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 16, + 8 + >; +}; + +/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. 
+template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 8, + gemm::GemmShape<128, 64, K>, + gemm::GemmShape<64, 32, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<64, 32, K>, + InstructionShape, + int32_t, + 32, + 8, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 8, + 8 + >; +}; + +/// Partial specialization for int8_t x 8 <= int32_t x 8 epilogues avoids shared memory bank conflicts. +template < + int K, + typename InstructionShape, + typename ThreadMap +> +struct DefaultIteratorsTensorOp< + int8_t, + int32_t, + 8, + gemm::GemmShape<64, 64, K>, + gemm::GemmShape<32, 32, K>, + InstructionShape, + ThreadMap> { + + using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOpMixed< + gemm::GemmShape<32, 32, K>, + InstructionShape, + int32_t, + 32, + 8, + 8, + 8 + >; + + using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIteratorMixed< + ThreadMap, + int32_t, + 32, + 8, + 8, + 8 + >; +}; + +} // namespace detail + +//////////////////////////////////////////////////////////////////////////////// + /// Defines sensible defaults for epilogues for TensorOps. template < typename Shape_, @@ -98,25 +273,33 @@ struct DefaultEpilogueTensorOp { ElementOutput >; - using AccumulatorFragmentIterator = cutlass::epilogue::warp::FragmentIteratorTensorOp< - typename WarpMmaTensorOp::Shape, - typename WarpMmaTensorOp::Policy::Operator::Shape, - typename WarpMmaTensorOp::Policy::Operator::ElementC, - typename WarpMmaTensorOp::Policy::Operator::FragmentC, - LayoutC - >; + using AccumulatorFragmentIterator = typename std::conditional::value, + cutlass::epilogue::warp::FragmentIteratorComplexTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC>, + cutlass::epilogue::warp::FragmentIteratorTensorOp< + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename WarpMmaTensorOp::Policy::Operator::ElementC, + typename WarpMmaTensorOp::Policy::Operator::FragmentC, + LayoutC> >::type; - using WarpTileIterator = cutlass::epilogue::warp::TileIteratorTensorOp< - typename WarpMmaTensorOp::Shape, - typename WarpMmaTensorOp::Policy::Operator::Shape, + /// Support several implementations depending on structure of epilogue + using DefaultIterators = detail::DefaultIteratorsTensorOp< + ElementOutput, ElementAccumulator, - LayoutC + kElementsPerAccess, + Shape, + typename WarpMmaTensorOp::Shape, + typename WarpMmaTensorOp::Policy::Operator::Shape, + typename OutputTileThreadMap::CompactedThreadMap >; - using SharedLoadIterator = cutlass::epilogue::threadblock::SharedLoadIterator< - typename OutputTileThreadMap::CompactedThreadMap, - ElementAccumulator - >; + using WarpTileIterator = typename DefaultIterators::WarpTileIterator; + using SharedLoadIterator = typename DefaultIterators::SharedLoadIterator; /// Hard-coded padding elements added using Padding = cutlass::MatrixShape<0, 64 / sizeof_bits::value * 4>; @@ -184,6 +367,7 @@ struct DefaultInterleavedEpilogueTensorOp { }; //////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git 
a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h index 8a08e036..7fec5110 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h index f0435e92..58425c28 100644 --- a/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_epilogue_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,6 +39,7 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/epilogue/thread/linear_combination_clamp.h" #include "cutlass/epilogue/thread/conversion_op.h" #include "cutlass/epilogue/thread/reduction_op.h" diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h index 788e07a7..8e8f4d33 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_simt.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h index 6f4bd2ad..736e5525 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -146,54 +146,6 @@ struct DefaultInterleavedThreadMapTensorOp { //////////////////////////////////////////////////////////////////////////////// -/// Defines the optimal thread map for TensorOp accumulator layouts -template -struct DefaultInterleavedConvThreadMapTensorOp { - using ThreadblockShape = ThreadblockShape_; - using WarpShape = WarpShape_; - static int const kPartitionsK = PartitionsK; - using Element = Element_; - static int const kElementsPerAccess = ElementsPerAccess; - static int const kInterleavedK = InterleavedK; - - // - // Definitions - // - - struct Detail { - /// Tensor Operations fundamentally perform operations on 8 rows - static int const kTensorOpRows = 8; - static int const kWarpSize = 32; - - static_assert(!(ThreadblockShape::kM % WarpShape::kM) && - !(ThreadblockShape::kM % WarpShape::kM), - "Divisibility"); - - /// Number of warps - using WarpCount = - gemm::GemmShape; - - /// Number of participating threads - static int const kThreads = WarpCount::kCount * kWarpSize; - }; - - // - // ThreadMap - // - - /// ThreadMap to be used by epilogue::MaskedTileIterator satisfying concept - /// InterleavedOutputTileThreadMap - using Type = InterleavedConvOutputTileThreadMap< - MatrixShape, - MatrixShape, - Detail::kThreads, kElementsPerAccess, sizeof_bits::value>; -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h index 4c4068a3..45aba393 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h index 376887c3..34ec750d 100644 --- a/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/default_thread_map_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h index f197112b..f14be1ff 100644 --- a/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h +++ b/include/cutlass/epilogue/threadblock/direct_epilogue_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index fe6877aa..07868420 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -175,15 +175,106 @@ public: OutputOp const &output_op, ///< Output operator OutputTileIterator destination_iterator, ///< Tile iterator for destination AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile - OutputTileIterator source_iterator, ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) - int64_t imag_stride_dest = 0, ///< Arguments required for planar complex case - not used in real-valued case - int64_t imag_stride_src = 0) { ///< - - typename OutputTileIterator::Fragment source_fragment; - + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + if (!output_op.is_source_needed()) { - source_iterator.clear_mask(); + compute_source_not_needed_(output_op, destination_iterator, accumulators); } + else { + compute_source_needed_(output_op, destination_iterator, accumulators, source_iterator); + } + } + +private: + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_not_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators) { ///< Complete warp-level accumulator tile + + // + // Iterator over warp-level accumulator fragment + // + + AccumulatorFragmentIterator accum_fragment_iterator(accumulators); + + // + // Iterate over accumulator tile + // + + CUTLASS_PRAGMA_UNROLL + for (int iter = 0; iter < OutputTileIterator::kIterations; ++iter) { + + // + // Convert and store fragment + // + + __syncthreads(); + + typename AccumulatorFragmentIterator::Fragment accum_fragment; + + accum_fragment_iterator.load(accum_fragment); + ++accum_fragment_iterator; + + this->warp_tile_iterator_.store(accum_fragment); + + __syncthreads(); + + // + // Load fragments from shared memory + // + + typename SharedLoadIterator::Fragment aligned_accum_fragment[kPartitionsK]; + + shared_load_iterator_.load(aligned_accum_fragment[0]); + + // If the number of 
k-slices is > 1 - perform a reduction amongst the k-slices + if (kPartitionsK > 1) + { + plus add_fragments; + const int tile_row_offset = Base::SharedStorage::StorageShape::kRow / PartitionsK; + + CUTLASS_PRAGMA_UNROLL + for ( int i = 1; i < kPartitionsK; ++i) { + shared_load_iterator_.add_tile_offset({tile_row_offset , 0}); + shared_load_iterator_.load(aligned_accum_fragment[i]); + aligned_accum_fragment[0] = add_fragments(aligned_accum_fragment[0], aligned_accum_fragment[i]); + } + + shared_load_iterator_.add_tile_offset({-1 * (kPartitionsK-1) * tile_row_offset, 0}); + } + + // + // Compute the output result + // + + typename OutputTileIterator::Fragment output_fragment; + + apply_output_operator_source_not_needed_(output_fragment, output_op, aligned_accum_fragment[0]); + + + // + // Store the final result + // + + destination_iterator.store(output_fragment); + ++destination_iterator; + + } + } + + + /// Streams the result to global memory + CUTLASS_DEVICE + void compute_source_needed_( + OutputOp const &output_op, ///< Output operator + OutputTileIterator destination_iterator, ///< Tile iterator for destination + AccumulatorTile const &accumulators, ///< Complete warp-level accumulator tile + OutputTileIterator source_iterator) { ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) + + typename OutputTileIterator::Fragment source_fragment; source_fragment.clear(); @@ -265,8 +356,6 @@ public: } } -private: - /// Helper to invoke the output functor over each vector of output CUTLASS_DEVICE void apply_output_operator_( @@ -294,6 +383,30 @@ private: output_frag_ptr[i] = output_op(compute_frag_ptr[i], source_frag_ptr[i]); } } + + /// Helper to invoke the output functor over each vector of output + CUTLASS_DEVICE + void apply_output_operator_source_not_needed_( + typename OutputTileIterator::Fragment &output_fragment, + OutputOp const &output_op, ///< Output operator + typename SharedLoadIterator::Fragment const &aligned_accum_fragment) { + + OutputAccessType *output_frag_ptr = + reinterpret_cast(&output_fragment); + + AccumulatorAccessType const *compute_frag_ptr = + reinterpret_cast(&aligned_accum_fragment); + + int const kOutputOpIterations = + OutputTileIterator::Fragment::kElements / OutputTileIterator::kElementsPerAccess; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kOutputOpIterations; ++i) { + + // Call the output operator + output_frag_ptr[i] = output_op(compute_frag_ptr[i]); + } + } }; //////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/threadblock/epilogue_base.h b/include/cutlass/epilogue/threadblock/epilogue_base.h index a8a0dc49..a9b5a414 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_base.h +++ b/include/cutlass/epilogue/threadblock/epilogue_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h index 8362748e..6cb99636 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h +++ b/include/cutlass/epilogue/threadblock/epilogue_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/epilogue_workspace.h b/include/cutlass/epilogue/threadblock/epilogue_workspace.h index 72eb8d2e..36d196a3 100644 --- a/include/cutlass/epilogue/threadblock/epilogue_workspace.h +++ b/include/cutlass/epilogue/threadblock/epilogue_workspace.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h index 0a730ef1..b616545b 100644 --- a/include/cutlass/epilogue/threadblock/interleaved_epilogue.h +++ b/include/cutlass/epilogue/threadblock/interleaved_epilogue.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h index fd28ac75..4eb5e378 100644 --- a/include/cutlass/epilogue/threadblock/output_tile_thread_map.h +++ b/include/cutlass/epilogue/threadblock/output_tile_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -490,67 +490,6 @@ struct InterleavedOutputTileThreadMap { //////////////////////////////////////////////////////////////////////////////// -/// Template metaprogram for partitioning a 4D interleaved layout across warps -/// to achieve several performance objectives: -/// -/// - coalesced memory accesses in units of 64 Byte lines -/// - minimal address arithmetic -/// - minimal predicate calculations -/// -template -struct InterleavedConvOutputTileThreadMap { - using WarpCount = WarpCount_; - - static int const kWarpSize = 32; - static int const kThreads = Threads; - static int const kWarpCount = kThreads / kWarpSize; - - static int const kElementsPerAccess = ElementsPerAccess; - static int const kElementSize = ElementSize; - - // - // Metaprogram computation - // - - struct Detail {}; - - // - // Output - // - - using Iterations = Iterations_; - - using Delta = MatrixShape; - - /// Initial offset function - CUTLASS_HOST_DEVICE - static MatrixCoord initial_offset(int thread_idx) { - int warp_idx = thread_idx / kWarpSize; - int lane_idx = thread_idx % kWarpSize; - - // Compute warp location - MatrixCoord warp_footprint{ - Delta::kRow * Iterations::kRow, - Delta::kColumn * Iterations::kColumn, - }; - - MatrixCoord warp_offset{warp_idx % WarpCount::kRow, - warp_idx / WarpCount::kRow}; - - // Compute per-lane offset - MatrixCoord thread_offset_in_warp{lane_idx / 4, - (lane_idx % 4) * kElementsPerAccess}; - - MatrixCoord thread_offset_in_threadblock_tile = - warp_footprint * warp_offset + thread_offset_in_warp; - - return thread_offset_in_threadblock_tile; - } -}; - -//////////////////////////////////////////////////////////////////////////////// - } // namespace threadblock } // namespace epilogue } // namespace cutlass diff --git a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h index 486d16c7..f3c88300 100644 --- a/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h +++ b/include/cutlass/epilogue/threadblock/predicated_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,7 +41,7 @@ #include "cutlass/tensor_ref.h" #include "cutlass/transform/pitch_linear_thread_map.h" #include "cutlass/epilogue/threadblock/output_tile_thread_map.h" - +#include "cutlass/arch/memory.h" //////////////////////////////////////////////////////////////////////////////// @@ -306,10 +306,15 @@ public: bool guard = row_guard && mask_.predicates[column]; - if (guard) { - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column] = - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess]; - } + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -365,11 +370,12 @@ public: bool guard = row_guard && mask_.predicates[column]; - if (guard) { - - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess] = - frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + column]; - } + cutlass::arch::global_store( + frag_ptr[frag_row_idx * ThreadMap::Iterations::kColumn + + column], + (void *)&memory_pointer[column * ThreadMap::Delta::kColumn / + kElementsPerAccess], + guard); } if (row + 1 < ThreadMap::Iterations::kRow) { @@ -660,9 +666,13 @@ public: bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *frag_ptr = *memory_pointer; - } + cutlass::arch::global_load< + AccessType, + sizeof(AccessType) + >( + *frag_ptr, + (void *)memory_pointer, + guard); } /// Stores a fragment to memory @@ -678,9 +688,8 @@ public: bool guard = col_guard && mask_.predicates[iteration_contiguous_]; - if (guard) { - *memory_pointer = *frag_ptr; - } + cutlass::arch::global_store( + *frag_ptr, (void *)memory_pointer, guard); } /// Overrides the internal iteration index @@ -732,6 +741,7 @@ public: } }; +/////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// } // namespace threadblock diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator.h b/include/cutlass/epilogue/threadblock/shared_load_iterator.h index 5e4a64b1..0aa3dbb1 100644 --- a/include/cutlass/epilogue/threadblock/shared_load_iterator.h +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -96,6 +96,15 @@ public: ThreadMap::kElementsPerAccess, kAlignment>; + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + const_min(128 / sizeof_bits::value, ThreadMap::kElementsPerAccess), + const_min(16, kAlignment) + >; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + private: // @@ -149,7 +158,6 @@ public: CUTLASS_DEVICE void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { - AccessType *frag_ptr = reinterpret_cast(&frag); CUTLASS_PRAGMA_UNROLL for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { @@ -169,15 +177,19 @@ public: int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); - AccessType const *memory_pointer = reinterpret_cast(byte_pointer); + LoadType *frag_ptr = reinterpret_cast(&frag); + LoadType const *memory_pointer = reinterpret_cast(byte_pointer); CUTLASS_PRAGMA_UNROLL for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; - frag_ptr[frag_idx] = - memory_pointer[column * ThreadMap::Delta::kColumn / kElementsPerAccess]; + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + frag_ptr[frag_idx * kLoadsPerAccess + v] = + memory_pointer[(column * ThreadMap::Delta::kColumn / kElementsPerAccess) * kLoadsPerAccess + v]; + } } } } diff --git a/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h new file mode 100644 index 00000000..d37b07d5 --- /dev/null +++ b/include/cutlass/epilogue/threadblock/shared_load_iterator_mixed.h @@ -0,0 +1,559 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Epilogue for threadblock scoped GEMMs using Tensor Ops optimized for mixed-precision. + + This assumes the shared memory tile is in a permuted layout which avoids bank conflicts on loading. + + When the fragment is loaded into registers, it matches the row-major thread map assumed by + the predicated tile iterator writing to global memory. + + The epilogue rearranges the result of a matrix product through shared memory to match canonical + tensor layouts in global memory. Epilogues support conversion and reduction operations. + +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/epilogue/threadblock/output_tile_thread_map.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. +/// +/// Satisfies: ReadableTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename Element_, ///< Accumulator data type + int ElementSizeBits_, ///< Size of accumulator in bits + int OutputSizeBits_, ///< Size of output element in bits + int ElementsPerAccess, ///< Vector length of output vector + int ContiguousLanes ///< Number of lanes in the warp writing to contiguous elements + /// in the global memory tensor +> +class SharedLoadIteratorMixed; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator used to load output tile from shared memory in epilogue. 
+/// +/// Satisfies: ReadableTileIterator +/// +template < + typename ThreadMap_, ///< Thread map (conept: OutputTileThreadMap) + typename Element_ ///< Accumulator data type +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = Element_; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = ThreadMap::kElementsPerAccess * sizeof_bits::value / 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< + Element, + ThreadMap::kElementsPerAccess, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + const_min(128 / sizeof_bits::value, ThreadMap::kElementsPerAccess), + const_min(16, kAlignment) + >; + + static int const kLoadsPerAccess = AccessType::kElements / LoadType::kElements; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] = reinterpret_cast(ref.data()); + + int col_idx = (thread_offset.column() / kElementsPerAccess) * kLoadsPerAccess; + int bank_offset = (col_idx * sizeof(LoadType) / 128) % kLoadsPerAccess; + + col_idx += (bank_offset + i) % kLoadsPerAccess; + + pointers_[i] += thread_offset.row() * stride_ + col_idx; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_ += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup* stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = 
reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + int vector_idx = (column * ThreadMap::Delta::kColumn / kElementsPerAccess * kLoadsPerAccess); + + LoadType const *memory_pointer = pointers_[v] + row_ptr_offset; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[vector_idx]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 16 => int8_t x 16 +template < + typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap) +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = 16; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< + Element, + 16, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + 4, + 16 + >; + + static int const kLoadsPerAccess = 4; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + LoadType const *base_ptr = reinterpret_cast(ref.data()) + thread_offset.row() * stride_; + + int lane_col_idx = thread_offset.column() / 16; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + int lane_offset = (lane_col_idx % 2) * 4 | ((lane_col_idx / 2) * 8) | ((lane_col_idx / 2) ^ i); + + pointers_[i] = base_ptr + lane_offset; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < 
ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup* stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + LoadType const *memory_pointer = pointers_[v]; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 8 => int8_t x 8 +template < + typename ThreadMap_ ///< Thread map (conept: OutputTileThreadMap) +> +class SharedLoadIteratorMixed { +public: + using ThreadMap = ThreadMap_; + using Shape = typename ThreadMap::Shape; + + using Element = int32_t; + + using Layout = layout::RowMajor; + using TensorRef = TensorRef; + using ConstTensorRef = typename TensorRef::ConstTensorRef; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + using TensorCoord = MatrixCoord; + + static int const kElementsPerAccess = ThreadMap::kElementsPerAccess; + + static int const kAlignment = 8; + + static int const kThreads = ThreadMap::kThreads; + + /// Fragment object + using Fragment = Array< + Element, + ThreadMap::Iterations::kColumn * + ThreadMap::Iterations::kRow * + ThreadMap::Iterations::kGroup * + ThreadMap::Iterations::kCluster * + ThreadMap::kElementsPerAccess>; + + /// Memory access size + using AccessType = AlignedArray< + Element, + 8, + kAlignment>; + + /// Vector type used for SMEM loads + using LoadType = AlignedArray< + Element, + 4, + 16 + >; + + static int const kLoadsPerAccess = 2; + +private: + + // + // Data members + // + + /// Byte-level pointer + LoadType const *pointers_[kLoadsPerAccess]; + + /// Stride along adjacent rows in units of LoadType + int stride_; + +public: + + // + // Methods + // + + /// Constructor + CUTLASS_DEVICE + SharedLoadIteratorMixed( + TensorRef ref, + int thread_idx + ): + stride_((ref.stride(0) / LoadType::kElements)) { + + TensorCoord thread_offset = ThreadMap::initial_offset(thread_idx); + + // Initialize pointers + LoadType const *base_ptr = reinterpret_cast(ref.data()) + thread_offset.row() * stride_; + + int lane_col_idx = thread_offset.column() / 8; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + int lane_offset = (lane_col_idx % 8) * 2 | ((lane_col_idx / 4) ^ i); + + pointers_[i] = base_ptr + lane_offset; + } + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += pointer_offset / LoadType::kElements; + } + } + + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &offset) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kLoadsPerAccess; ++i) { + pointers_[i] += offset.row() * stride_ + 
offset.column() / LoadType::kElements; + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int cluster = 0; cluster < ThreadMap::Iterations::kCluster; ++cluster) { + + CUTLASS_PRAGMA_UNROLL + for (int group = 0; group < ThreadMap::Iterations::kGroup; ++group) { + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < ThreadMap::Iterations::kRow; ++row) { + + int row_ptr_offset = + row * ThreadMap::Delta::kRow * stride_ + + group * ThreadMap::Delta::kGroup* stride_ + + cluster * ThreadMap::Delta::kCluster * stride_ + + pointer_offset / LoadType::kElements; + + int frag_row_idx = (row + ThreadMap::Iterations::kRow * (group + ThreadMap::Iterations::kGroup * cluster)); + + LoadType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int column = 0; column < ThreadMap::Iterations::kColumn; ++column) { + + int frag_idx = frag_row_idx * ThreadMap::Iterations::kColumn + column; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < kLoadsPerAccess; ++v) { + + LoadType const *memory_pointer = pointers_[v]; + + frag_ptr[frag_idx * kLoadsPerAccess + v] = memory_pointer[row_ptr_offset]; + } + } + } + } + } + } + + /// Loads a fragment + CUTLASS_DEVICE + void load(Fragment &frag) { + + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h index d369a835..1bab9104 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_complex_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h new file mode 100644 index 00000000..4c956492 --- /dev/null +++ b/include/cutlass/epilogue/warp/fragment_iterator_gaussian_complex_tensor_op.h @@ -0,0 +1,188 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief This defines a "fragment" iterator for visiting the fragments of an accumulator tile + that participate in one warp-level store operation. + + Typically, the accumulator tile is the largest single block of register-backed storage + within the kernel. Storing it to memory is best accomplished by partitioning it into + smaller tiles and storing these sequentially. + + Round trips through shared memory during the Epilogue phase require partitioning, as + shared memory capacity is typically insufficient for a threadblock's total accumulator + size. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/epilogue/warp/tensor_op_policy.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// +template < + typename WarpShape, ///< shape of warp-level GEMM (concept: MatrixShape) + typename OperatorShape, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC, ///< matrix multiply operation data type (concept: data type) + typename OperatorFragmentC, ///< matrix multiply operation fragment (concept: Array) + typename Layout ///< target shared memory layout +> +class FragmentIteratorGaussianComplexTensorOp; + +//////////////////////////////////////////////////////////////////////////////// + + +/// Partial specialization for row-major shared memory +template < + typename WarpShape_, ///< shape of the warp-level GEMM tile + typename OperatorShape_, ///< underlying real-valued matrix multiply operation shape (concept: gemm::GemmShape) + typename OperatorElementC_, ///< underlying real-valued matrix multiply operation data type + typename OperatorFragmentC_ ///< underlying real-valued matrix multiply operation fragment (concept: Array) +> +class FragmentIteratorGaussianComplexTensorOp { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using OperatorElementC = OperatorElementC_; + using OperatorFragmentC = OperatorFragmentC_; + using Layout = layout::RowMajor; + + using Policy = TensorOpPolicy; + + /// This is the fragment size produced by one access of the iterator. 
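/// The three real-valued accumulator parts (part1, part2, part3 below) are presumably the
/// partial products of the Gaussian three-multiply complex scheme; load() recombines them
/// elementwise as real = part1 - part3 and imag = part1 + part2 to form each complex-valued
/// element of this fragment.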
+ using Fragment = Array< + complex, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// Size of one part of accumulator of 3-part accumulator in units of number of OperatorElementC + static int const kElementsAccumulatorPerPart = + OperatorFragmentC::kElements * Policy::OperatorCount::kRow * Policy::OperatorCount::kColumn; + + /// Offset into the accumulator fragment part 1 + static int const kPart1Index = kElementsAccumulatorPerPart * 0; + + /// Offset into the accumulator fragment part 2 + static int const kPart2Index = kElementsAccumulatorPerPart * 1; + + /// Offset into the accumulator fragment part 3 + static int const kPart3Index = kElementsAccumulatorPerPart * 2; + + /// This is the complete warp-level accumulator tile holding part1, part2, and part3 + using AccumulatorTile = Array; + + /// This is the complete warp-level accumulator tile holding final output of complex type + using OutputAccumulatorTile = Array, kElementsAccumulatorPerPart>; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + +private: + + /// Internal access type + using AccessType = Array; + + using FragmentAccessType = Array, Policy::kElementsPerAccess>; + +private: + + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + +public: + + /// Constructs an iterator + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp(AccumulatorTile const &accum): + accumulators_(reinterpret_cast(&accum)), + index_(0) { + } + + /// Increments + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp &operator++() { + ++index_; + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + FragmentIteratorGaussianComplexTensorOp &operator--() { + --index_; + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, int index_offset = 0) const { + + int index = index_ + index_offset; + + FragmentAccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int accumulator_access_offset = + index + n * Policy::kAccumulatorColumnStride / Policy::kElementsPerAccess; + + auto const & part1_accum_array = accumulators_[accumulator_access_offset + kPart1Index]; + auto const & part2_accum_array = accumulators_[accumulator_access_offset + kPart2Index / Policy::kElementsPerAccess]; + auto const & part3_accum_array = accumulators_[accumulator_access_offset + kPart3Index / Policy::kElementsPerAccess]; + + // Pack parts 1, 2, and 3 into a structure. 
This is likely to result in MOVs + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Policy::kElementsPerAccess; ++i) { + + frag_ptr[n][i].real() = part1_accum_array[i] - part3_accum_array[i]; + frag_ptr[n][i].imag() = part1_accum_array[i] + part2_accum_array[i]; + } + } + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace epilogue +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/fragment_iterator_simt.h b/include/cutlass/epilogue/warp/fragment_iterator_simt.h index 16084420..6d75e569 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_simt.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h index e19f12b9..f620e4bd 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h index 15c095ff..1abbbdc0 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h index b96b4c5b..79106b11 100644 --- a/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/fragment_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/simt_policy.h b/include/cutlass/epilogue/warp/simt_policy.h index 1d010c68..3e096978 100644 --- a/include/cutlass/epilogue/warp/simt_policy.h +++ b/include/cutlass/epilogue/warp/simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tensor_op_policy.h b/include/cutlass/epilogue/warp/tensor_op_policy.h index c02656a5..82e685b8 100644 --- a/include/cutlass/epilogue/warp/tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_simt.h b/include/cutlass/epilogue/warp/tile_iterator_simt.h index 2bf92e01..a9d03db1 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_simt.h +++ b/include/cutlass/epilogue/warp/tile_iterator_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h index d934c05a..04c361f5 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h new file mode 100644 index 00000000..82a93e2d --- /dev/null +++ b/include/cutlass/epilogue/warp/tile_iterator_tensor_op_mixed.h @@ -0,0 +1,675 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/pitch_linear.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/epilogue/warp/tensor_op_policy.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace epilogue { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Template for reading and writing tiles of accumulators to shared memory. This is optimized +/// for mixed-precision epilogues in which the accumulators are 32b in width, but the output +/// data type is smaller. +template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_, ///< matrix multiply operation shape (concept: gemm::GemmShape) + typename Element_, ///< data type of accumulator element + int ElementSizeBits, ///< Size of accumulator element in bits + int OutputSizeBits, ///< Size of output element in bits + int OutputElementCount, ///< number of elements in output vector + int ContiguousLanes ///< Number of consecutive lanes writing to contiguous memory +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = Element_; + using Layout = layout::RowMajor; + static int const kOutputElementCount = OutputElementCount; + + using TensorRef = TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. 
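/// Because the 32b accumulators are written to shared memory in a column order swizzled across
/// Detail::kPointerCount pointers (see the constructor below), the narrower loads later issued by
/// SharedLoadIteratorMixed can, presumably, proceed without shared-memory bank conflicts.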
+ using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. + //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = + (OutputElementCount * sizeof_bits::value) / (const_min(128, OutputElementCount * sizeof_bits::value)); + + static_assert(kPointerCount <= 4, "Can only accommodate four pointers at present."); + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape< + 0, + Detail::kLanesInQuad * Policy::kElementsPerAccess>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + + /// Logical column in which warp tile is aligned + int warp_column_; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / Policy::kElementsPerAccess), + warp_column_(0) { + + int quad_id = (lane_id / Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = (lane_in_quad % 2) + (((lane_in_quad / 2) + i) % Detail::kPointerCount) * 2; + + ptr += column_idx; + + if (i == 0) { + pointers_[0 % Detail::kPointerCount] = ptr; + } + else if (i == 1) { + pointers_[1 % Detail::kPointerCount] = ptr; + } + else if (i == 2) { + pointers_[2 % Detail::kPointerCount] = ptr; + } + else if (i == 3) { + pointers_[3 % Detail::kPointerCount] = ptr; + } + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / Policy::kElementsPerAccess; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / Policy::kElementsPerAccess; + } + + warp_column_ += tile_offset.column() * Shape::kColumn; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for 
(int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess; + int ptr_idx = ((column_idx * sizeof_bits::value) / 1024) % Detail::kPointerCount; + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0 % Detail::kPointerCount]; + } + else if (ptr_idx == 1) { + ptr = pointers_[1 % Detail::kPointerCount]; + } + else if (ptr_idx == 2) { + ptr = pointers_[2 % Detail::kPointerCount]; + } + else if (ptr_idx == 3) { + ptr = pointers_[3 % Detail::kPointerCount]; + } + + int offset = n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess; +#if 0 + // Using inline PTX to avoid generic memory + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType); + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } + + /// Load + CUTLASS_HOST_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) const { + + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int64_t n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int column_idx = warp_column_ + n * Detail::kLanesInQuad * Policy::kElementsPerAccess; + int ptr_idx = ((column_idx * sizeof_bits::value) / 1024) % Detail::kPointerCount; + + AccessType const *smem_ptr = pointers_[ptr_idx]; + frag_ptr[n] = smem_ptr[n * Detail::kLanesInQuad + pointer_offset / Policy::kElementsPerAccess]; + } + } + + /// Load + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + load_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 16 => int8_t x 16 +template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_ ///< matrix multiply operation shape (concept: gemm::GemmShape) +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = int32_t; + using Layout = layout::RowMajor; + static int const kOutputElementCount = 16; + + using TensorRef = TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. 
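/// In this int32_t -> 16-element specialization, two shared-memory pointers (columns swizzled by
/// lane_in_quad ^ (i * 2)) plus four uniform byte offsets (uniform_offset_) are applied when
/// storing, presumably so the 16 x int8_t output vector can later be read back without bank
/// conflicts.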
+ //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = 2; + + /// Offsets added + static int const kOffsetCount = 4; + + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + + /// Uniform offset in bytes added to warp tile iterator + int uniform_offset_[Detail::kOffsetCount]; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / AccessType::kElements) { + + int quad_id = (lane_id / Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = lane_in_quad ^ (i * 2); + + ptr += column_idx; + + if (i == 0) { + pointers_[0] = ptr; + } + else if (i == 1) { + pointers_[1] = ptr; + } + } + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kOffsetCount; ++i) { + uniform_offset_[i] = (i ^ 0) * 4 * sizeof(AccessType); + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / AccessType::kElements; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / AccessType::kElements; + + pointers_[0] += ptr_offset; + pointers_[1] += ptr_offset; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kOffsetCount; ++i) { + uniform_offset_[i] = (i ^ tile_offset.column()) * 4 * sizeof(AccessType); + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int ptr_idx = (n / 4); + int offset_idx = (n % 4); + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0]; + } + else if (ptr_idx == 1) { + ptr = pointers_[1]; + } + + int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements; + +#if 0 + // + // Using inline PTX to avoid 
generic memory + // + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType) + uniform_offset_[offset_idx]; + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for int32_t x 8 => int8_t x 8 +template < + typename WarpShape_, ///< shape of warp-level GEMM (concept: GemmShape) + typename OperatorShape_ ///< matrix multiply operation shape (concept: gemm::GemmShape) +> +class TileIteratorTensorOpMixed { +public: + + using WarpShape = WarpShape_; + using OperatorShape = OperatorShape_; + using Element = int32_t; + using Layout = layout::RowMajor; + static int const kOutputElementCount = 8; + + using TensorRef = TensorRef; ///< Tensor Reference object + using TensorCoord = MatrixCoord; ///< Logical coordinate in referenced tensor + using Index = typename TensorRef::Index; + using LongIndex = typename TensorRef::LongIndex; + + using Policy = TensorOpPolicy; + + /// Shape of the tile in memory + using Shape = MatrixShape< + Policy::kRowsPerIteration, + WarpShape::kN + >; + + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array< + Element, + Policy::OperatorCount::kColumn * Policy::kElementsPerAccess>; + + /// This is the complete warp-level accumulator tile. 
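/// The 8-element variant keeps two pointers with the same lane_in_quad ^ (i * 2) column swizzle,
/// and add_tile_offset() below swaps them whenever the tile's column offset is odd, presumably to
/// keep the swizzle pattern consistent across adjacent tiles.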
+ //using AccumulatorTile = typename Operator::FragmentC; + + /// Number of times this iterator can be incremented + static int const kIterations = Policy::kIterations; + + // Internal constants + struct Detail { + static int const kLanesInQuad = 4; + + /// Number of pointers needed to write accumulators + static int const kPointerCount = 2; + + static_assert(sizeof(Element) == 4, "This can only be used with 32b accumulator data types (f32, s32)."); + }; + + /// Padding quantity + using Padding = MatrixShape<0, Detail::kLanesInQuad * 2>; + +private: + + /// Storage type for accessing memory + using AccessType = AlignedArray; + + // + // Data members + // + + /// Internal pointer to memory + AccessType *pointers_[Detail::kPointerCount]; + + /// Stride in units of AccessType + int stride_; + +public: + + /// Default constructor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed() { + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] = nullptr; + } + } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed( + TensorRef const &ref, + unsigned lane_id + ): + stride_(ref.stride()[0] / AccessType::kElements) { + + int quad_id = (lane_id / Detail::kLanesInQuad); + int lane_in_quad = (lane_id % Detail::kLanesInQuad); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Detail::kPointerCount; ++i) { + AccessType *ptr = reinterpret_cast(ref.data()) + quad_id * stride_; + int column_idx = lane_in_quad ^ (i * 2); + + ptr += column_idx; + + if (i == 0) { + pointers_[0] = ptr; + } + else if (i == 1) { + pointers_[1] = ptr; + } + } + } + + /// Adds a pointer offset + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_pointer_offset(Index pointer_offset) { + + CUTLASS_PRAGMA_UNROLL + for (int64_t i = 0; i < Detail::kPointerCount; ++i) { + pointers_[i] += pointer_offset / AccessType::kElements; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & add_tile_offset(TensorCoord const &tile_offset) { + + int ptr_offset = tile_offset.row() * Shape::kRow * stride_ + + tile_offset.column() * Shape::kColumn / AccessType::kElements; + + pointers_[0] += ptr_offset; + pointers_[1] += ptr_offset; + + if (tile_offset.column() % 2) { + auto tmp = pointers_[0]; + pointers_[0] = pointers_[1]; + pointers_[1] = tmp; + } + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_HOST_DEVICE + TileIteratorTensorOpMixed & operator+=(TensorCoord const &tile_offset) { + return add_tile_offset(tile_offset); + } + + /// Store + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < Policy::OperatorCount::kColumn; ++n) { + + int ptr_idx = (n / 4); + + AccessType *ptr; + if (ptr_idx == 0) { + ptr = pointers_[0]; + } + else if (ptr_idx == 1) { + ptr = pointers_[1]; + } + + int offset = (n / 4) * 16 + pointer_offset / AccessType::kElements + (n % 4) * 4; + +#if 0 + // + // Using inline PTX to avoid generic memory + // + AccessType *smem_ptr = pointers_[ptr_idx]; + smem_ptr[offset] = frag_ptr[n]; +#else + uint32_t smem_addr = arch::cutlass_get_smem_pointer(ptr); + uint32_t const *data = reinterpret_cast(frag_ptr + n); + uint32_t offset_in_bytes = offset * sizeof(AccessType); + + asm volatile( + "{ .reg .u32 smem_ptr; add.u32 smem_ptr, %0, %1; 
st.shared.v2.u32 [smem_ptr], {%2, %3}; }\n" + : : "r"(smem_addr), "r"(offset_in_bytes), "r"(data[0]), "r"(data[1]) + ); +#endif + } + } + + /// Store + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) { + store_with_pointer_offset(frag, 0); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace epilogue +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h index a9ca2315..8ffb5ec1 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_volta_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h index e8299f9d..6017b5c7 100644 --- a/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h +++ b/include/cutlass/epilogue/warp/tile_iterator_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h index 631d423e..b0ecc5eb 100644 --- a/include/cutlass/epilogue/warp/volta_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/volta_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h index fc312c7a..7b938d37 100644 --- a/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h +++ b/include/cutlass/epilogue/warp/wmma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/fast_math.h b/include/cutlass/fast_math.h index ebc821ed..036b08e2 100644 --- a/include/cutlass/fast_math.h +++ b/include/cutlass/fast_math.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index f712e04a..13ee7f54 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -96,6 +96,16 @@ struct multiply_add { } }; +/// Fused multiply-add +template +struct and_add { + CUTLASS_HOST_DEVICE + T operator()(T const &a, T const &b, T const &c) const { + return ((a & b) + c); + } +}; + + /// Fused multiply-add template struct xor_add { @@ -1207,6 +1217,212 @@ struct multiply_add, Array, Array> { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Fused multiply-add +template +struct multiply_add, Array, Array> { + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + bfloat16_t const &a, + Array const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *b_ptr = reinterpret_cast(&b); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned a_packed = static_cast(a.raw()); + a_packed = (a_packed | (a_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_packed), "r"(b_ptr[i]), "r"(c_ptr[i]) + ); + } + + if (N % 
2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[0]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a, b[i], c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + bfloat16_t const &b, + Array const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *c_ptr = reinterpret_cast(&c); + + unsigned b_packed = static_cast(b.raw()); + b_packed = (b_packed | (b_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_packed), "r"(c_ptr[i]) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[0]), "h"(c_residual_ptr[N - 1]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b, c[i]); + } + #endif + + return result; + } + + CUTLASS_HOST_DEVICE + Array operator()( + Array const &a, + Array const &b, + bfloat16_t const &c) const { + + Array result; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + + unsigned *result_ptr = reinterpret_cast(&result); + + unsigned const *a_ptr = reinterpret_cast(&a); + unsigned const *b_ptr = reinterpret_cast(&b); + + unsigned c_packed = static_cast(c.raw()); + c_packed = (c_packed | (c_packed << 16)); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + asm ("fma.rn.bf16x2 %0, %1, %2, %3;\n" + : "=r"(result_ptr[i]) + : "r"(a_ptr[i]), "r"(b_ptr[i]), "r"(c_packed) + ); + } + + if (N % 2) { + + uint16_t *result_ptr = reinterpret_cast(&result); + uint16_t const *a_residual_ptr = reinterpret_cast(&a); + uint16_t const *b_residual_ptr = reinterpret_cast(&b); + uint16_t const *c_residual_ptr = reinterpret_cast(&c); + + asm ("fma.rn.bf16 %0, %1, %2, %3;\n" + : "=h"(result_ptr[N - 1]) + : "h"(a_residual_ptr[N - 1]), "h"(b_residual_ptr[N - 1]), "h"(c_residual_ptr[0]) + ); + } + + #else + + multiply_add op; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + result[i] = op(a[i], b[i], c); + } + #endif + + return result; + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index fff34dc4..c65b3f00 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -422,6 +422,342 @@ struct DefaultGemmConfiguration< using Operator = arch::OpMultiplyAddSaturate; }; +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm75, + uint1b_t, + uint1b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 512>; + using WarpShape = GemmShape<64, 64, 512>; + using InstructionShape = GemmShape<8, 8, 128>; + static int const kStages = 2; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpXorPopc; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template +struct DefaultGemmConfiguration { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 16>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + ElementC, 128 / sizeof_bits::value, ElementAccumulator, + ElementAccumulator>; + + using Operator = typename platform::conditional< + (platform::is_same::value || + platform::is_same::value || + platform::is_same::value || + platform::is_same::value), + arch::OpMultiplyAddSaturate, arch::OpMultiplyAdd>::type; +}; + +//////////////////////////////////////////////////////////////////////////////// +template +struct DefaultGemmConfiguration { + + static int const kAlignmentA = 1; + static int const kAlignmentB = 1; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 16>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + ElementC, 128 / sizeof_bits::value, ElementAccumulator, + ElementAccumulator>; + + using Operator = arch::OpMultiplyAdd; +}; + + +template <> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + complex, + complex, + complex, + complex + > { + + static int const kAlignmentA = 1; + static int const kAlignmentB = 1; + + using ThreadblockShape = GemmShape<64, 64, 16>; + using WarpShape = GemmShape<32, 32, 16>; + using InstructionShape = GemmShape<8, 8, 4>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombination< + complex, 1, complex, + complex>; + + using Operator = arch::OpMultiplyAddComplex; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int8_t, + int8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, 
int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int8_t, + uint8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint8_t, + int8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint8_t, + uint8_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 64>; + using WarpShape = GemmShape<64, 64, 64>; + using InstructionShape = GemmShape<16, 8, 32>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int4b_t, + int4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + int4b_t, + uint4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + 
ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint4b_t, + int4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint4b_t, + uint4b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 128>; + using WarpShape = GemmShape<64, 64, 128>; + using InstructionShape = GemmShape<16, 8, 64>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAddSaturate; +}; + +//////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementC> +struct DefaultGemmConfiguration< + arch::OpClassTensorOp, + arch::Sm80, + uint1b_t, + uint1b_t, + ElementC, + int32_t> { + + static int const kAlignmentA = 128 / sizeof_bits::value; + static int const kAlignmentB = 128 / sizeof_bits::value; + + using ThreadblockShape = GemmShape<128, 256, 512>; + using WarpShape = GemmShape<64, 64, 512>; + using InstructionShape = GemmShape<16, 8, 256>; + static int const kStages = 3; + + using EpilogueOutputOp = epilogue::thread::LinearCombinationClamp< + ElementC, 128 / sizeof_bits::value, int32_t, float>; + + using Operator = arch::OpMultiplyAdd; +}; + +//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// } // namespace device } // namespace gemm diff --git a/include/cutlass/gemm/device/gemm.h b/include/cutlass/gemm/device/gemm.h index c91aac20..70383e15 100644 --- a/include/cutlass/gemm/device/gemm.h +++ b/include/cutlass/gemm/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
 * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -193,7 +193,7 @@ template < ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator typename ThreadblockSwizzle_ = - typename threadblock::GemmIdentityThreadblockSwizzle, + typename threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle, + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration; diff --git a/include/cutlass/gemm/device/gemm_splitk_parallel.h b/include/cutlass/gemm/device/gemm_splitk_parallel.h index df11ba5b..73f1c240 100644 --- a/include/cutlass/gemm/device/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/device/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/device/gemm_universal.h b/include/cutlass/gemm/device/gemm_universal.h index 4b57fa0d..09129090 100644 --- a/include/cutlass/gemm/device/gemm_universal.h +++ b/include/cutlass/gemm/device/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
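The swizzle edits in these hunks follow from GemmIdentityThreadblockSwizzle becoming a class template, presumably with a defaulted interleaving parameter, so default template arguments now need the empty angle brackets. A minimal sketch under that assumption:

#include "cutlass/gemm/threadblock/threadblock_swizzle.h"

// Both aliases name the same swizzle once the class is a template whose
// parameter (assumed here to default to 1) is left defaulted.
using DefaultSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;
using SameSwizzle    = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>;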
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -89,7 +89,7 @@ template < OperatorClass_, ArchTag_, ElementA_, ElementB_, ElementC_, ElementAccumulator_>::EpilogueOutputOp, /// Threadblock-level swizzling operator - typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle, + typename ThreadblockSwizzle_ = threadblock::GemmIdentityThreadblockSwizzle<>, /// Number of stages used in the pipelined mainloop int Stages = DefaultGemmConfiguration + struct MapArguments { + using ElementA = ElementA_; + using LayoutA = LayoutA_; + static ComplexTransform const kTransformA = TransformA; + static int const kAlignmentA = AlignmentA; + using ElementB = ElementB_; + using LayoutB = LayoutB_; + static ComplexTransform const kTransformB = TransformB; + static int const kAlignmentB = AlignmentB; + using LayoutC = LayoutC_; + }; + + template < + typename ElementA_, + typename LayoutA_, + ComplexTransform TransformA, + int AlignmentA, + typename ElementB_, + typename LayoutB_, + ComplexTransform TransformB, + int AlignmentB, + typename LayoutC_ + > + struct MapArguments< + ElementA_, + LayoutA_, + TransformA, + AlignmentA, + ElementB_, + LayoutB_, + TransformB, + AlignmentB, + LayoutC_, + true + > { + using ElementA = ElementB_; + using LayoutA = typename layout::LayoutTranspose::type; + static ComplexTransform const kTransformA = TransformB; + static int const kAlignmentA = AlignmentB; + using ElementB = ElementA_; + using LayoutB = typename layout::LayoutTranspose::type; + static ComplexTransform const kTransformB = TransformA; + static int const kAlignmentB = AlignmentA; + using LayoutC = typename layout::LayoutTranspose::type; + }; + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + template class GemmUniversalAdapter { public: using GemmKernel = GemmKernel_; - static_assert(std::is_same::value, - "Universal adapter expects the kernel to be row-major and transposes its arguments."); + static bool const kInternalTranspose = + std::is_same::value; using ThreadblockShape = typename GemmKernel::Mma::Shape; using WarpShape = typename GemmKernel::WarpShape; @@ -56,26 +120,39 @@ public: using OperatorClass = typename GemmKernel::OperatorClass; using ArchTag = typename GemmKernel::ArchTag; - + // Type, layout, and complex transform deliberately exchanged with B - using ElementA = typename GemmKernel::ElementB; - using LayoutA = typename layout::LayoutTranspose::type; - using TensorRefA = TensorRef; - static ComplexTransform const kTransformA = GemmKernel::kTransformB; + using MapArguments = detail::MapArguments< + typename GemmKernel::ElementA, + typename GemmKernel::LayoutA, + GemmKernel::kTransformA, + GemmKernel::kAlignmentA, + typename GemmKernel::ElementB, + typename GemmKernel::LayoutB, + GemmKernel::kTransformB, + GemmKernel::kAlignmentB, + typename GemmKernel::LayoutC, + kInternalTranspose + >; + + using ElementA = typename MapArguments::ElementA; + using LayoutA = typename MapArguments::LayoutA; + static ComplexTransform const kTransformA = MapArguments::kTransformA; static int const kAlignmentA = GemmKernel::kAlignmentA; - // Type, layout, and complex transform deliberately exchanged with A - using ElementB = typename GemmKernel::ElementA; - using LayoutB = typename layout::LayoutTranspose::type; - using TensorRefB = TensorRef; - static ComplexTransform const kTransformB = GemmKernel::kTransformA; + using ElementB = typename 
MapArguments::ElementB; + using LayoutB = typename MapArguments::LayoutB; + static ComplexTransform const kTransformB = MapArguments::kTransformB; static int const kAlignmentB = GemmKernel::kAlignmentB; - + using ElementC = typename GemmKernel::ElementC; - using LayoutC = cutlass::layout::ColumnMajor; + using LayoutC = typename MapArguments::LayoutC; + static int const kAlignmentC = GemmKernel::kAlignmentC; + + using TensorRefA = TensorRef; + using TensorRefB = TensorRef; using TensorRefC = TensorRef; using TensorRefD = TensorRef; - static int const kAlignmentC = GemmKernel::kAlignmentC; using ElementAccumulator = typename GemmKernel::Mma::Policy::Operator::ElementC; @@ -99,7 +176,12 @@ public: /// Helper to construct a transposed equivalent for the underying GEMM operator static Arguments to_underlying_arguments(Arguments const &args) { - return args.transposed_problem(); + if (kInternalTranspose) { + return args.transposed_problem(); + } + else { + return args; + } } /// Determines whether the GEMM can execute the given problem. diff --git a/include/cutlass/gemm/device/gemm_universal_base.h b/include/cutlass/gemm/device/gemm_universal_base.h index de0ee183..18ccb346 100644 --- a/include/cutlass/gemm/device/gemm_universal_base.h +++ b/include/cutlass/gemm/device/gemm_universal_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/gemm.h b/include/cutlass/gemm/gemm.h index 011e03c9..78d0a6da 100644 --- a/include/cutlass/gemm/gemm.h +++ b/include/cutlass/gemm/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -400,7 +400,8 @@ enum class GemmUniversalMode { kGemm, kGemmSplitKParallel, kBatched, - kArray + kArray, + kInvalid }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/default_gemm.h b/include/cutlass/gemm/kernel/default_gemm.h index f3f6a149..0aba2d3a 100644 --- a/include/cutlass/gemm/kernel/default_gemm.h +++ b/include/cutlass/gemm/kernel/default_gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
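The kInternalTranspose/MapArguments machinery above replaces the old adapter's unconditional transpose with a conditional one: a column-major C problem is run as its row-major transpose, C^T = B^T * A^T, which is why A and B are exchanged and every layout is transposed only in that case. A stripped-down sketch of the rule; the trait name below is illustrative, not from the library:

#include <type_traits>
#include "cutlass/layout/matrix.h"

template <typename GemmKernel>
struct NeedsInternalTranspose {
  // Transpose only when the underlying kernel emits column-major C.
  static bool const value =
      std::is_same<typename GemmKernel::LayoutC,
                   cutlass::layout::ColumnMajor>::value;
};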
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,6 +49,7 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" #include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/threadblock_swizzle.h" @@ -116,6 +117,68 @@ template < struct DefaultGemm; //////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultGemm { + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape, WarpShape, InstructionShape, Stages, + Operator>::ThreadblockMma; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + EpilogueOutputOp::kCount>::Epilogue; + + /// Define the kernel-level GEMM operator. 
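kPartitionsK above is how many warps cooperate along the K dimension of a threadblock tile; with the integer defaults quoted earlier in this patch (threadblock K of 64 against a warp K of 64) it is 1, and it only exceeds 1 when the warp tile covers a smaller K slice. A small compile-time illustration using assumed tile shapes:

#include "cutlass/gemm/gemm.h"

using Threadblock = cutlass::gemm::GemmShape<128, 256, 64>;
using Warp        = cutlass::gemm::GemmShape<64, 64, 64>;
static_assert(Threadblock::kK / Warp::kK == 1, "a single K partition");

using WarpHalfK = cutlass::gemm::GemmShape<64, 64, 32>;
static_assert(Threadblock::kK / WarpHalfK::kK == 2, "two warps split the K extent");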
+ using GemmKernel = kernel::Gemm; +}; +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Turing Architecture template < /// Element type for A matrix operand @@ -201,6 +264,75 @@ struct DefaultGemm< }; //////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Ampere Integer Matrix Multiply Interleaved layout +template < + /// Element type for A matrix operand + typename ElementA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Number of Interleaved k + int InterleavedK, + /// If true, kernel is configured to support serial reduction in the + /// epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator, + /// Is Beta zero or not + bool IsBetaZero> +struct DefaultGemm< + ElementA, layout::ColumnMajorInterleaved, kAlignmentA, + ElementB, layout::RowMajorInterleaved, kAlignmentB, ElementC, + layout::ColumnMajorInterleaved, int32_t, + arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, WarpShape, + InstructionShape, EpilogueOutputOp, ThreadblockSwizzle, Stages, + SplitKSerial, Operator, IsBetaZero> { + using LayoutA = layout::ColumnMajorInterleaved; + using LayoutB = layout::RowMajorInterleaved; + using LayoutC = layout::ColumnMajorInterleaved; + + using ElementAccumulator = int32_t; + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, LayoutC, arch::OpClassTensorOp, arch::Sm80, + ThreadblockShape, WarpShape, InstructionShape, Stages, Operator, + true>::ThreadblockMma; + + static const int kPartitionsK = ThreadblockShape::kK / WarpShape::kK; + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock:: + DefaultInterleavedEpilogueTensorOp< + ThreadblockShape, typename Mma::Operator, kPartitionsK, EpilogueOutputOp, + 64 / sizeof_bits::value, InterleavedK, + IsBetaZero>::Epilogue; + + /// Define the kernel-level GEMM operator. 
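The interleaved epilogue above sizes its output accesses at 64 bits' worth of C elements per access, narrower than the 128-bit mainloop accesses. The arithmetic for two common output types:

#include "cutlass/numeric_types.h"

// 64-bit epilogue accesses: eight int8 outputs or two int32 outputs at a time.
static_assert(64 / cutlass::sizeof_bits<int8_t>::value == 8, "");
static_assert(64 / cutlass::sizeof_bits<int32_t>::value == 2, "");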
+ using GemmKernel = kernel::Gemm; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Partial specialization for Turing Integer Matrix Multiply Interleaved layout template < /// Element type for A matrix operand @@ -439,6 +571,80 @@ struct DefaultGemm< //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of A matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages + int Stages, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial, + /// Operation performed by GEMM + typename Operator> +struct DefaultGemm, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + SplitKSerial, + Operator> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMma< + ElementA, LayoutA, kAlignmentA, ElementB, LayoutB, kAlignmentB, + ElementAccumulator, layout::RowMajor, arch::OpClassSimt, arch::Sm80, + ThreadblockShape, WarpShape, GemmShape<1, 1, 1>, Stages, + Operator>::ThreadblockMma; + + static int const kEpilogueElementsPerAccess = EpilogueOutputOp::kCount; + static_assert(kEpilogueElementsPerAccess == 1, "simt epilogue must operate on scalars"); + + /// Define the epilogue + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueSimt< + ThreadblockShape, + typename Mma::Operator, + EpilogueOutputOp, + kEpilogueElementsPerAccess + >::Epilogue; + + /// Define the kernel-level GEMM operator. + using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for SIMT DP4A @@ -516,7 +722,6 @@ struct DefaultGemm; }; - #if defined(CUTLASS_ARCH_WMMA_ENABLED) //////////////////////////////////////////////////////////////////////////////// /// Partial specialization for Wmma Gemm Kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_complex.h b/include/cutlass/gemm/kernel/default_gemm_complex.h index a9ef4e31..15b1430c 100644 --- a/include/cutlass/gemm/kernel/default_gemm_complex.h +++ b/include/cutlass/gemm/kernel/default_gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,7 +49,9 @@ #include "cutlass/gemm/kernel/gemm_pipelined.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" #include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex.h" #include "cutlass/gemm/threadblock/default_mma_core_simt.h" #include "cutlass/gemm/threadblock/threadblock_swizzle.h" #include "cutlass/epilogue/threadblock/default_epilogue_complex_tensor_op.h" @@ -101,6 +103,7 @@ template < /// Complex elementwise transformation on B operand ComplexTransform TransformB, /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) typename Operator, /// If true, kernel is configured to support serial reduction in the epilogue bool SplitKSerial @@ -109,6 +112,64 @@ struct DefaultGemmComplex; //////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for Ampere Architecture +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for C and D matrix operands + typename ElementC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Multiply-add operator + // (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator, + /// If true, kernel is configured to support serial reduction in the epilogue + bool SplitKSerial + > +struct DefaultGemmComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementC, + layout::RowMajor, ElementAccumulator, arch::OpClassTensorOp, + arch::Sm80, ThreadblockShape, WarpShape, InstructionShape, + EpilogueOutputOp, ThreadblockSwizzle, Stages, TransformA, TransformB, Operator, SplitKSerial> { + + /// Define the threadblock-scoped matrix multiply-accumulate + using Mma = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplex< + ElementA, LayoutA, ElementB, LayoutB, ElementAccumulator, + layout::RowMajor, arch::OpClassTensorOp, arch::Sm80, ThreadblockShape, + WarpShape, InstructionShape, Stages, TransformA, TransformB, Operator>::ThreadblockMma; + + /// Define the epilogue + using Epilogue = + typename cutlass::epilogue::threadblock::DefaultEpilogueComplexTensorOp< + ThreadblockShape, typename Mma::Operator, 1, EpilogueOutputOp, + EpilogueOutputOp::kCount, Operator>::Epilogue; + + /// Define the kernel-level GEMM operator. 
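This specialization routes complex-valued GEMMs on SM80 through the new multistage complex mainloop and the complex tensor-op epilogue. For illustration, one possible device-level instantiation, assuming cutlass::gemm::device::GemmComplex accepts defaults for everything after the architecture tag on SM80:

#include "cutlass/complex.h"
#include "cutlass/arch/arch.h"
#include "cutlass/layout/matrix.h"
#include "cutlass/gemm/device/gemm_complex.h"

using ComplexGemmSm80 = cutlass::gemm::device::GemmComplex<
    cutlass::complex<float>, cutlass::layout::ColumnMajor,   // A
    cutlass::complex<float>, cutlass::layout::ColumnMajor,   // B
    cutlass::complex<float>, cutlass::layout::RowMajor,      // C and D
    cutlass::complex<float>,                                 // accumulator
    cutlass::arch::OpClassTensorOp,
    cutlass::arch::Sm80>;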
+ using GemmKernel = kernel::Gemm; +}; + //////////////////////////////////////////////////////////////////////////////// } // namespace kernel diff --git a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h index 3664fece..87008483 100644 --- a/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_planar_complex_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -49,6 +49,7 @@ #include "cutlass/epilogue/threadblock/default_epilogue_planar_complex.h" #include "cutlass/gemm/threadblock/default_mma_planar_complex_pipelined.h" +#include "cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h" ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -222,6 +223,122 @@ struct DefaultGemmPlanarComplexUniversal< ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization for multiple pipeline stages. +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Complex elementwise transformation on A operand + ComplexTransform TransformA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Complex elementwise transformation on B operand + ComplexTransform TransformB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for C and D matrix operands + typename ElementC, + /// Layout type for C and D matrix operands + typename LayoutC, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Operator class tag + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Warp-level tile size (concept: GemmShape) + typename InstructionShape, + /// Epilogue output operator + typename EpilogueOutputOp, + /// Threadblock-level swizzling operator + typename ThreadblockSwizzle, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Operation performed by GEMM + typename Operator + > +struct DefaultGemmPlanarComplexUniversal< + ElementA, + LayoutA, + TransformA, + kAlignmentA, + ElementB, + LayoutB, + TransformB, + kAlignmentB, + ElementC, + LayoutC, + ElementAccumulator, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + EpilogueOutputOp, + ThreadblockSwizzle, + Stages, + Operator, + typename std::enable_if<(Stages > 2)>::type +> { + + /// Define planar complex valued variants instead + using Mma = typename gemm::threadblock::DefaultMmaPlanarComplexMultistage< + ElementA, + LayoutA, + kAlignmentA, + ElementB, + LayoutB, + kAlignmentB, + ElementAccumulator, + LayoutC, + OperatorClass, + ArchTag, + ThreadblockShape, + WarpShape, + InstructionShape, + 
Stages, + TransformA, + TransformB, + Operator + >::ThreadblockMma; + + /// Planar complex epilogue + using Epilogue = typename epilogue::threadblock::DefaultEpiloguePlanarComplex< + ThreadblockShape, + typename Mma::Policy::Operator, + OperatorClass, + ArchTag, + ThreadblockShape::kK / WarpShape::kK, + EpilogueOutputOp, + EpilogueOutputOp::kCount + >::Epilogue; + + /// Define the kernel in terms of the default kernel + using GemmKernel = kernel::GemmPlanarComplex< + Mma, + Epilogue, + ThreadblockSwizzle + >; + + // Array variant + using GemmArrayKernel = kernel::GemmPlanarComplexArray< + Mma, + Epilogue, + ThreadblockSwizzle + >; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace kernel } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h index f50ead04..e23965d3 100644 --- a/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/default_gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemm_universal.h b/include/cutlass/gemm/kernel/default_gemm_universal.h index 23db577c..579005cb 100644 --- a/include/cutlass/gemm/kernel/default_gemm_universal.h +++ b/include/cutlass/gemm/kernel/default_gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/default_gemv.h b/include/cutlass/gemm/kernel/default_gemv.h index 08a30790..36ae339c 100755 --- a/include/cutlass/gemm/kernel/default_gemv.h +++ b/include/cutlass/gemm/kernel/default_gemv.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm.h b/include/cutlass/gemm/kernel/gemm.h index 36cf6731..6700659a 100644 --- a/include/cutlass/gemm/kernel/gemm.h +++ b/include/cutlass/gemm/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
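The planar-complex specialization above is chosen purely by stage count: the existing pipelined path handles two stages, while the std::enable_if<(Stages > 2)> partial specialization routes deeper pipelines to the multistage mainloop. The same SFINAE dispatch idiom in miniature, with illustrative names:

#include <type_traits>

template <int Stages, typename Enable = void>
struct MainloopFor {
  static bool const kMultistage = false;   // two-stage pipelined mainloop
};

template <int Stages>
struct MainloopFor<Stages, typename std::enable_if<(Stages > 2)>::type> {
  static bool const kMultistage = true;    // SM80 multistage mainloop
};

static_assert(!MainloopFor<2>::kMultistage, "two stages stay on the pipelined path");
static_assert( MainloopFor<3>::kMultistage, "three or more stages go multistage");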
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_array.h b/include/cutlass/gemm/kernel/gemm_array.h index 30ff1d30..f63571b0 100644 --- a/include/cutlass/gemm/kernel/gemm_array.h +++ b/include/cutlass/gemm/kernel/gemm_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_batched.h b/include/cutlass/gemm/kernel/gemm_batched.h index 8bf4354a..eb638375 100644 --- a/include/cutlass/gemm/kernel/gemm_batched.h +++ b/include/cutlass/gemm/kernel/gemm_batched.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_pipelined.h b/include/cutlass/gemm/kernel/gemm_pipelined.h index 293592e7..6caa0eae 100644 --- a/include/cutlass/gemm/kernel/gemm_pipelined.h +++ b/include/cutlass/gemm/kernel/gemm_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex.h b/include/cutlass/gemm/kernel/gemm_planar_complex.h index 3d975bb2..e0511256 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -421,6 +421,13 @@ public: cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + int offset_k = 0; int problem_size_k = params.problem_size.k(); diff --git a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h index efb500b2..00841d46 100644 --- a/include/cutlass/gemm/kernel/gemm_planar_complex_array.h +++ b/include/cutlass/gemm/kernel/gemm_planar_complex_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. 
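The early-exit guard added to the planar-complex kernels protects against threadblock swizzles that launch a grid slightly larger than the tiled problem: any CTA whose tile offset falls outside grid_tiled_shape returns before touching memory. The shape of the check in isolation, with placeholder names standing in for the kernel's real members:

#include "cutlass/gemm/gemm.h"

__device__ void mainloop_guard_example(cutlass::gemm::GemmCoord grid_tiled_shape,
                                       cutlass::gemm::GemmCoord tile_offset) {
  if (grid_tiled_shape.m() <= tile_offset.m() ||
      grid_tiled_shape.n() <= tile_offset.n()) {
    return;   // this CTA lies beyond the last tile in M or N; do no work
  }
  // ... proceed with the mainloop and epilogue ...
}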
All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -377,6 +377,14 @@ public: ThreadblockSwizzle threadblock_swizzle; cutlass::gemm::GemmCoord threadblock_tile_offset = threadblock_swizzle.get_tile_offset(); + + // Early exit if CTA is out of range + if (params.grid_tiled_shape.m() <= threadblock_tile_offset.m() || + params.grid_tiled_shape.n() <= threadblock_tile_offset.n()) { + + return; + } + int batch_idx = threadblock_tile_offset.k(); int problem_size_m = params.problem_size.m(); diff --git a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h index 2c5978aa..97389752 100644 --- a/include/cutlass/gemm/kernel/gemm_splitk_parallel.h +++ b/include/cutlass/gemm/kernel/gemm_splitk_parallel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/kernel/gemm_universal.h b/include/cutlass/gemm/kernel/gemm_universal.h index 11831d8d..6efd50a7 100644 --- a/include/cutlass/gemm/kernel/gemm_universal.h +++ b/include/cutlass/gemm/kernel/gemm_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ public: using OperatorClass = typename Mma::Operator::OperatorClass; using ThreadblockShape = typename Mma::Shape; using WarpShape = typename Mma::Operator::Shape; - using InstructionShape = typename Mma::Policy::Operator::Shape; + using InstructionShape = typename Mma::Policy::Operator::InstructionShape; using ArchTag = typename Mma::ArchTag; static int const kStages = Mma::kStages; @@ -259,9 +259,9 @@ public: Arguments const &args, void *workspace = nullptr) { - ptr_A = args.ptr_A; - ptr_B = args.ptr_B; - ptr_C = args.ptr_C; + ptr_A = const_cast(args.ptr_A); + ptr_B = const_cast(args.ptr_B); + ptr_C = const_cast(args.ptr_C); ptr_D = args.ptr_D; output_op = args.epilogue; @@ -303,6 +303,10 @@ public: return Status::kSuccess; } + static Status can_implement(Arguments const &args) { + return can_implement(args.problem_size); + } + /// Executes one GEMM CUTLASS_DEVICE void operator()(Params const ¶ms, SharedStorage &shared_storage) { diff --git a/include/cutlass/gemm/kernel/gemv_batched_strided.h b/include/cutlass/gemm/kernel/gemv_batched_strided.h index 852edde2..ea8d9bdf 100755 --- a/include/cutlass/gemm/kernel/gemv_batched_strided.h +++ b/include/cutlass/gemm/kernel/gemv_batched_strided.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
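gemm_universal.h also picks up a can_implement overload that accepts the whole Arguments structure, so host code no longer needs to extract problem_size itself. A possible host-side helper, assuming a kernel or device-level type that exposes a nested Arguments type and the new static overload:

#include "cutlass/cutlass.h"

template <typename Gemm>
bool can_run(typename Gemm::Arguments const &args) {
  // Returns true only if this instantiation supports the requested problem.
  return Gemm::can_implement(args) == cutlass::Status::kSuccess;
}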
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma.h b/include/cutlass/gemm/thread/mma.h index 41ea8b49..15dfe433 100644 --- a/include/cutlass/gemm/thread/mma.h +++ b/include/cutlass/gemm/thread/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm50.h b/include/cutlass/gemm/thread/mma_sm50.h index 78c77bef..04658f7b 100644 --- a/include/cutlass/gemm/thread/mma_sm50.h +++ b/include/cutlass/gemm/thread/mma_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm60.h b/include/cutlass/gemm/thread/mma_sm60.h index 66fed7e1..16d0d61c 100644 --- a/include/cutlass/gemm/thread/mma_sm60.h +++ b/include/cutlass/gemm/thread/mma_sm60.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/thread/mma_sm61.h b/include/cutlass/gemm/thread/mma_sm61.h index 13bbb542..83e31b23 100644 --- a/include/cutlass/gemm/thread/mma_sm61.h +++ b/include/cutlass/gemm/thread/mma_sm61.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_gemv_core.h b/include/cutlass/gemm/threadblock/default_gemv_core.h index de234b85..9d692d6d 100755 --- a/include/cutlass/gemm/threadblock/default_gemv_core.h +++ b/include/cutlass/gemm/threadblock/default_gemv_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma.h b/include/cutlass/gemm/threadblock/default_mma.h index 11af1de4..3ebe14e6 100644 --- a/include/cutlass/gemm/threadblock/default_mma.h +++ b/include/cutlass/gemm/threadblock/default_mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -38,6 +38,8 @@ #include "cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h" #include "cutlass/gemm/threadblock/default_mma_core_sm70.h" #include "cutlass/gemm/threadblock/default_mma_core_sm75.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" + #if defined(CUTLASS_ARCH_WMMA_ENABLED) #include "cutlass/gemm/threadblock/default_mma_core_wmma.h" #endif //CUTLASS_ARCH_WMMA_ENABLED @@ -203,6 +205,58 @@ struct DefaultMma +struct DefaultMma { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, float, LayoutA, float, + LayoutB, float, layout::RowMajor, arch::OpClassTensorOp, 2, + arch::OpMultiplyAddFastF16>; + + // Define iterators over tiles from the A operand + using IteratorA = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + float, LayoutA, 1, typename MmaCore::IteratorThreadMapA, kAlignmentA>; + + // Define iterators over tiles from the B operand + using IteratorB = + cutlass::transform::threadblock::PredicatedTileIterator< + cutlass::MatrixShape, + float, LayoutB, 0, typename MmaCore::IteratorThreadMapB, kAlignmentB>; + + // Define the threadblock-scoped pipelined matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaPipelined< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + IteratorB, typename MmaCore::SmemIteratorB, float, + layout::RowMajor, typename MmaCore::MmaPolicy>; +}; + +//////////////////////////////////////////////////////////////////////////////// + /// Specialization for column-major-interleaved output template < /// Element type for A matrix operand @@ -271,6 +325,214 @@ struct DefaultMma +struct DefaultMma { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassSimt, + Stages, Operator>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = 
cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output (OperatorClass TensorOp) +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation perfomed by GEMM + typename Operator + > +struct DefaultMma { + static cutlass::arch::CacheOperation::Kind const CacheOpA = + ((sizeof_bits::value * kAlignmentA) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + static cutlass::arch::CacheOperation::Kind const CacheOpB = + ((sizeof_bits::value * kAlignmentB) == 128) + ? cutlass::arch::CacheOperation::Global + : cutlass::arch::CacheOperation::Always; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, arch::OpClassTensorOp, + Stages, Operator, false, CacheOpA, CacheOpB>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for column-major-interleaved output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Access granularity of 
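The CacheOpA/CacheOpB selection above keys on whether one thread's access spans a full 128 bits: Ampere's cp.async with the global-level caching hint only supports 16-byte copies, so narrower accesses fall back to caching at all levels. A few representative element and alignment combinations:

#include "cutlass/numeric_types.h"

// 128-bit per-thread accesses qualify for CacheOperation::Global.
static_assert(cutlass::sizeof_bits<cutlass::half_t>::value * 8 == 128,
              "half_t with alignment 8 uses the global-level (cg) copy");
static_assert(cutlass::sizeof_bits<float>::value * 4 == 128,
              "float with alignment 4 uses the global-level (cg) copy");
// Narrower accesses keep CacheOperation::Always.
static_assert(cutlass::sizeof_bits<cutlass::half_t>::value * 4 == 64,
              "half_t with alignment 4 falls back to the all-levels (ca) copy");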
B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Operation performed by GEMM + typename Operator, + /// Number of Interleaved K + int InterleavedK> +struct DefaultMma, OperatorClass, + ArchTag, ThreadblockShape, WarpShape, InstructionShape, + Stages, Operator, true> { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, + layout::ColumnMajorInterleaved, OperatorClass, Stages, + Operator, true>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + //////////////////////////////////////////////////////////////////////////////// /// Specialization for SIMT IDP4A Kernels diff --git a/include/cutlass/gemm/threadblock/default_mma_core.h b/include/cutlass/gemm/threadblock/default_mma_core.h index f346709e..a7ac7c44 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core.h +++ b/include/cutlass/gemm/threadblock/default_mma_core.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "cutlass/gemm/warp/mma.h" #include "cutlass/gemm/threadblock/mma_pipelined.h" #include "cutlass/gemm/threadblock/mma_singlestage.h" +#include "cutlass/arch/cache_operation.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -86,6 +88,17 @@ template < /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
bool AccumulatorsInRowMajor = false + /// Cache operation of operand A + , cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global, + /// per-element transformation for elements of A + ComplexTransform TransformA = ComplexTransform::kNone, + /// per-element transformation for elements of B + ComplexTransform TransformB = ComplexTransform::kNone, + bool IsComplex = false // (is_complex::value || is_complex::value) > struct DefaultMmaCore; diff --git a/include/cutlass/gemm/threadblock/default_mma_core_simt.h b/include/cutlass/gemm/threadblock/default_mma_core_simt.h index 9eaa6a7a..be501493 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_simt.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm50.h b/include/cutlass/gemm/threadblock/default_mma_core_sm50.h index 37aee476..782cd7ae 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm50.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm50.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h index a9ec80fd..30b3b3c0 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm70.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h index 51b5878f..e7a2adcb 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_sm75.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
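Because the parameters appended to DefaultMmaCore above (the cache operations, the per-operand complex transforms, and IsComplex) are all defaulted and added at the end of the parameter list, existing specializations and call sites keep compiling unchanged. A reduced sketch of that extension pattern, using an illustrative stand-in name rather than the real signature:

#include "cutlass/complex.h"
#include "cutlass/arch/cache_operation.h"

template <
  typename Shape,
  typename WarpShape,
  // ... the pre-existing parameters continue here ...
  bool AccumulatorsInRowMajor = false,
  // new, all defaulted, so older instantiations are unaffected:
  cutlass::arch::CacheOperation::Kind CacheOpA = cutlass::arch::CacheOperation::Global,
  cutlass::arch::CacheOperation::Kind CacheOpB = cutlass::arch::CacheOperation::Global,
  cutlass::ComplexTransform TransformA = cutlass::ComplexTransform::kNone,
  cutlass::ComplexTransform TransformB = cutlass::ComplexTransform::kNone,
  bool IsComplex = false>
struct SomeMmaCore;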
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -598,6 +598,523 @@ struct DefaultMmaCore +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::ColumnMajor; + using ElementB = float; + using LayoutB = layout::RowMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // Shared memory layout + using SmemLayoutB = + layout::RowMajorTensorOpMultiplicandCongruous::value, + int(128 / sizeof(half_t))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 1, + IteratorThreadMapA + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 0, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using 
InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::RowMajor; + using ElementB = float; + using LayoutB = layout::ColumnMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = + layout::RowMajorTensorOpMultiplicandCrosswise::value, + Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 0, + IteratorThreadMapA + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 1, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of 
accumulator + typename LayoutC_> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::RowMajor; + using ElementB = float; + using LayoutB = layout::RowMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape< + Shape::kM / WarpShape::kM, + Shape::kN / WarpShape::kN, + Shape::kK / WarpShape::kK + >; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && + !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size." + ); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutA, + 0, + IteratorThreadMapA + >; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, + half_t, + SmemLayoutB, + 0, + IteratorThreadMapB + >; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaTensorOp, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_> +struct DefaultMmaCore { + 
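// A minimal worked example of the warp-count arithmetic these specializations rely on,
// assuming a hypothetical 128x128x16 threadblock tile and a 64x64x16 warp tile
// (illustrative values only):
//
//   WarpCount = GemmShape<128/64, 128/64, 16/16> = GemmShape<2, 2, 1>
//   kThreads  = WarpCount::kCount * kWarpSize    = 4 * 32 = 128
//
// The divisibility static_assert below constrains only the M and N extents; the warp
// tile's K extent is expected to divide Shape::kK evenly in the default configurations.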
using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = float; + using LayoutA = layout::ColumnMajor; + using ElementB = float; + using LayoutB = layout::ColumnMajor; + using ElementC = float; + using LayoutC = LayoutC_; + using OperatorClass = arch::OpClassTensorOp; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 256; + + /// Default Operator + using Operator = arch::OpMultiplyAdd; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(half_t))>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileIterator< + MatrixShape, half_t, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileIterator< + MatrixShape, half_t, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, half_t, SmemLayoutA, half_t, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, MatrixShape<0, 0>, + WarpCount::kK>; +}; + //////////////////////////////////////////////////////////////////////////////// /// Partial specialization: diff --git a/include/cutlass/gemm/threadblock/default_mma_core_sm80.h b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h new file mode 100644 index 00000000..d9b3d9a0 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_mma_core_sm80.h @@ -0,0 +1,2130 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_mma_core.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h" +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of 
operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::ColumnMajor; + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +/// Partial specialization for double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of 
operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::ColumnMajor; + using ElementB = double; + using LayoutB = layout::RowMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by 
MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::RowMajor; + using ElementB = double; + using LayoutB = layout::ColumnMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for double-precision +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + 
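// Note: in the double-precision (and complex-valued) specializations of this file, the
// CacheOpA / CacheOpB template parameters are accepted for interface uniformity, but the
// member constants kCacheOpA and kCacheOpB are pinned to
// cutlass::arch::CacheOperation::Always in the struct bodies.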
cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = double; + using LayoutA = layout::RowMajor; + using ElementB = double; + using LayoutB = layout::RowMajor; + using ElementC = double; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float-precision +/// +/// ElementA: complex +/// ElementB: complex +/// ElementC: complex +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout for A operand + typename LayoutA_, + /// Layout for B operand + typename LayoutB_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename 
Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA_, + /// per-element transformation for elements of B + ComplexTransform TransformB_ + > +struct DefaultMmaCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, LayoutA_, + complex, LayoutB_, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + Operator_, + false, + CacheOpA, + CacheOpB, + TransformA_, TransformB_, true> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = LayoutA_; + using ElementB = complex; + using LayoutB = LayoutB_; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + static const ComplexTransform TransformA = TransformA_; + static const ComplexTransform TransformB = TransformB_; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + static_assert( + platform::is_same::value || + platform::is_same::value, + "The operator tag must indicate complex multiplication."); + + // + // Underlying template + // + + using MmaComplexCore = DefaultMultistageMmaComplexCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + arch::OpClassTensorOp, + kStages, + TransformA, + TransformB, + Operator, + kCacheOpA, + kCacheOpB + >; + + // + // Shared memory layouts + // + + using SmemLayoutA = typename MmaComplexCore::SmemLayoutA; + + // Shared memory layout + using SmemLayoutB = typename MmaComplexCore::SmemLayoutB; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = typename MmaComplexCore::SmemIteratorA; + + /// ThreadMap of iterator B + using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = typename MmaComplexCore::SmemIteratorB; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename MmaComplexCore::MmaTensorOp; + + /// Policy used to define MmaPipelined + using MmaPolicy = typename MmaComplexCore::MmaPolicy; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for double-precision +/// +/// ElementA: complex +/// ElementB: complex +/// ElementC: complex +/// Operator: tensor op class +/// +/// This uses the default warp-level operator 
given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout for A operand + typename LayoutA_, + /// Layout for B operand + typename LayoutB_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// per-element transformation for elements of A + ComplexTransform TransformA_, + /// per-element transformation for elements of B + ComplexTransform TransformB_ + > +struct DefaultMmaCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, LayoutA_, + complex, LayoutB_, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + Operator_, + false, + CacheOpA, + CacheOpB, + TransformA_, TransformB_, true> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = LayoutA_; + using ElementB = complex; + using LayoutB = LayoutB_; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + static const ComplexTransform TransformA = TransformA_; + static const ComplexTransform TransformB = TransformB_; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 64; + + /// Default Operator + using Operator = Operator_; + + static_assert( + platform::is_same::value || + platform::is_same::value, + "The operator tag must indicate complex multiplication."); + + // + // Underlying template + // + + using MmaComplexCore = DefaultMultistageMmaComplexCore< + Shape, WarpShape, InstructionShape, + ElementA, LayoutA, + ElementB, LayoutB, + ElementC, LayoutC, + arch::OpClassTensorOp, + kStages, + TransformA, + TransformB, + Operator, + kCacheOpA, + kCacheOpB + >; + + // + // Shared memory layouts + // + + using SmemLayoutA = typename MmaComplexCore::SmemLayoutA; + + // Shared memory layout + using SmemLayoutB = typename MmaComplexCore::SmemLayoutB; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = typename MmaComplexCore::IteratorThreadMapA; + + /// Shared memory iterator to A operand + using SmemIteratorA = typename MmaComplexCore::SmemIteratorA; + + /// ThreadMap of iterator B + using IteratorThreadMapB = typename MmaComplexCore::IteratorThreadMapB; + + /// Shared memory iterator to B operand + using SmemIteratorB = typename MmaComplexCore::SmemIteratorB; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + 
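// For context, a short sketch of why the complex-valued specializations delegate their
// layouts and iterators to DefaultMultistageMmaComplexCore rather than re-deriving them
// here: the warp-level operator expands each complex multiply-accumulate into four
// real-valued tensor-op multiply-adds. Assuming d = a * b + c with complex operands:
//
//   real(d) = real(a) * real(b) - imag(a) * imag(b) + real(c)
//   imag(d) = real(a) * imag(b) + imag(a) * real(b) + imag(c)
//
// with signs adjusted according to the ComplexTransform applied to A and B.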
using MmaTensorOp = typename MmaComplexCore::MmaTensorOp; + + /// Policy used to define MmaPipelined + using MmaPolicy = typename MmaComplexCore::MmaPolicy; +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementA))>; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementB))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix 
multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using 
SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major +/// B: column-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousB = + Shape::kK / (kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedB = + kWarpSize / kWarpThreadArrangementContiguousB; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementA))>; + + // Shared memory layout + using SmemLayoutB = 
layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: row-major +/// B: row-major +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kWarpThreadArrangementContiguousA = + Shape::kK / 
(kAccessSizeInBits / sizeof_bits::value); + + static int const kWarpThreadArrangementStridedA = + kWarpSize / kWarpThreadArrangementContiguousA; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, Shape::kK>; + + // Shared memory layout + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous< + sizeof_bits::value, int(128 / sizeof(ElementB))>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, WarpCount::kK>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization: +/// +/// A: column-major-interleaved +/// B: row-major-interleaved +/// Operator: tensor op class +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by MMA + typename Operator_, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
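// A minimal sketch of what "interleaved" means for the operands below, assuming int8_t
// data and InterleavedK == 32: layout::ColumnMajorInterleaved<32> groups 32 columns into
// a panel and stores the 32 elements of each row within that panel contiguously, so a
// logical coordinate (row, col) maps roughly to
//
//   offset = (col / 32) * ldm + row * 32 + (col % 32)
//
// where ldm is the stride of one 32-column panel. Row-major accumulators pair with this
// interleaved output path, per the comment above.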
+ bool AccumulatorsInRowMajor, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Number of interleaved K + int InterleavedK> +struct DefaultMmaCore, ElementB_, + layout::RowMajorInterleaved, ElementC_, + LayoutC_, arch::OpClassTensorOp, Stages, Operator_, + AccumulatorsInRowMajor, CacheOpA, CacheOpB> { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajorInterleaved; + using ElementB = ElementB_; + using LayoutB = layout::RowMajorInterleaved; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + static int const kInterleavedK = InterleavedK; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped access + static int const kAccessSizeInBits = 128; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = + kAccessSizeInBits / sizeof_bits::value; + + static int const kWarpThreadArrangementContiguous = + kInterleavedK / kElementsPerAccess; + + static int const kWarpThreadArrangementStrided = + kWarpSize / kWarpThreadArrangementContiguous; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, kInterleavedK>; + + // Shared memory layout + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise< + sizeof_bits::value, kInterleavedK>; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMap< + IteratorThreadMapA, + layout::PitchLinearShape>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, + kThreads, layout::PitchLinearShape<32, 1>, kElementsPerAccess>; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMap< + IteratorThreadMapB, + layout::PitchLinearShape>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, ElementA, SmemLayoutA, ElementB, SmemLayoutB, + ElementC, LayoutC, Operator, 
WarpCount::kK, AccumulatorsInRowMajor>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for SIMT GEMMs using multistage pipeline. +/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from 
template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. +/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::ColumnMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int 
const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + IteratorThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape<0, 0>, + MatrixShape<0, 0>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. 
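// Worked example of the lane partitioning used by these SIMT specializations
// (illustrative only; assumes float operands and WarpShape = GemmShape<64, 64, 8>,
// neither of which is fixed by this patch): WarpNumThreadsM x WarpNumThreadsN = 4 x 8
// covers the 32 lanes of a warp, so ThreadTileM = 64 / 4 = 16 and ThreadTileN = 64 / 8 = 8.
// Both thread tiles exceed 4, so LaneLayout = 2. With 32-bit elements,
// numElementsA = numElementsB = 128 / 32 = 4, hence LaneM = LaneN = 4 and
// LaneMmaShape = GemmShape<4, 4, 1>: each lane computes a 4x4x1 FMA tile per step.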
+/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::ColumnMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapB = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + SmemThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int 
ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, Shape::kK / 32>, + WarpCount::kK>; +}; + +/// Partial specialization for SIMT GEMMs using multistage pipeline. +/// +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A operand + typename ElementA_, + /// Data type of B operand + typename ElementB_, + /// Data type of accumulator + typename ElementC_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Operation performed by Simt + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMmaCore { + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = InstructionShape_; + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using ElementB = ElementB_; + using LayoutB = layout::RowMajor; + using ElementC = ElementC_; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Default Operator + using Operator = Operator_; + + // Warp thread arrangement + static int const kElementsPerAccess = 1; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajor; + + // Shared memory layout + 
using SmemLayoutB = layout::RowMajor; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Transpose the ThreadMap of iterator A + using SmemThreadMapA = transform::TransposePitchLinearThreadMapSimt; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 0, + SmemThreadMapA>; + + /// Policy of iterator B + using IteratorThreadMapB = transform::PitchLinearStripminedThreadMap< + layout::PitchLinearShape, + kThreads, + kElementsPerAccess + >; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 1, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level op + static const int WarpNumThreadsM = 4; // TODO need to extract these from template data + static const int WarpNumThreadsN = 8; + static_assert(!(WarpShape::kM % WarpNumThreadsM) && !(WarpShape::kN % WarpNumThreadsN), + "WarpShape must be divisible by ThreadTile shape."); + static const int ThreadTileM = WarpShape::kM / WarpNumThreadsM; + static const int ThreadTileN = WarpShape::kN / WarpNumThreadsN; + static const int LaneLayout = ThreadTileM > 4 && ThreadTileN > 4 ? 2 : 1; + static const int numElementsA = 128 / sizeof_bits::value; + static const int numElementsB = 128 / sizeof_bits::value; + static const int LaneM = cutlass::const_min(numElementsA, ThreadTileM); + static const int LaneN = cutlass::const_min(numElementsB, ThreadTileN); + // these should have max of thread tile also + using LaneMmaShape = cutlass::gemm::GemmShape< + LaneM, + LaneN, + 1>; + using Policy = cutlass::gemm::warp::MmaSimtPolicy< + cutlass::MatrixShape, // WarpShape + cutlass::layout::RowMajorInterleaved, // LaneLayout + LaneMmaShape + >; + + using MmaWarpSimt = cutlass::gemm::warp::MmaSimt< + WarpShape, /// Size of the Gemm problem - concept: gemm::GemmShape<> 128, 128, 8 + ElementA, /// Data type of A elements + SmemLayoutA, /// Layout of A matrix (concept: MatrixLayout) + ElementB, /// Data type of B elements + SmemLayoutB, /// Layout of B matrix (concept: MatrixLayout) + ElementC, /// Element type of C matrix + LayoutC, /// Layout of C matrix (concept: MatrixLayout) + Policy /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + >; /// Used for partial specialization + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy< + MmaWarpSimt, + MatrixShape, + MatrixShape<0, 0>, + WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h index ef51be23..82144943 100644 --- a/include/cutlass/gemm/threadblock/default_mma_core_wmma.h +++ b/include/cutlass/gemm/threadblock/default_mma_core_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h new file mode 100644 index 00000000..2f4a0796 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_mma_planar_complex_multistage.h @@ -0,0 +1,130 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K. 
+*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/gemm/threadblock/default_mma.h" +#include "cutlass/gemm/threadblock/mma_planar_complex_multistage.h" + +#include "cutlass/numeric_types.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Access granularity of A matrix in units of elements + int kAlignmentA, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Access granularity of B matrix in units of elements + int kAlignmentB, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Math operator tag (e.g. arch::OpMultiplyAdd) + typename Operator = arch::OpMultiplyAdd +> +struct DefaultMmaPlanarComplexMultistage { + + // Construct a planar complex variant from the real-valued variant + using RealMmaMultistage = typename DefaultMma< + ElementA_, + LayoutA_, + kAlignmentA, + ElementB_, + LayoutB_, + kAlignmentB, + ElementAccumulator_, + LayoutC_, + OperatorClass_, + ArchTag_, + ThreadblockShape_, + WarpShape_, + InstructionShape_, + Stages, + Operator + >::ThreadblockMma; + + using ThreadblockMma = MmaPlanarComplexMultistage< + ThreadblockShape_, + typename RealMmaMultistage::IteratorA, + typename RealMmaMultistage::SmemIteratorA, + cutlass::arch::CacheOperation::Global, + typename RealMmaMultistage::IteratorB, + typename RealMmaMultistage::SmemIteratorB, + cutlass::arch::CacheOperation::Global, + ElementAccumulator_, + LayoutC_, + typename RealMmaMultistage::Policy, + Stages, + TransformA, + TransformB + >; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h new file mode 100644 index 00000000..7f3d534a --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex.h @@ -0,0 +1,154 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Template for a multistage GEMM kernel. Does not compute batching or support split-K. +*/ + +#pragma once + +#include "cutlass/arch/arch.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/threadblock/default_mma_core_sm80.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/threadblock/predicated_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Element type for A matrix operand + typename ElementA_, + /// Layout type for A matrix operand + typename LayoutA_, + /// Element type for B matrix operand + typename ElementB_, + /// Layout type for B matrix operand + typename LayoutB_, + /// Element type for internal accumulation + typename ElementAccumulator_, + /// Layout type for C and D matrix operands + typename LayoutC_, + /// Operator class tag + typename OperatorClass_, + /// Tag indicating architecture to tune for + typename ArchTag_, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape_, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape_, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape_, + /// Number of stages used in the pipelined mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transformation on operand B + ComplexTransform TransformB = ComplexTransform::kNone, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator = arch::OpMultiplyAddComplex, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor = false> +struct DefaultMultistageMmaComplex; + +//////////////////////////////////////////////////////////////////////////////// + +/// Specialization for row-major output +template < + /// Element type for A matrix operand + typename ElementA, + /// Layout type for A matrix operand + typename LayoutA, + /// Element type for B matrix operand + typename ElementB, + /// Layout type for B matrix operand + typename LayoutB, + /// Element type for internal accumulation + typename ElementAccumulator, + /// Tag indicating architecture to tune for + typename OperatorClass, + /// Tag indicating architecture to tune for + typename ArchTag, + /// Threadblock-level tile size (concept: GemmShape) + typename ThreadblockShape, + /// Warp-level tile size (concept: GemmShape) + typename WarpShape, + /// Instruction-level tile size (concept: GemmShape) + typename InstructionShape, + /// Number of stages used in the multistage mainloop + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator> +struct DefaultMultistageMmaComplex { + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore< + ThreadblockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementAccumulator, layout::RowMajor, OperatorClass, + Stages, TransformA, TransformB, Operator>; + + // Define iterators over tiles from the A operand + using ThreadMapA = typename MmaCore::IteratorThreadMapA; + using AccessTypeA = cutlass::Array; + using IteratorA = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementA, LayoutA, 1, ThreadMapA, AccessTypeA>; + + // Define iterators over tiles from the B operand + using ThreadMapB = typename MmaCore::IteratorThreadMapB; + using AccessTypeB = cutlass::Array; + using IteratorB = + cutlass::transform::threadblock::PredicatedTileAccessIterator< + cutlass::MatrixShape, + ElementB, LayoutB, 0, ThreadMapB, AccessTypeB>; + + // Define the threadblock-scoped multistage matrix multiply + using ThreadblockMma = cutlass::gemm::threadblock::MmaMultistage< + typename MmaCore::Shape, IteratorA, typename MmaCore::SmemIteratorA, + MmaCore::kCacheOpA, IteratorB, typename MmaCore::SmemIteratorB, + MmaCore::kCacheOpB, ElementAccumulator, layout::RowMajor, + typename MmaCore::MmaPolicy, Stages>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h new file mode 100644 index 00000000..613c88e3 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core.h @@ -0,0 +1,113 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/array.h" +#include "cutlass/complex.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_mma_core.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" + +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Template defininng default matrix multiply operators inferred from +/// threadblock tile size, global memory data layout, and target math +/// instruction. 
+template < + /// Shape of threadblock-scoped matrix multiply operator + typename Shape, + /// Shape of warp-level matrix multiply operator + typename WarpShape, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape, + /// Element data type of A operand + typename ElementA, + /// Layout of operand A + typename LayoutA, + /// Element data type of B operand + typename ElementB, + /// Layout of operand B + typename LayoutB, + /// Data type of accumulator + typename ElementC, + /// Layout of accumulator + typename LayoutC, + /// Indicates type of math operator (arch::OpClassSimt or arch::OpClassTensorOp) + typename OperatorClass, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator = arch::OpMultiplyAddComplex, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA = + cutlass::arch::CacheOperation::Global, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB = + cutlass::arch::CacheOperation::Global> +struct DefaultMultistageMmaComplexCore; + + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h new file mode 100644 index 00000000..230e8d76 --- /dev/null +++ b/include/cutlass/gemm/threadblock/default_multistage_mma_complex_core_sm80.h @@ -0,0 +1,1113 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Defines basic properties needed by CTA-level GEMMs assuming + expectations about data layout of the global memory fragments, data types, + and internal tile sizes. + + Partial specializations for threadblock::Mma operations targeting TensorOp + instructions. +*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" + +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/gemm/warp/mma_simt_policy.h" +#include "cutlass/gemm/warp/mma_simt.h" +#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/gemm/threadblock/default_multistage_mma_complex_core.h" + +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" +#include "cutlass/transform/pitch_linear_thread_map.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h" +#include "cutlass/gemm/threadblock/mma_multistage.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility 
requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex double-precision +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + 
static int const kStages = Stages; + using Operator = Operator_; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex double-precision +/// +/// A: row-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct 
DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex double-precision +/// +/// A: row-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex or arch::OpMultiplyGaussianComplex +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// 
Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<8, 8, 4>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<8, 8, 4>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped 128 + static int const kAccessSizeInBits = 128; + + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous128b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<8, 4>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + 
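As a point of reference, here is a minimal sketch of how one of the DefaultMultistageMmaComplexCore specializations above might be instantiated for a complex<double> GEMM. The tile sizes (64x64x16 threadblock, 32x32x16 warp) and the three-stage mainloop are assumed, illustrative choices, not values taken from this patch; the parameter order follows the primary template declared earlier in this file.

// Illustrative instantiation: selects the complex<double>, A column-major /
// B row-major specialization with the 8x8x4 complex tensor-op instruction.
using Core = cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
    cutlass::gemm::GemmShape<64, 64, 16>,     // threadblock-scoped tile (assumed)
    cutlass::gemm::GemmShape<32, 32, 16>,     // warp-scoped tile (assumed)
    cutlass::gemm::GemmShape<8, 8, 4>,        // instruction shape
    cutlass::complex<double>, cutlass::layout::ColumnMajor,   // A
    cutlass::complex<double>, cutlass::layout::RowMajor,      // B
    cutlass::complex<double>, cutlass::layout::RowMajor,      // C / accumulator
    cutlass::arch::OpClassTensorOp,
    3,                                         // stages (assumed)
    cutlass::ComplexTransform::kNone,          // transform on A
    cutlass::ComplexTransform::kNone,          // transform on B
    cutlass::arch::OpMultiplyAddComplex>;

// With these shapes, WarpCount is 2x2x1 (four warps), so the divisibility and
// "at least two warps" static_asserts above are satisfied; Core::MmaPolicy and
// the Core::SmemIterator types then feed the threadblock-scoped multistage mma
// as composed by DefaultMultistageMmaComplex.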
+///////////////////////////////////////////////////////////////////////////////////////////////// + + +/// Partial specialization for complex floating-point +/// +/// A: column-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::ColumnMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = 
transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + + +/// Partial specialization for complex floating-point +/// +/// A: column-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::ColumnMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::ColumnMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand 
+ using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex floating-point +/// +/// A: row-major +/// B: column-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::RowMajor, + complex, layout::ColumnMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::ColumnMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 
64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex floating-point +/// +/// A: row-major +/// B: row-major +/// Operator: arch::OpMultiplyAddComplex +/// Math Instruction: MMA.1688.F32.TF32 +/// +/// This uses the default warp-level operator given tile sizes +template < + /// Shape of threadblock-scoped matrix multiply operator (concept: + /// GemmShape) + typename Shape_, + /// Shape of warp-level matrix multiply operator (concept: GemmShape) + typename WarpShape_, + /// Layout of accumulator + typename LayoutC_, + /// Number of stages + int Stages, + /// Complex transformation on operand A + ComplexTransform TransformA, + /// Complex transformation on operand B + ComplexTransform TransformB, + /// Multiply-add operator (arch::OpMultiplyAddComplex) + typename Operator_, + /// Cache operation of operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Cache operation of operand B + cutlass::arch::CacheOperation::Kind CacheOpB> +struct DefaultMultistageMmaComplexCore< + Shape_, WarpShape_, GemmShape<16, 8, 8>, + complex, layout::RowMajor, + complex, layout::RowMajor, + complex, LayoutC_, + arch::OpClassTensorOp, + Stages, + TransformA, TransformB, + Operator_, + CacheOpA, CacheOpB> { + + using Shape = Shape_; + using WarpShape = WarpShape_; + using InstructionShape = GemmShape<16, 8, 8>; + using ElementA = complex; + using LayoutA = layout::RowMajor; + using ElementB = complex; + using LayoutB = layout::RowMajor; + using ElementC = complex; + using LayoutC = LayoutC_; + static int const kStages = Stages; + static ComplexTransform const kTransformA = TransformA; + static ComplexTransform const kTransformB = TransformB; + using Operator = Operator_; + static cutlass::arch::CacheOperation::Kind const kCacheOpA = cutlass::arch::CacheOperation::Always; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = cutlass::arch::CacheOperation::Always; + + /// Number of warps present + using WarpCount = GemmShape; + + // Divisility requirements + 
static_assert( + !(Shape::kM % WarpShape::kM) && !(Shape::kN % WarpShape::kN), + "Threadblock-scoped GEMM should be divisible by warp-scoped GEMM size."); + + static_assert(WarpCount::kCount > 1, + "This specialization requires at least two warps."); + + /// Number of threads per warp + static int const kWarpSize = warp::WarpSize::value; + + /// Number of threads total + static int const kThreads = WarpCount::kCount * kWarpSize; + + /// Size of a threadblock-scoped + static int const kAccessSizeInBits = 64; + + // + // Shared memory layouts + // + + using SmemLayoutA = layout::RowMajorTensorOpMultiplicand64bCrosswise; + + using SmemLayoutB = layout::RowMajorTensorOpMultiplicandCongruous64b; + + // + // Iterators to write to shared memory + // + + /// ThreadMap of iterator A + using IteratorThreadMapA = transform::PitchLinearWarpRakedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to A operand + using SmemIteratorA = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementA, SmemLayoutA, 1, + IteratorThreadMapA>; + + /// ThreadMap of iterator B + using IteratorThreadMapB = transform::PitchLinearWarpStripedThreadMap< + layout::PitchLinearShape, kThreads, + layout::PitchLinearShape<16, 2>, + kAccessSizeInBits / sizeof_bits::value>; + + /// Shared memory iterator to B operand + using SmemIteratorB = transform::threadblock::RegularTileAccessIterator< + MatrixShape, ElementB, SmemLayoutB, 0, + IteratorThreadMapB>; + + // + // Warp-level matrix multiply operator + // + + // Define the warp-level tensor op + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + WarpShape, InstructionShape, + ElementA, SmemLayoutA, + ElementB, SmemLayoutB, + ElementC, LayoutC, + kTransformA, kTransformB, + Operator>::Type; + + /// Policy used to define MmaPipelined + using MmaPolicy = MmaPolicy, + MatrixShape<0, 0>, WarpCount::kK>; +}; + +//////////////////////////////////////////////////////////////////////////////// + + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/gemv.h b/include/cutlass/gemm/threadblock/gemv.h deleted file mode 100755 index 54da93a9..00000000 --- a/include/cutlass/gemm/threadblock/gemv.h +++ /dev/null @@ -1,140 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ -/*! \file - \brief Template for a threadblock-scoped GEMV kernel. -*/ - -#pragma once - -#include "cutlass/cutlass.h" -#include "cutlass/array.h" -#include "cutlass/numeric_types.h" -#include "cutlass/matrix_shape.h" - -#include "cutlass/gemm/gemm.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass { -namespace gemm { -namespace threadblock { - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Structure to compute the matrix-vector product using SIMT math instructions. -template < - class Core_ //< GemvCore -> -class Gemv { -public: - using Shape = typename Core_::Shape; - - /// The MMA operator that computes GEMV - using Operator = typename Core_::Operator; - - /// Iterates over A in global memory - using IteratorA = typename Core_::IteratorA; - - /// Iterates over B in global memory - using IteratorB = typename Core_::IteratorB; - - /// Fragment of operand C loaded from global memory - using IteratorC = typename Core_::IteratorC; - - /// Fragment of operand A loaded from global memory - using FragmentA = typename IteratorA::Fragment; - - /// Fragment of operand B loaded from global memory - using FragmentB = typename IteratorB::Fragment; - - /// Fragment of operand accumulator loaded/stored to global memory - using FragmentC = typename Operator::FragmentC; - - /// Shape of the per-thread GEMV operation - using ThreadShape = typename Core_::ThreadShape; - -public: - CUTLASS_DEVICE - Gemv() { } - - CUTLASS_DEVICE - void operator()( - GemmCoord const &problem_size, ///< problem size of batched GEMV - FragmentC &accum, ///< destination accumulator tile - IteratorA iterator_A, ///< iterator over A operand in global memory - IteratorB iterator_B, ///< iterator over B operand in global memory - FragmentC const &src_accum) { ///< source accumualtor tile - - // - // Prologue - // - - FragmentA frag_A; - FragmentB frag_B; - frag_A.clear(); - frag_B.clear(); - - iterator_A.load(frag_A); - iterator_B.load(frag_B); - ++iterator_A; - ++iterator_B; - - // - // Mainloop - // - Operator thread_mma; - int gemm_k = problem_size.k(); - - if (gemm_k < Shape::kK) - { - iterator_A.clear_mask(); - iterator_B.clear_mask(); - } - - // iterate over K to accumulate result - CUTLASS_GEMM_LOOP - for (; gemm_k > 0; gemm_k -= Shape::kK) { - thread_mma(accum, frag_A, frag_B, accum); - - iterator_A.load(frag_A); - iterator_B.load(frag_B); - ++iterator_A; - ++iterator_B; - - if (gemm_k < Shape::kK) - { - iterator_A.clear_mask(); - iterator_B.clear_mask(); - } - } - - } -}; - 
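Referring back to the DefaultMultistageMmaComplexCore specializations added earlier in this change: each one only pins down the shared-memory layouts, thread maps, and warp-level operator for a particular pair of global-memory layouts. A minimal instantiation sketch, assuming complex<float> operands (consistent with the MMA.1688.F32.TF32 instruction these specializations target); the tile shapes and stage count below are illustrative placeholders chosen to satisfy the divisibility and warp-count static_asserts, not values taken from this patch:

using Core = cutlass::gemm::threadblock::DefaultMultistageMmaComplexCore<
    cutlass::gemm::GemmShape<64, 64, 16>,   // threadblock tile (illustrative)
    cutlass::gemm::GemmShape<32, 32, 16>,   // warp tile: 2x2 warps per threadblock
    cutlass::gemm::GemmShape<16, 8, 8>,     // instruction shape for MMA.1688.F32.TF32
    cutlass::complex<float>, cutlass::layout::ColumnMajor,   // A
    cutlass::complex<float>, cutlass::layout::RowMajor,      // B
    cutlass::complex<float>, cutlass::layout::RowMajor,      // C / accumulator
    cutlass::arch::OpClassTensorOp,
    3,                                      // pipeline stages (illustrative)
    cutlass::ComplexTransform::kNone,       // transform on A
    cutlass::ComplexTransform::kNone,       // transform on B
    cutlass::arch::OpMultiplyAddComplex,
    cutlass::arch::CacheOperation::Always,
    cutlass::arch::CacheOperation::Always>;

// The threadblock-scoped mainloop is assembled from the nested types, e.g.:
using SmemIteratorA = typename Core::SmemIteratorA;
using MmaPolicy     = typename Core::MmaPolicy;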
-///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace threadblock -} // namespace gemm -} // namespace cutlass diff --git a/include/cutlass/gemm/threadblock/mma_base.h b/include/cutlass/gemm/threadblock/mma_base.h index 7e6d4fe6..dbf3d31f 100644 --- a/include/cutlass/gemm/threadblock/mma_base.h +++ b/include/cutlass/gemm/threadblock/mma_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_multistage.h b/include/cutlass/gemm/threadblock/mma_multistage.h new file mode 100644 index 00000000..0431c306 --- /dev/null +++ b/include/cutlass/gemm/threadblock/mma_multistage.h @@ -0,0 +1,526 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. 
+*/ + +#pragma once + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/gemm.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/threadblock/mma_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Used for partial specialization + typename Enable = bool> +class MmaMultistage : + public MmaBase { +public: + ///< Base class + using Base = MmaBase; + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + ///< Data type of accumulator matrix + using ElementC = ElementC_; + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + ///< Policy describing tuning details + using Policy = Policy_; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = typename Policy::Operator::FragmentC; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Minimum architecture is Sm80 to support cp.async + using ArchTag = arch::Sm80; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = Operator::kTransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = Operator::kTransformB; + + /// Internal structure exposed for introspection. 
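+  /// Detail splits the cp.async traffic of one pipeline stage into
+  /// Base::kWarpGemmIterations groups (a ceiling division), so that
+  /// copy_tiles_and_advance() can issue one group of copies between
+  /// consecutive warp-level MMA iterations of the mainloop.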
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of cp.async instructions to load one stage of operand A + static int const AsyncCopyIterationsPerStageA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of cp.async instructions to load one stage of operand B + static int const AsyncCopyIterationsPerStageB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of cp.async instructions to load on group of operand A + static int const kAccessesPerGroupA = + (AsyncCopyIterationsPerStageA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of cp.async instructions to load on group of operand B + static int const kAccessesPerGroupB = + (AsyncCopyIterationsPerStageB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpLoadedFragmentA = typename Operator::FragmentA; + using WarpLoadedFragmentB = typename Operator::FragmentB; + using WarpTransformedFragmentA = typename Operator::TransformedFragmentA; + using WarpTransformedFragmentB = typename Operator::TransformedFragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + MmaMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset( + {warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset( + {Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + + CUTLASS_DEVICE + void copy_tiles_and_advance(IteratorA &iterator_A, IteratorB &iterator_B, + int group_start_A = 0, int group_start_B = 0) { + iterator_A.set_iteration_index(group_start_A * + IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + if (group_start_A + j < Detail::AsyncCopyIterationsPerStageA) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + int const kSrcBytes = sizeof_bits::value * + 
IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_A.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + } + + iterator_B.set_iteration_index(group_start_B * + IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + if (group_start_B + j < Detail::AsyncCopyIterationsPerStageB) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + int const kSrcBytes = sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr = iterator_B.get(); + + cutlass::arch::cp_async( + dst_ptr + v, gmem_ptr, iterator_B.valid()); + + ++iterator_B; + } + ++this->smem_iterator_B_; + } + } + } + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A, + ///< iterator over B operand in global memory + IteratorB iterator_B, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + iterator_A.set_iteration_index(0); + this->smem_iterator_A_.set_iteration_index(0); + + // Async Copy for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageA; ++j) { + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / + IteratorA::kAccessesPerVector / 8; + + int src_bytes = (iterator_A.valid() ? kSrcBytes : 0); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_A.get(), iterator_A.valid()); + + ++iterator_A; + } + + ++this->smem_iterator_A_; + } + + iterator_B.set_iteration_index(0); + this->smem_iterator_B_.set_iteration_index(0); + + // Async Copy for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::AsyncCopyIterationsPerStageB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast( + this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / + IteratorB::kAccessesPerVector / 8; + + cutlass::arch::cp_async_zfill( + dst_ptr + v, iterator_B.get(), iterator_B.valid()); + + ++iterator_B; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Defines the boundary of a stage of cp.async. 
+ cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Waits until kStages-2 stages have committed. + cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + WarpLoadedFragmentA warp_loaded_frag_A[2]; + WarpLoadedFragmentB warp_loaded_frag_B[2]; + WarpTransformedFragmentA warp_transformed_frag_A[2]; + WarpTransformedFragmentB warp_transformed_frag_B[2]; + + Operator warp_mma; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[0]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[0]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + warp_mma.transform(warp_transformed_frag_A[0], warp_transformed_frag_B[0], + warp_loaded_frag_A[0], warp_loaded_frag_B[0]); + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_loaded_frag_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load(warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (warp_mma_k > 0) + warp_mma.transform(warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + warp_loaded_frag_A[warp_mma_k % 2], + warp_loaded_frag_B[warp_mma_k % 2]); + + warp_mma( + accum, + warp_transformed_frag_A[warp_mma_k % 2], + warp_transformed_frag_B[warp_mma_k % 2], + accum + ); + + // Issue global->shared copies for the this stage + if (warp_mma_k < Base::kWarpGemmIterations - 1) { + int group_start_iteration_A, group_start_iteration_B; + + group_start_iteration_A = warp_mma_k * Detail::kAccessesPerGroupA; + group_start_iteration_B = warp_mma_k * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + } + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + int group_start_iteration_A, group_start_iteration_B; + group_start_iteration_A = + (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = + (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + + copy_tiles_and_advance(iterator_A, iterator_B, group_start_iteration_A, + group_start_iteration_B); + + // Inserts a memory fence between stages of cp.async instructions. + cutlass::arch::cp_async_fence(); + + // Waits until kStages-2 stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A.add_tile_offset({0, 1}); + iterator_B.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + if (gemm_k_iterations == 0) { + iterator_A.clear_mask(); + iterator_B.clear_mask(); + } + } + + // Do any conversions feeding the first stage at the end of the loop so + // we can start right away on mma instructions + if (warp_mma_k + 1 == Base::kWarpGemmIterations) + warp_mma.transform(warp_transformed_frag_A[(warp_mma_k + 1) % 2], + warp_transformed_frag_B[(warp_mma_k + 1) % 2], + warp_loaded_frag_A[(warp_mma_k + 1) % 2], + warp_loaded_frag_B[(warp_mma_k + 1) % 2]); + } + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/mma_pipelined.h b/include/cutlass/gemm/threadblock/mma_pipelined.h index 735950cf..80954f6c 100644 --- a/include/cutlass/gemm/threadblock/mma_pipelined.h +++ b/include/cutlass/gemm/threadblock/mma_pipelined.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -75,7 +75,7 @@ template < typename IteratorA_::Element, IteratorA_::Fragment::kElements>, /// - /// Transformation applied to A operand + /// Transformation applied to B operand typename TransformB_ = NumericArrayConverter< typename SmemIteratorB_::Element, typename IteratorB_::Element, diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h index 9491f56f..b37b4184 100644 --- a/include/cutlass/gemm/threadblock/mma_planar_complex_base.h +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_base.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
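In MmaMultistage above, copy_tiles_and_advance() spreads the cp.async traffic of one shared-memory stage across the warp-level MMA iterations of the mainloop, using the ceiling-division group size computed in Detail and skipping indices past the end of the stage. A standalone host-side illustration of that schedule; the two counts are hypothetical stand-ins for Detail::AsyncCopyIterationsPerStageA and Base::kWarpGemmIterations:

#include <cstdio>

int main() {
  const int copies_per_stage     = 10;  // hypothetical Detail::AsyncCopyIterationsPerStageA
  const int warp_gemm_iterations = 4;   // hypothetical Base::kWarpGemmIterations

  // Same ceiling division as Detail::kAccessesPerGroupA
  const int accesses_per_group =
      (copies_per_stage + warp_gemm_iterations - 1) / warp_gemm_iterations;

  for (int group = 0; group < warp_gemm_iterations; ++group) {
    printf("warp MMA iteration %d issues cp.async copies:", group);
    for (int j = 0; j < accesses_per_group; ++j) {
      int idx = group * accesses_per_group + j;
      if (idx < copies_per_stage) {   // same bound check as copy_tiles_and_advance()
        printf(" %d", idx);
      }
    }
    printf("\n");
  }
  return 0;
}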
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h new file mode 100644 index 00000000..18e63b58 --- /dev/null +++ b/include/cutlass/gemm/threadblock/mma_planar_complex_multistage.h @@ -0,0 +1,642 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Template for a double-buffered threadblock-scoped GEMM kernel. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/arch/memory.h" +#include "cutlass/array.h" +#include "cutlass/array_planar_complex.h" +#include "cutlass/functional.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/mma_planar_complex_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace threadblock { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Structure to compute the matrix product targeting CUDA cores and SIMT math +/// instructions. 
+template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Iterates over tiles of A operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorA_, + /// Iterates over tiles of A operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorA_, + /// Cache operation for operand A + cutlass::arch::CacheOperation::Kind CacheOpA, + /// Iterates over tiles of B operand in global memory + // (concept: ReadableTileIterator | ForwardTileIterator | + // MaskedTileIterator) + typename IteratorB_, + /// Iterates over tiles of B operand in shared memory + /// (concept: WriteableTileIterator | RandomAccessTileIterator) + typename SmemIteratorB_, + /// Cache operation for operand B + cutlass::arch::CacheOperation::Kind CacheOpB, + /// Data type of accumulator matrix + typename ElementC_, + /// Data type of accumulator matrix + typename LayoutC_, + /// Policy describing tuning details (concept: MmaPolicy) + typename Policy_, + /// Number of stages, + int Stages, + /// Transformation applied to A + ComplexTransform TransformA = ComplexTransform::kNone, + /// Transformation applied to B + ComplexTransform TransformB = ComplexTransform::kNone +> +class MmaPlanarComplexMultistage : + public MmaPlanarComplexBase { +public: + ///< Base class + using Base = MmaPlanarComplexBase; + + ///< Size of the Gemm problem - concept: gemm::GemmShape<> + using Shape = Shape_; + + ///< Iterates over tiles of A operand in global memory + using IteratorA = IteratorA_; + + ///< Iterates over tiles of B operand in global memory + using IteratorB = IteratorB_; + + ///< Data type of accumulator matrix + using ElementC = ElementC_; + + ///< Layout of accumulator matrix + using LayoutC = LayoutC_; + + ///< Policy describing tuning details + using Policy = Policy_; + + ///< Archtecture tag + using ArchTag = arch::Sm80; + + using SmemIteratorA = SmemIteratorA_; + using SmemIteratorB = SmemIteratorB_; + + static cutlass::arch::CacheOperation::Kind const kCacheOpA = CacheOpA; + static cutlass::arch::CacheOperation::Kind const kCacheOpB = CacheOpB; + + /// Transformation applied to A + static ComplexTransform const kTransformA = TransformA; + + /// Transformation applied to B + static ComplexTransform const kTransformB = TransformB; + + // + // Dependent types + // + + /// Fragment of accumulator tile + using FragmentC = ArrayPlanarComplex< + typename Policy::Operator::FragmentC::Element, + Policy::Operator::FragmentC::kElements + >; + + /// Warp-level Mma + using Operator = typename Policy::Operator; + + /// Internal structure exposed for introspection. 
+ struct Detail { + + static_assert(Base::kWarpGemmIterations > 1, + "The pipelined structure requires at least two warp-level " + "GEMM operations."); + + /// Number of LDGSTS instructions to load one stage of operand A + static int const TBLDGSTSIterationsA = + IteratorA::ThreadMap::Iterations::kCount; + + /// Number of LDGSTS instructions to load one stage of operand B + static int const TBLDGSTSIterationsB = + IteratorB::ThreadMap::Iterations::kCount; + + /// Number of stages + static int const kStages = Stages; + + /// Number of LDGSTS instructions to load on group of operand A + static int const kAccessesPerGroupA = + (TBLDGSTSIterationsA + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + + /// Number of LDGSTS instructions to load on group of operand B + static int const kAccessesPerGroupB = + (TBLDGSTSIterationsB + Base::kWarpGemmIterations - 1) / Base::kWarpGemmIterations; + }; + + private: + + using WarpFragmentA = typename Operator::FragmentA; + using WarpFragmentB = typename Operator::FragmentB; + + private: + + // + // Data members + // + + /// Iterator to write threadblock-scoped tile of A operand to shared memory + SmemIteratorA smem_iterator_A_; + + /// Iterator to write threadblock-scoped tile of B operand to shared memory + SmemIteratorB smem_iterator_B_; + +public: + + /// Construct from tensor references + CUTLASS_DEVICE + MmaPlanarComplexMultistage( + ///< Shared storage needed for internal use by threadblock-scoped GEMM + typename Base::SharedStorage &shared_storage, + ///< ID within the threadblock + int thread_idx, + ///< ID of warp + int warp_idx, + ///< ID of each thread within a warp + int lane_idx + ): + Base(shared_storage, thread_idx, warp_idx, lane_idx), + smem_iterator_A_(shared_storage.operand_A_ref(), thread_idx), + smem_iterator_B_(shared_storage.operand_B_ref(), thread_idx) + { + // Compute warp location within threadblock tile by mapping the warp_id to + // three coordinates: + // _m: the warp's position within the threadblock along the M dimension + // _n: the warp's position within the threadblock along the N dimension + // _k: the warp's position within the threadblock along the K dimension + + int warp_idx_mn = warp_idx % (Base::WarpCount::kM * Base::WarpCount::kN); + int warp_idx_k = warp_idx / (Base::WarpCount::kM * Base::WarpCount::kN); + + int warp_idx_m = warp_idx_mn % Base::WarpCount::kM; + int warp_idx_n = warp_idx_mn / Base::WarpCount::kM; + + // Add per-warp offsets in units of warp-level tiles + this->warp_tile_iterator_A_.add_tile_offset({warp_idx_m, Base::kWarpGemmIterations * warp_idx_k}); + this->warp_tile_iterator_B_.add_tile_offset({Base::kWarpGemmIterations * warp_idx_k, warp_idx_n}); + } + +private: + + CUTLASS_DEVICE + void copy_tiles_and_advance( + IteratorA &iterator_A_real, + IteratorA &iterator_A_imag, + + IteratorB &iterator_B_real, + IteratorB &iterator_B_imag, + + int group_start_A = 0, + int group_start_B = 0) { + + iterator_A_real.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + iterator_A_imag.set_iteration_index(group_start_A * IteratorA::kAccessesPerVector); + this->smem_iterator_A_.set_iteration_index(group_start_A); + + // LDGSTS for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 
0; v < IteratorA::kAccessesPerVector; ++v) { + + auto gmem_ptr_real = iterator_A_real.get(); + auto gmem_ptr_imag = iterator_A_imag.get(); + + bool pred_guard = iterator_A_real.valid(); + cutlass::arch::cp_async( + dst_ptr + v, + gmem_ptr_real, + pred_guard); + cutlass::arch::cp_async( + dst_ptr + v + (Base::SharedStorage::kImaginaryStrideA / IteratorA::ThreadMap::kElementsPerAccess), + reinterpret_cast(gmem_ptr_imag), + pred_guard); + + ++iterator_A_real; + ++iterator_A_imag; + } + + ++this->smem_iterator_A_; + } + + iterator_B_real.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + iterator_B_imag.set_iteration_index(group_start_B * IteratorB::kAccessesPerVector); + this->smem_iterator_B_.set_iteration_index(group_start_B); + + // LDGSTS for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::kAccessesPerGroupB; ++j) { + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + auto gmem_ptr_real = iterator_B_real.get(); + auto gmem_ptr_imag = iterator_B_imag.get(); + + bool pred_guard = iterator_B_real.valid(); + cutlass::arch::cp_async( + dst_ptr + v, + gmem_ptr_real, + pred_guard); + cutlass::arch::cp_async( + dst_ptr + v + (Base::SharedStorage::kImaginaryStrideB / IteratorB::ThreadMap::kElementsPerAccess), + reinterpret_cast(gmem_ptr_imag), + pred_guard); + + ++iterator_B_real; + ++iterator_B_imag; + } + ++this->smem_iterator_B_; + } + } + + CUTLASS_DEVICE + void warp_mma_planar_complex( + Operator & warp_mma, + FragmentC &accum, + WarpFragmentA const & real_A, + WarpFragmentA const & imag_A, + WarpFragmentB const & real_B, + WarpFragmentB const & imag_B) { + + cutlass::negate> neg_op_B; + + WarpFragmentB neg_real_B = neg_op_B(real_B); + WarpFragmentB neg_imag_B = neg_op_B(imag_B); + + warp_mma(accum.real, real_A, real_B, accum.real); + + if (kTransformB == ComplexTransform::kNone) { + warp_mma(accum.imag, real_A, imag_B, accum.imag); + } + else { + warp_mma(accum.imag, real_A, neg_imag_B, accum.imag); + } + + if (kTransformA == ComplexTransform::kNone) { + warp_mma(accum.imag, imag_A, real_B, accum.imag); + } + else { + warp_mma(accum.imag, imag_A, neg_real_B, accum.imag); + } + + if (kTransformA == ComplexTransform::kNone ^ kTransformB == ComplexTransform::kNone) { + warp_mma(accum.real, imag_A, imag_B, accum.real); + } + else { + warp_mma(accum.real, imag_A, neg_imag_B, accum.real); + } + } + +public: + + /// Perform a threadblock-scoped matrix multiply-accumulate + CUTLASS_DEVICE + void operator()( + ///< problem size of GEMM + int gemm_k_iterations, + ///< destination accumulator tile + FragmentC &accum, + ///< iterator over A operand in global memory + IteratorA iterator_A_real, + ///< iterator over A operand in global memory + IteratorA iterator_A_imag, + ///< iterator over B operand in global memory + IteratorB iterator_B_real, + ///< iterator over B operand in global memory + IteratorB iterator_B_imag, + ///< initial value of accumulator + FragmentC const &src_accum) { + + // + // Prologue + // + + // Issue several complete stages + CUTLASS_PRAGMA_UNROLL + for (int stage = 0; stage < Base::kStages - 1; + ++stage, --gemm_k_iterations) { + + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } 
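+
+      // Each prologue stage issues two cp.async copies per vector access: one
+      // for the real plane and one for the imaginary plane, the latter written
+      // kImaginaryStrideA / kImaginaryStrideB elements further into shared
+      // memory so both planes live in the same circular buffer stage.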
+ + iterator_A_real.set_iteration_index(0); + iterator_A_imag.set_iteration_index(0); + + this->smem_iterator_A_.set_iteration_index(0); + + // LDGSTS for operand A + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLDGSTSIterationsA; ++j) { + + typename IteratorA::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_A_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorA::kAccessesPerVector; ++v) { + + int const kSrcBytes = + sizeof_bits::value * + IteratorA::ThreadMap::kElementsPerAccess / IteratorA::kAccessesPerVector / 8; + + bool pred_guard = iterator_A_real.valid(); + + auto src_ptr_real = iterator_A_real.get(); + auto src_ptr_imag = iterator_A_imag.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, src_ptr_real, pred_guard); + + cutlass::arch::cp_async_zfill( + dst_ptr + v + + Base::SharedStorage::kImaginaryStrideA / + IteratorA::ThreadMap::kElementsPerAccess, + reinterpret_cast(src_ptr_imag), + pred_guard); + + ++iterator_A_real; + ++iterator_A_imag; + } + + ++this->smem_iterator_A_; + } + + iterator_B_real.set_iteration_index(0); + iterator_B_imag.set_iteration_index(0); + + this->smem_iterator_B_.set_iteration_index(0); + + // LDGSTS for operand B + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < Detail::TBLDGSTSIterationsB; ++j) { + + typename IteratorB::AccessType *dst_ptr = + reinterpret_cast(this->smem_iterator_B_.get()); + + CUTLASS_PRAGMA_UNROLL + for (int v = 0; v < IteratorB::kAccessesPerVector; ++v) { + + int const kSrcBytes = + sizeof_bits::value * + IteratorB::ThreadMap::kElementsPerAccess / IteratorB::kAccessesPerVector / 8; + + bool pred_guard = iterator_B_real.valid(); + + auto src_ptr_real = iterator_B_real.get(); + auto src_ptr_imag = iterator_B_imag.get(); + + cutlass::arch::cp_async_zfill( + dst_ptr + v, src_ptr_real, pred_guard); + + cutlass::arch::cp_async_zfill( + dst_ptr + v + + Base::SharedStorage::kImaginaryStrideB / + IteratorB::ThreadMap::kElementsPerAccess, + reinterpret_cast(src_ptr_imag), + pred_guard); + + ++iterator_B_real; + ++iterator_B_imag; + } + + ++this->smem_iterator_B_; + } + + // Move to the next stage + iterator_A_real.add_tile_offset({0, 1}); + iterator_A_imag.add_tile_offset({0, 1}); + + iterator_B_real.add_tile_offset({1, 0}); + iterator_B_imag.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Inserts a memory fence between stages of cp.async instructions + cutlass::arch::cp_async_fence(); + } + + // Perform accumulation in the 'd' output operand + accum = src_accum; + + // Blocks until all but kStages-2 cp.async stages have committed. 
+ cutlass::arch::cp_async_wait(); + __syncthreads(); + + // Pair of fragments used to overlap shared memory loads and math + // instructions + + WarpFragmentA warp_frag_real_A[2]; + WarpFragmentA warp_frag_imag_A[2]; + + WarpFragmentB warp_frag_real_B[2]; + WarpFragmentB warp_frag_imag_B[2]; + + this->warp_tile_iterator_A_.set_kgroup_index(0); + this->warp_tile_iterator_B_.set_kgroup_index(0); + + this->warp_tile_iterator_A_.load(warp_frag_real_A[0]); + this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[0], Base::SharedStorage::kImaginaryStrideA); + + this->warp_tile_iterator_B_.load(warp_frag_real_B[0]); + this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[0], Base::SharedStorage::kImaginaryStrideB); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } + + // Start issuing the first group of the next stage outside of the mainloop + copy_tiles_and_advance(iterator_A_real, iterator_A_imag, iterator_B_real, iterator_B_imag); + + Operator warp_mma; + + int smem_write_stage_idx = Base::kStages - 1; + int smem_read_stage_idx = 0; + + // + // Mainloop + // + + CUTLASS_GEMM_LOOP + for (; gemm_k_iterations > (-Base::kStages + 1);) { + // + // Loop over GEMM K dimension + // + + // Computes a warp-level GEMM on data held in shared memory + // Each "warp_mma_k" refers to a warp-level matrix multiply-accumulate + CUTLASS_PRAGMA_UNROLL + for (int warp_mma_k = 0; warp_mma_k < Base::kWarpGemmIterations; + ++warp_mma_k) { + + // Load warp-level tiles from shared memory, wrapping to k offset if + // this is the last group as the case may be. + + this->warp_tile_iterator_A_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + this->warp_tile_iterator_B_.set_kgroup_index((warp_mma_k + 1) % Base::kWarpGemmIterations); + + this->warp_tile_iterator_A_.load(warp_frag_real_A[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_A_.load_with_pointer_offset(warp_frag_imag_A[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideA); + + this->warp_tile_iterator_B_.load(warp_frag_real_B[(warp_mma_k + 1) % 2]); + this->warp_tile_iterator_B_.load_with_pointer_offset(warp_frag_imag_B[(warp_mma_k + 1) % 2], Base::SharedStorage::kImaginaryStrideB); + + ++this->warp_tile_iterator_A_; + ++this->warp_tile_iterator_B_; + + // Issue global->shared copies for the next stage + int group_start_iteration_A, group_start_iteration_B; + + if (warp_mma_k + 1 == Base::kWarpGemmIterations) { + group_start_iteration_A = 0; + group_start_iteration_B = 0; + } + else { + group_start_iteration_A = (warp_mma_k + 1) * Detail::kAccessesPerGroupA; + group_start_iteration_B = (warp_mma_k + 1) * Detail::kAccessesPerGroupB; + } + + copy_tiles_and_advance( + iterator_A_real, + iterator_A_imag, + iterator_B_real, + iterator_B_imag, + group_start_iteration_A, + group_start_iteration_B); + + if (warp_mma_k + 2 == Base::kWarpGemmIterations) { + // Inserts a memory fence between stages of cp.async instructions + cutlass::arch::cp_async_fence(); + + // Blocks until all but kStages-2 cp.async stages have committed. 
+ arch::cp_async_wait(); + __syncthreads(); + + // Move to the next stage + iterator_A_real.add_tile_offset({0, 1}); + iterator_A_imag.add_tile_offset({0, 1}); + + iterator_B_real.add_tile_offset({1, 0}); + iterator_B_imag.add_tile_offset({1, 0}); + + this->smem_iterator_A_.add_tile_offset({0, 1}); + this->smem_iterator_B_.add_tile_offset({1, 0}); + + // Add negative offsets to return iterators to the 'start' of the + // circular buffer in shared memory + if (smem_write_stage_idx == (Base::kStages - 1)) { + this->smem_iterator_A_.add_tile_offset({0, -Base::kStages}); + this->smem_iterator_B_.add_tile_offset({-Base::kStages, 0}); + smem_write_stage_idx = 0; + } else { + ++smem_write_stage_idx; + } + + if (smem_read_stage_idx == (Base::kStages - 1)) { + + this->warp_tile_iterator_A_.add_tile_offset( + {0, -Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations}); + + this->warp_tile_iterator_B_.add_tile_offset( + {-Base::kStages * Policy::kPartitionsK * + Base::kWarpGemmIterations, + 0}); + smem_read_stage_idx = 0; + } else { + ++smem_read_stage_idx; + } + + --gemm_k_iterations; + if (gemm_k_iterations == 0) { + iterator_A_real.clear_mask(); + iterator_A_imag.clear_mask(); + iterator_B_real.clear_mask(); + iterator_B_imag.clear_mask(); + } + } + + warp_mma_planar_complex( + warp_mma, + accum, + warp_frag_real_A[warp_mma_k % 2], + warp_frag_imag_A[warp_mma_k % 2], + warp_frag_real_B[warp_mma_k % 2], + warp_frag_imag_B[warp_mma_k % 2]); + } + + } + + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/threadblock/mma_singlestage.h b/include/cutlass/gemm/threadblock/mma_singlestage.h index fd9890a4..32d4d4ee 100644 --- a/include/cutlass/gemm/threadblock/mma_singlestage.h +++ b/include/cutlass/gemm/threadblock/mma_singlestage.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle.h b/include/cutlass/gemm/threadblock/threadblock_swizzle.h index 1beec2c2..03d71d31 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
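The warp_mma_planar_complex() helper above expands one complex multiply-accumulate into four real warp-level MMAs and folds the kTransformA / kTransformB conjugations into sign flips on the corresponding operand fragment. A small host-side check of that algebra, using scalars as stand-ins for the warp-level fragments (planar_mma is an illustrative helper, not part of CUTLASS):

#include <complex>
#include <cstdio>

// Scalar stand-in for the four real MMAs issued by warp_mma_planar_complex().
// conj_a / conj_b mirror kTransformA / kTransformB == ComplexTransform::kConjugate.
std::complex<float> planar_mma(std::complex<float> a, std::complex<float> b,
                               bool conj_a, bool conj_b) {
  float ar = a.real(), ai = a.imag();
  float br = b.real(), bi = b.imag();
  float acc_r = 0.0f, acc_i = 0.0f;
  acc_r += ar * br;                        // real x real
  acc_i += ar * (conj_b ? -bi : bi);       // real x imag
  acc_i += ai * (conj_a ? -br : br);       // imag x real
  // imag x imag: positive only when exactly one operand is conjugated
  acc_r += ai * ((conj_a != conj_b) ? bi : -bi);
  return {acc_r, acc_i};
}

int main() {
  std::complex<float> a(2.0f, -3.0f), b(0.5f, 4.0f);
  for (int ca = 0; ca < 2; ++ca) {
    for (int cb = 0; cb < 2; ++cb) {
      std::complex<float> ref = (ca ? std::conj(a) : a) * (cb ? std::conj(b) : b);
      std::complex<float> got = planar_mma(a, b, ca, cb);
      printf("conjA=%d conjB=%d  reference=(%g,%g)  decomposed=(%g,%g)\n",
             ca, cb, ref.real(), ref.imag(), got.real(), got.imag());
    }
  }
  return 0;
}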
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -99,61 +99,13 @@ int RematerializeBlockDimZ() { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Threadblock swizzling function for GEMMs +template struct GemmIdentityThreadblockSwizzle { CUTLASS_HOST_DEVICE GemmIdentityThreadblockSwizzle() { } - int const kTile = 1; - - /// Returns the shape of the problem in units of logical tiles - CUTLASS_HOST_DEVICE - GemmCoord get_tiled_shape( - GemmCoord problem_size, - GemmCoord tile_size, - int split_k_slices) const { - - return GemmCoord( - (problem_size.m() + tile_size.m() - 1) / tile_size.m(), - (problem_size.n() + tile_size.n() - 1) / tile_size.n(), - split_k_slices); - } - - /// Computes CUDA grid dimensions given a size in units of logical tiles - CUTLASS_HOST_DEVICE - dim3 get_grid_shape(GemmCoord tiled_shape) const { - return dim3(tiled_shape.m() * kTile, (tiled_shape.n() + kTile - 1) / kTile, tiled_shape.k()); - } - - /// Obtains the threadblock offset (in units of threadblock-scoped tiles) - CUTLASS_DEVICE - GemmCoord get_tile_offset() const { - - int block_idx_x = RematerializeBlockIdxX(); - int block_idx_y = RematerializeBlockIdxY(); - - return GemmCoord{ - (block_idx_x / kTile), - (block_idx_y * kTile) + (block_idx_x % kTile), - RematerializeBlockIdxZ() - }; - } -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// A special version of GemmIdentityThreadblockSwizzle. See the choice of kTile below. -template -struct GemmCohortThreadblockSwizzle -{ - const int kTile = - (platform::is_same::value || - platform::is_same::value) - ? 4 - : 1; - - CUTLASS_HOST_DEVICE - GemmCohortThreadblockSwizzle() { } + int const kTile = N; /// Returns the shape of the problem in units of logical tiles CUTLASS_HOST_DEVICE @@ -271,8 +223,11 @@ struct GemmBatchedIdentityThreadblockSwizzle { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Threadblock swizzling function for split-K GEMMs +template struct GemmSplitKIdentityThreadblockSwizzle { + int const kTile = N; + /// Returns the shape of the problem in units of logical tiles CUTLASS_HOST_DEVICE GemmCoord get_tiled_shape( @@ -289,16 +244,20 @@ struct GemmSplitKIdentityThreadblockSwizzle { /// Computes CUDA grid dimensions given a size in units of logical tiles CUTLASS_HOST_DEVICE dim3 get_grid_shape(GemmCoord tiled_shape) const { - return dim3(tiled_shape.m(), tiled_shape.n(), tiled_shape.k()); + return dim3(tiled_shape.m() * kTile, (tiled_shape.n() + kTile - 1) / kTile, tiled_shape.k()); } /// Obtains the threadblock offset (in units of threadblock-scoped tiles) CUTLASS_DEVICE GemmCoord get_tile_offset() const { + + int block_idx_x = RematerializeBlockIdxX(); + int block_idx_y = RematerializeBlockIdxY(); + return GemmCoord{ - RematerializeBlockIdxX(), - RematerializeBlockIdxY(), + (block_idx_x / kTile), + (block_idx_y * kTile) + (block_idx_x % kTile), RematerializeBlockIdxZ() }; } diff --git a/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h new file mode 100644 index 00000000..3c6772af --- /dev/null +++ b/include/cutlass/gemm/warp/default_mma_complex_tensor_op.h @@ -0,0 +1,401 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA 
CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op.h" +#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Data type of A elements + typename ElementA_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename ElementB_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename ElementC_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Multiply-add operator (arch::OpMultiplyAddComplex, arch::OpMultiplyGaussianComplex) + typename Operator_ = arch::OpMultiplyAddComplex> +struct DefaultMmaComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex case +// 4 real-valued mma operations +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix 
production operation (concept: GemmShape) + typename InstructionShape_, + /// Real-valued underlying type of complex-valued A operand + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Real-valued underlying type of complex-valued B operand + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Real-valued underlying type of complex-valued C operand + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddComplex> { + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + RealElementA, + cutlass::layout::RowMajor, + RealElementB, + cutlass::layout::ColumnMajor, + RealElementC, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex case using GaussianComplex operation +// 3 real-valued mma operations +// A = (ar + j ai), B = (br +j bi), D = AB +// P1 = (ar + ai) * br, P2 = - ar * (br - bi), P3 = ai * (br + bi) +// D = dr + j di = (P1 - P3) + j (P1 + P2) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Real-valued underlying type of complex-valued A operand + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Real-valued underlying type of complex-valued B operand + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Real-valued underlying type of complex-valued C operand + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddGaussianComplex> { + + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + RealElementA, + cutlass::layout::RowMajor, + RealElementB, + cutlass::layout::ColumnMajor, + RealElementC, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaGaussianComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; +///////////////////////////////////////////////////////////////////////////////////////////////// + 
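+// Illustrative sketch (an assumption for exposition, not part of the CUTLASS
+// interface): both operator selections above compute the same complex product.
+// arch::OpMultiplyAddComplex issues four real-valued MMAs per complex MMA,
+// while arch::OpMultiplyGaussianComplex issues three, trading one multiply for
+// extra additions. A compile-time check with a = 1 + 2j and b = 3 + 4j:
+namespace detail {
+
+// dr = ar*br - ai*bi must equal P1 - P3, and di = ar*bi + ai*br must equal
+// P1 + P2, where P1 = (ar + ai)*br, P2 = -ar*(br - bi), P3 = ai*(br + bi).
+constexpr bool kGaussianComplexAgrees =
+    ((1.f + 2.f) * 3.f - 2.f * (3.f + 4.f) == 1.f * 3.f - 2.f * 4.f) &&
+    ((1.f + 2.f) * 3.f + (-1.f) * (3.f - 4.f) == 1.f * 4.f + 2.f * 3.f);
+
+static_assert(kGaussianComplexAgrees,
+              "Gaussian (3-mma) and conventional (4-mma) forms must agree");
+
+} // namespace detail
+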
+///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use TF32 tensor operation internally +// 4 real-valued MMA.1688.F32.TF32 operations on TF32 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddComplex> { + + // Complex floating point tensor operation use MMA.1688.F32.TF32 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + tfloat32_t, + cutlass::layout::RowMajor, + tfloat32_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use BF16 tensor operation internally +// 4 real-valued MMA.1688.F32.BF16 operations on BF16 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddFastBF16> { + + // Complex floating point tensor operation use MMA.1688.F32.BF16 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + bfloat16_t, + cutlass::layout::RowMajor, + bfloat16_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + 
TransformA, + TransformB>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// Partial specialization - input and output types are complex*complex +// Use F16 tensor operation internally +// 4 real-valued MMA.1688.F32.F16 operations on F16 +// A = (ar + j ai), B (br +j bi), D = AB +// D = dr + j di = (ar*br - ai*bi) + j (ar*bi + ai*br) +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename WarpShape_, + /// Shape of one matrix production operation (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB> +struct DefaultMmaComplexTensorOp< + WarpShape_, + InstructionShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + TransformA, + TransformB, + arch::OpMultiplyAddFastF16> { + + // Complex floating point tensor operation use MMA.1688.F32.F16 mma instruction + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + half_t, + cutlass::layout::RowMajor, + half_t, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + arch::OpMultiplyAdd>, + cutlass::MatrixShape<1, 1> + >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaComplexTensorOp< + WarpShape_, + complex, + LayoutA, + complex, + LayoutB, + complex, + LayoutC, + Policy, + TransformA, + TransformB>; +}; + +} // namespace warp +} // namespace gemm +} // namespace cutlass diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_tensor_op.h index f64f46f9..ea9ab5c9 100644 --- a/include/cutlass/gemm/warp/default_mma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -60,10 +60,7 @@ template < int PartitionsK = 1, /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. - bool AccumulatorsInRowMajor = false, - /// Number of partitions along N dimension per warp - int PartitionsN = 1 -> + bool AccumulatorsInRowMajor = false> struct DefaultMmaTensorOp; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,9 +89,7 @@ template < int PartitionsK, /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
- bool AccumulatorsInRowMajor, - /// Number of partitions along N dimension per warp - int PartitionsN> + bool AccumulatorsInRowMajor> struct DefaultMmaTensorOp { using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< cutlass::arch::Mma; + Policy, PartitionsK, AccumulatorsInRowMajor>; }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,3 +112,6 @@ struct DefaultMmaTensorOp { ///////////////////////////////////////////////////////////////////////////////////////////////// +#include "default_mma_tensor_op_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h new file mode 100644 index 00000000..06d3afa5 --- /dev/null +++ b/include/cutlass/gemm/warp/default_mma_tensor_op_sm80.h @@ -0,0 +1,186 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Default warp-level GEMM operators selected by data type, size, and layouts of operands. 
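+
+    The specializations in this file accept float-valued A, B, and C operands
+    and compute internally in bfloat16_t, half_t, or tfloat32_t (accumulating
+    in float), selecting the corresponding SM80 Tensor Core instructions.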
+*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" +#include "cutlass/gemm/warp/default_mma_tensor_op.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses BF16 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + GemmShape<16, 8, 8>, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAddFastBF16, + PartitionsK, AccumulatorsInRowMajor> { + + // Uses BF16 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + GemmShape<16, 8, 8>, + 32, + bfloat16_t, cutlass::layout::RowMajor, + bfloat16_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses F16 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. 
+ bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + GemmShape<16, 8, 8>, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAddFastF16, + PartitionsK, AccumulatorsInRowMajor> { + + // Uses F16 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + GemmShape<16, 8, 8>, + 32, + half_t, cutlass::layout::RowMajor, + half_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial Specialization - inputs and output types are float - uses TF32 internally +template < + /// Shape of one matrix production operation (concept: GemmShape) + typename WarpShape_, + /// Shape of target matrix multiply instruction (concept: GemmShape) + typename InstructionShape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC, + /// Number of partitions along K dimension + int PartitionsK, + /// Store the accumulators in row major or column major. Row major is used + /// when output layout is interleaved. + bool AccumulatorsInRowMajor> +struct DefaultMmaTensorOp< + WarpShape_, + InstructionShape_, + float, LayoutA, + float, LayoutB, + float, LayoutC, + arch::OpMultiplyAdd, PartitionsK, AccumulatorsInRowMajor> { + + // Uses TF32 internally + using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< + cutlass::arch::Mma< + InstructionShape_, + 32, + tfloat32_t, cutlass::layout::RowMajor, + tfloat32_t, cutlass::layout::ColumnMajor, + float, cutlass::layout::RowMajor, + arch::OpMultiplyAdd + >, + cutlass::MatrixShape<1, 1> >; + + // Define the warp-level tensor op + using Type = cutlass::gemm::warp::MmaTensorOp< + WarpShape_, float, LayoutA, float, LayoutB, float, LayoutC, + Policy, PartitionsK, AccumulatorsInRowMajor>; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h index 11964944..582fb472 100644 --- a/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h +++ b/include/cutlass/gemm/warp/default_mma_wmma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -61,9 +61,7 @@ template < /// Operator describing the tensor operation typename Operator_ = arch::OpMultiplyAdd, /// Number of partitions along K dimension - int PartitionsK = 1, - /// Number of partitions along N dimension per warp - int PartitionsN = 1 + int PartitionsK = 1 > struct DefaultMmaTensorOpWmma; @@ -90,9 +88,7 @@ template < /// Operator describing the tensor operation typename Operator_, /// Number of partitions along K dimension - int PartitionsK, - /// Number of partitions along N dimension per warp - int PartitionsN> + int PartitionsK> struct DefaultMmaTensorOpWmma { using Policy = cutlass::gemm::warp::MmaTensorOpPolicy< cutlass::arch::Wmma< @@ -116,8 +112,7 @@ struct DefaultMmaTensorOpWmma { ElementC, LayoutC, Policy, - PartitionsK, - PartitionsN>; + PartitionsK>; }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -127,4 +122,3 @@ struct DefaultMmaTensorOpWmma { } // namespace cutlass #endif - diff --git a/include/cutlass/gemm/warp/mma.h b/include/cutlass/gemm/warp/mma.h index 5fb96d9f..16c736e2 100644 --- a/include/cutlass/gemm/warp/mma.h +++ b/include/cutlass/gemm/warp/mma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_complex_tensor_op.h new file mode 100644 index 00000000..2dc72fd3 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op.h @@ -0,0 +1,843 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/functional.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + /// Data type of real & imag members of complex numbers in the SourceFragment + typename RealElement, + /// Destination fragment required by the mma operation + typename DestinationFragment, + /// Source fragment holding complex elements + typename SourceFragment, + /// Number of mma operations performed + typename MmaIterations, + /// Shape of operand elements + typename MmaOperandShape, + /// Complex transform on A operand + ComplexTransform Transform_, + /// Operand A or Operand B + Operand Operand_, + /// Floating-point rounding style + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma; + +// Partial specialization for OperandA and Congruous smem layout +template < + typename RealElement, + typename DestinationFragment, + typename SourceFragment, + typename MmaIterations, + typename MmaOperandShape, + ComplexTransform Transform_, + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma < + RealElement, + DestinationFragment, + SourceFragment, + MmaIterations, + MmaOperandShape, + Transform_, + Operand::kA, + Round_> { + + // + // Type definitions + // + static Operand const kOperand = Operand::kA; + static ComplexTransform const kTransform = Transform_; + static FloatRoundStyle const kRound = Round_; + + // Data type of elements in the destination fragment + using MmaElement = typename DestinationFragment::Element; + + // Numeric convertor MmaElement <= RealElement + using Converter = NumericConverter; + + // Operand layout parameters + using SourceFragmentLayout = layout::ColumnMajor; + static int const kLdm = MmaIterations::kRow * MmaOperandShape::kRow; + + /// Ctor + CUTLASS_DEVICE + UnpackComplexConvertAndPackForMma() {} + + CUTLASS_DEVICE + void operator()(DestinationFragment *dest, SourceFragment const &source) { + + Converter convert_op; + 
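+    // The nested loops below walk the column-major source fragment one
+    // MmaOperandShape tile at a time, convert each element's real and imaginary
+    // parts to the instruction's element type, and pack them in planar form:
+    // real parts go to dest[i], imaginary parts (negated when kTransform is
+    // kConjugate) go to dest[i + MmaIterations::kRow].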
SourceFragmentLayout layout(kLdm); + + CUTLASS_PRAGMA_UNROLL + for(int i=0; i and apply rounding on real and imag parts + MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real()); + MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag()); + + // Unpack rounded complex and pack into DestinationFragment for mma operation + dest[i][pos] = a; + dest[i+MmaIterations::kRow][pos++] = (kTransform == ComplexTransform::kConjugate ? -b : b); + + } + } + } + } +}; + +// Partial specialization for OperandB and Congruous smem layout +template < + typename RealElement, + typename DestinationFragment, + typename SourceFragment, + typename MmaIterations, + typename MmaOperandShape, + ComplexTransform Transform_, + FloatRoundStyle Round_> +struct UnpackComplexConvertAndPackForMma < + RealElement, + DestinationFragment, + SourceFragment, + MmaIterations, + MmaOperandShape, + Transform_, + Operand::kB, + Round_> { + + // + // Type definitions + // + static Operand const kOperand = Operand::kB; + static ComplexTransform const kTransform = Transform_; + static FloatRoundStyle const kRound = Round_; + + // Data type of elements in the destination fragment + using MmaElement = typename DestinationFragment::Element; + + // Numeric convertor MmaElement <= RealElement + using Converter = NumericConverter; + + // Operand layout parameters + using SourceFragmentLayout = layout::RowMajor; + static int const kLdm = MmaIterations::kColumn * MmaOperandShape::kColumn; + + /// Ctor + CUTLASS_DEVICE + UnpackComplexConvertAndPackForMma() {} + + CUTLASS_HOST_DEVICE + void operator()(DestinationFragment *dest, SourceFragment const &source) { + + Converter convert_op; + SourceFragmentLayout layout(kLdm); + + CUTLASS_PRAGMA_UNROLL + for(int i=0; i apply rounding on real and imag parts + MmaElement a = convert_op(source[layout(MatrixCoord{row,col})].real()); + MmaElement b = convert_op(source[layout(MatrixCoord{row,col})].imag()); + + // Unpack rounded complex and pack into DestinationFragment for mma operation + dest[i][pos] = a; + dest[i+MmaIterations::kColumn][pos++] = (kTransform == ComplexTransform::kConjugate ? 
-b : b); + } + } + } + } +}; +} // namespace detail + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool +> +class MmaComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB, + /// Used for partial specialization + typename Enable +> +class MmaComplexTensorOp< + Shape_, + complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicyTensorOp) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using 
TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + + static_assert( + !(Shape::kM % Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, + ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued + /// parts are stored consecutively followed by all imaginary parts. This matches the structure + /// of Tensor Cores which are always real-valued matrix multiplies. + using FragmentC = typename IteratorC::Fragment; + + static_assert( + FragmentC::kElements == 2 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + "Unexpected planar complex fragment length."); + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using MmaOperandA = typename Policy::Operator::FragmentA; + using MmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(MmaOperandA::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the A operand." + "We can geneneralize later."); + + static_assert(MmaOperandB::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the B operand." + "We can geneneralize later."); + + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.real(), a.real(), b.real(), accum.real()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = A[m].real(); + operand_B[0] = B[n].real(); + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.imag(), a.real(), b.imag(), accum.imag()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = A[m].real(); + operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? 
-B[n].imag() : B[n].imag()); + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.real(), -a.imag(), b.imag(), accum.real()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + // A imaginary part is intentionally negated + operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? A[m].imag() : -A[m].imag()); + operand_B[0] = (kTransformB == ComplexTransform::kConjugate ? -B[n].imag() : B[n].imag()); + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A, operand_B, *accum); + } + + // mma(accum.imag(), a.imag(), b.real(), accum.imag()) + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_A; + MmaOperandB operand_B; + + operand_A[0] = (kTransformA == ComplexTransform::kConjugate ? -A[m].imag() : A[m].imag()); + operand_B[0] = B[n].real(); + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A, operand_B, *accum); + } + } + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + //TODO: Implement this + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex: +// Operands data type: complex +// Rounding: float -> tfloat32_t (round half_ulp_truncate nearest) +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex +// +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB, + /// Used for partial specialization + typename Enable +> +class MmaComplexTensorOp< + Shape_, + complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of members of complex multiplicand A + using RealElementA = float; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of members of complex multiplicand B + using RealElementB = float; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of members of complex accumulator matrix C + using RealElementC = float; + + /// Data type of accumulator matrix C + 
using ElementC = complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Underlying arch tag + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = + Array; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = + Array; + + static_assert( + !(Shape::kM % Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of complex products operations performed (one complex product needs four mma instructions) + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpAccumulatorTileIterator< + MatrixShape, + ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'planar complex' in the sense that all real-valued + /// parts are stored consecutively followed by all imaginary parts. This matches the structure + /// of Tensor Cores which are always real-valued matrix multiplies. 
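+  /// Concretely, operator() below addresses the real-part accumulator of mma
+  /// (m, n) at index (m + n * MmaIterations::kRow) and the corresponding
+  /// imaginary-part accumulator at that index plus MmaIterations::kCount.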
+ using FragmentC = typename IteratorC::Fragment; + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + TransformedFragmentA const &A, + TransformedFragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using InstMmaOperandA = typename Policy::Operator::FragmentA; + using InstMmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(platform::is_same, typename Policy::Operator::Shape>::value, + "This implementation only supports MMA.1688 math instructions."); + + static_assert(InstMmaOperandA::kElements == 4, + "This implementation only supports math instructions in which exactly four element is needed for the A operand." + "We can geneneralize later."); + + static_assert(InstMmaOperandB::kElements == 2, + "This implementation only supports math instructions in which exactly two element is needed for the B operand." + "We can geneneralize later."); + + // Instruction Operands A & B holding real part followed by imaginary part for mma operations + InstMmaOperandA const *operand_A = reinterpret_cast(&A); + InstMmaOperandB const *operand_B = reinterpret_cast(&B); + + // + // Accumulate in place + // + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.real(), a.real(), b.real(), accum.real()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A[m], operand_B[n], *accum); + } + + // mma(accum.imag(), a.real(), b.imag(), accum.imag()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A[m], operand_B[n+MmaIterations::kColumn], *accum); + } + + // mma(accum.real(), a.imag(), -b.imag(), accum.real()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // negate OperandB to accumulate -(a.imag()*b.imag()) + // negating OperandB emits less instrucitons than negating OperandA as OperandB has less elements + negate negate_op; + + // Real-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_A[m+MmaIterations::kRow], negate_op(operand_B[n+MmaIterations::kColumn]), *accum); + } + + // mma(accum.imag(), a.imag(), b.real(), accum.imag()) + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Complex-valued accumulator part + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_A[m+MmaIterations::kRow], operand_B[n], *accum); + } + } + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + // Alias types for underlying real-valued matrix multiply operator + using InstMmaOperandA = typename 
Policy::Operator::FragmentA; + using InstMmaOperandB = typename Policy::Operator::FragmentB; + + // + // Define conversions from source type to instruction operands' type + // + + FloatRoundStyle const kRoundA = FloatRoundStyle::round_half_ulp_trunc_dntz; + FloatRoundStyle const kRoundB = FloatRoundStyle::round_half_ulp_trunc_dntz; + + detail::UnpackComplexConvertAndPackForMma < + RealElementA, + InstMmaOperandA, + FragmentA, + MmaIterations, + MatrixShape<2, 2>, + kTransformA, + Operand::kA, + kRoundA> convert_A; + + detail::UnpackComplexConvertAndPackForMma < + RealElementB, + InstMmaOperandB, + FragmentB, + MmaIterations, + MatrixShape<2, 1>, + kTransformB, + Operand::kB, + kRoundB> convert_B; + + // Convert Fragment[A|B] holding complex to InstMmaOperand[A|B] holding InstMmaOperand[A|B]::Element + convert_A(reinterpret_cast(&dst_A), A); + convert_B(reinterpret_cast(&dst_B), B); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO - partial specializations of real*complex and complex*real + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h new file mode 100644 index 00000000..b95af0df --- /dev/null +++ b/include/cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,2448 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. 
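+
+    The first specializations in this file handle 128-bit elements (for example,
+    complex<double>) through the TensorOpMultiplicandCongruous128b layouts
+    defined in cutlass/layout/tensor_op_multiplicand_sm80.h.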
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 128b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 8) && !(Shape::kStrided % 4), "Divisibility."); + + static_assert(sizeof_bits::value == 128, "This is specialized for 128b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 1; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + }; + +private: + + /// Not working on this feature at the moment. 
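+  /// Only the default spacing between MMA instructions (kOpDelta == 1) is
+  /// implemented; other values are rejected by the assertion below.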
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) { + + int quad_pair = lane_id / 8; + int quad = lane_id / 4; + int lane = lane_id % 4; + + int row = (quad & 1) * 4 + (lane ^ quad_pair); + + byte_offset_ = (row + quad_pair * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.contiguous() * Shape::kContiguous) + + (tile_offset.strided() * InstructionShape::kStrided * stride_); + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kStrided; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
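+  /// Equivalent to load_with_byte_offset(frag, 0).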
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+
+    load_with_byte_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset in units of bytes
+      Index byte_offset) const {
+
+    AccessType *fetch_ptr = reinterpret_cast<AccessType *>(&frag);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int s = 0; s < Policy::Iterations::kStrided; ++s) {
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int c = 0; c < Policy::Iterations::kContiguous; ++c) {
+
+        int access_idx = c + s * Policy::Iterations::kContiguous;
+
+        AccessType const *source_ptr = pointer_ +
+          Policy::Delta::kContiguous * c +
+          Policy::Delta::kStrided * s * stride_;
+
+        char const *source_byte_ptr = reinterpret_cast<char const *>(source_ptr) + byte_offset + byte_offset_;
+
+        AccessType const *source = reinterpret_cast<AccessType const *>(source_byte_ptr);
+
+        fetch_ptr[access_idx] = *source;
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a linear offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index pointer_offset) const {
+
+    load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+      /// fragment to load from the tensor
+      Fragment &frag,
+      /// loads a tile with a logical offset in units of whole tiles
+      TensorCoord const &tile_offset,
+      /// loads a tile with a logical offset AND a pointer offset
+      Index byte_offset) const {
+
+    Index pointer_offset =
+      tile_offset.contiguous() * Shape::kContiguous +
+      tile_offset.strided() * InstructionShape::kStrided * stride_;
+
+    byte_offset += sizeof(AccessType) * pointer_offset;
+
+    load_with_byte_offset(frag, byte_offset);
+  }
+
+  /// Notify the iterator which k-group it is currently pointing to.
+  ///
+  /// This does not advance the iterator. Rather, it overrides its internal
+  /// tracking with constant-valued k-group index to enable the compiler to
+  /// fold constants and achieve more efficient code.
+  ///
+  /// This is used by some nontrivial permuted layouts.
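+  /// For this congruous layout no k-group tracking is required, so the method below is a no-op.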
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous128b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance 
dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous128b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// 
Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for complex +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of underlying field of reals. 
+    typename RealElement,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpAccumulatorTileIterator<
+    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = complex<RealElement>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+      "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                      Shape::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
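+  // Worked example (instruction shape chosen only for illustration): with InstructionShape::kM == 8
+  // and InstructionShape::kN == 8, kElementsPerAccess = 2, kRowsPerTile = 8 and kAccumulatorRows = 1.
+  // Lane 5 falls in quad 1 with lane_in_quad 1, so the constructor below offsets its TensorRef to
+  // (row 1, column 2) of the accumulator tile, and it owns two consecutive accumulators per row.
+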
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile. It is assumed that the accumulators
+  /// are stored in a planar complex arrangement with the real parts as entirely contiguous
+  /// followed by the imaginary parts.
+  using Fragment = Array<RealElement, Shape::kCount / kThreads * 2>;
+
+  static int const kRealIndex = 0;
+  static int const kImaginaryIndex = Shape::kCount / kThreads;
+
+private:
+
+  /// Reference to output tensor
+  TensorRef ref_;
+
+public:
+
+  /// Default ctor constructs null iterator
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator() { }
+
+  /// Constructor from TensorRef
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator(
+    TensorRef const &ref,
+    int lane_id
+  ):
+    ref_(ref) {
+
+    int quad = (lane_id >> 2);
+    int lane_in_quad = (lane_id & 3);
+
+    MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess);
+
+    ref_.add_coord_offset(lane_offset);
+  }
+
+  /// Adds a pointer offset to internal pointer(s) to advance through memory
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_pointer_offset(LongIndex offset) {
+    ref_.add_pointer_offset(offset);
+    return *this;
+  }
+
+  /// Advances an iterator along logical dimensions of matrix in units of whole tiles
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) {
+
+    ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn));
+
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator++() {
+    // deliberate no-op
+    return *this;
+  }
+
+  /// Advances the iterator along the advance dimension
+  CUTLASS_HOST_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator--() {
+    // deliberate no-op
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) {
+    add_tile_offset(tile_offset);
+    return *this;
+  }
+
+  ///< advances in units of whole tiles along the logical coordinate space of the tensor
+  CUTLASS_DEVICE
+  MmaTensorOpAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) {
+    add_tile_offset(-tile_offset);
+    return *this;
+  }
+
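+  // Note: operator++ and operator-- above are deliberate no-ops. The accumulator tile does not
+  // advance along the K dimension during the mainloop; movement between whole tiles is expressed
+  // through add_tile_offset() / operator+= instead.
+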
+  /// Loads a fragment from memory at the location pointed to by the iterator.
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+
+            Element z = offset_ref.at({accum_m, accum_n});
+
+            frag[mma_accum_start + row * kElementsPerAccess + col + kRealIndex] = z.real();
+            frag[mma_accum_start + row * kElementsPerAccess + col + kImaginaryIndex] = z.imag();
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< loads a tile with a linear offset
+
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
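+  // Fragment indexing sketch (tile shape assumed for illustration only): accumulator i owned by
+  // this thread is stored as frag[kRealIndex + i] (real part) and frag[kImaginaryIndex + i]
+  // (imaginary part). Assuming Shape = MatrixShape<32, 32>, each of the 32 threads owns 32 complex
+  // accumulators, held as 64 contiguous RealElement values.
+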
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset,             ///< loads a tile with a logical offset in units of whole tiles
+    Index pointer_offset) const {               ///< loads a tile with a logical offset AND a pointer offset
+
+    load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+
+  /// Stores a fragment to memory
+  CUTLASS_HOST_DEVICE
+  void store(Fragment const &frag) const {
+    store_with_pointer_offset(frag, 0);
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_pointer_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index pointer_offset) const {               ///< store a tile with a linear offset
+
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+            int idx = mma_accum_start + row * kElementsPerAccess + col;
+
+            Element z(frag[kRealIndex + idx], frag[kImaginaryIndex + idx]);
+
+            offset_ref.at({accum_m, accum_n}) = z;
+          }
+        }
+      }
+    }
+  }
+
+  /// Stores a fragment to memory with additional pointer offset
+  CUTLASS_DEVICE
+  void store_with_byte_offset(
+    Fragment const &frag,                       ///< fragment to store from the tensor
+    Index byte_offset) const {                  ///< store a tile with a linear offset
+
+    store_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    Fragment const &frag,                       ///< fragment to store to the tensor
+    TensorCoord const &tile_offset) const {     ///< stores a tile with a logical offset in units of whole tiles
+
+    store(frag, tile_offset, 0);
+  }
+
+  /// Stores a fragment to memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void store(
+    /// fragment to store to the tensor
+    Fragment const &frag,
+    /// stores a tile with a logical offset in units of whole tiles
+    TensorCoord const &tile_offset,
+    /// stores a tile with a logical offset AND a pointer offset
+    Index pointer_offset) const {
+    store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 128b elements.
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 8), "Divisibility."); + + static_assert(sizeof_bits::value == 128, "This is specialized for 128b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 1; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<4, 8>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + InstructionShape::kContiguous / Delta::kContiguous, + Shape::kStrided / Delta::kStrided + >; + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0) { + + int quad = lane_id / 4; + int liq = lane_id % 4; + + int c = liq + (quad & 1) * 4; + int s = (quad / 2); + + byte_offset_ = (c + s * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + // Compute the offset in units of elements. Note, the external coordinate system is + // approximately transposed with respect to the tiled internal structure + int offset = + (tile_offset.contiguous() * InstructionShape::kContiguous) * stride_ + + (tile_offset.strided() * Shape::kStrided); + + add_pointer_offset(offset); + + byte_offset_ ^= (tile_offset.contiguous() & 1) * 4 * sizeof(AccessType); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kContiguous; + + byte_offset_ ^= 4 * sizeof(AccessType); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + int access_idx = s + c * Policy::Iterations::kStrided; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c * stride_ + + Policy::Delta::kStrided * s; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = + tile_offset.contiguous() * InstructionShape::kContiguous * stride_ + + tile_offset.strided() * Shape::kStrided; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCrosswise128x4, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the 
advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.column(), -tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCrosswise128x4, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + 
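+  // Note: this column-major adapter only reorders coordinates; addressing is delegated to the
+  // pitch-linear Base iterator. A MatrixCoord (row, column) maps directly to pitch-linear
+  // (contiguous, strided) here, whereas the row-major adapter above swaps the two components.
+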
/// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(layout::PitchLinearCoord(-tile_offset.row(), -tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Congruous shared memory layout +// Warp-level iterators for complex*complex + complex => complex +// The underlying iterators are similar to that for MMA f64*f64 + f64 = f64 +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, cutlass::complex, + cutlass::layout::TensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 8), "Divisibility."); + + /// Element type + using Element = cutlass::complex; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / kElementsPerAccess / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / Policy::Delta::kContiguous; + int access_contiguous = (lane_id % Policy::Delta::kContiguous) ^ access_strided; + + pointer_= reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + + tile_offset.contiguous() * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + add_tile_offset({0, 1}); + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + add_tile_offset({0, -1}); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c + + Policy::Delta::kStrided * s * stride_; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Crosswise shared memory layout +// Warp-level iterators for complex*complex + complex => complex +// The underlying iterators are similar to that for f64*f64 + f64 = f64 +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, complex, + cutlass::layout::TensorOpMultiplicand64bCrosswise, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility."); + + static_assert(sizeof_bits>::value == 64, "This is specialized for 64b accesses."); + + /// Element type + using Element = complex; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<4, 16>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + InstructionShape::kContiguous / Delta::kContiguous, + Shape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + + /// Internal counter for tracking K-group + Index k_group_idx_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / 8; + int access_contiguous = (lane_id % 8); + + byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset / kElementsPerAccess; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) * + stride_ * kElementsPerAccess + + tile_offset.strided() * Shape::kStrided; + + add_pointer_offset(offset); + + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kContiguous; + + // xor ptr + byte_offset_ ^= 0x40; + + ++k_group_idx_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
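+  ///
+  /// Each access performed by the load below resolves to (a summary of the code that follows)
+  ///
+  ///   AccessType const *src = pointer_ + Policy::Delta::kContiguous * c * stride_ +
+  ///                           Policy::Delta::kStrided * s / kElementsPerAccess;
+  ///   char const *addr = reinterpret_cast<char const *>(src) + byte_offset + byte_offset_;
+  ///
+  /// where byte_offset_ holds the lane's starting location computed in the constructor, and
+  /// operator++ advances pointer_ by one instruction-k step and toggles byte_offset_ by 0x40,
+  /// flipping between two 64-byte regions of the swizzled crosswise arrangement.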
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + int access_idx = c * Policy::Iterations::kStrided + s; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c * stride_ + + Policy::Delta::kStrided * s / kElementsPerAccess; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + + Element *exchange_ptr = reinterpret_cast(&frag); + + // exchange on 64b granularity only for fragments held in k=8/2 to k=8 + CUTLASS_PRAGMA_UNROLL + for (int i = Fragment::kElements/2; i < Fragment::kElements; i += 2) { + Element tmp = exchange_ptr[i]; + exchange_ptr[i] = exchange_ptr[i + 1]; + exchange_ptr[i + 1] = tmp; + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = tile_offset.contiguous() * + InstructionShape::kContiguous / + Layout::kElementsPerAccess + + tile_offset.strided() * Shape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + k_group_idx_ = k_group; + } +}; + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h new file mode 100644 index 00000000..bf3d98df --- /dev/null +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op.h @@ -0,0 +1,357 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing warp-level matrix multiply-accumulate operations targeting + Tensor Cores. 
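+
+  The "gaussian" formulation evaluates each complex multiply-accumulate with three
+  real-valued multiply-accumulates instead of four. Writing a = ar + i*ai and
+  b = br + i*bi, the kernel accumulates three partial products
+
+    part1 += (ar + ai) * br
+    part2 += -ar * (br - bi)
+    part3 +=  ai * (br + bi)
+
+  and the complex result is recovered only when the accumulators are stored:
+
+    real(a*b) = part1 - part3
+    imag(a*b) = part1 + part2
+
+  The three parts are held as separate real-valued accumulator tiles; the accompanying
+  accumulator tile iterator performs the packing and the reconstruction.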
+*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/complex.h" +#include "cutlass/numeric_types.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/warp/mma.h" + +#include "cutlass/gemm/warp/mma_tensor_op_policy.h" +#include "cutlass/gemm/warp/mma_tensor_op.h" + +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA = ComplexTransform::kNone, + /// Complex transform on B operand + ComplexTransform TransformB = ComplexTransform::kNone, + /// Used for partial specialization + typename Enable = bool +> +class MmaGaussianComplexTensorOp; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for complex*complex+complex => complex using real-valued TensorOps +template < + /// Size of the Gemm problem - concept: gemm::GemmShape<> + typename Shape_, + /// Data type of A elements + typename RealElementA, + /// Layout of A matrix (concept: MatrixLayout) + typename LayoutA_, + /// Data type of B elements + typename RealElementB, + /// Layout of B matrix (concept: MatrixLayout) + typename LayoutB_, + /// Element type of C matrix + typename RealElementC, + /// Layout of C matrix (concept: MatrixLayout) + typename LayoutC_, + /// Policy describing warp-level MmaTensorOp (concept: MmaTensorOp policy) + typename Policy_, + /// Complex transform on A operand + ComplexTransform TransformA, + /// Complex transform on B operand + ComplexTransform TransformB, + /// Used for partial specialization + typename Enable +> +class MmaGaussianComplexTensorOp< + Shape_, + complex, + LayoutA_, + complex, + LayoutB_, + complex, + LayoutC_, + Policy_, + TransformA, + TransformB, + Enable> { +public: + /// Shape of warp-level matrix operation (concept: GemmShape) + using Shape = Shape_; + + /// Data type of multiplicand A + using ElementA = complex; + + /// Layout of multiplicand A + using LayoutA = LayoutA_; + + /// Data type of multiplicand B + using ElementB = complex; + + /// Layout of multiplicand B + using LayoutB = LayoutB_; + + /// Data type of accumulator matrix C + using ElementC = complex; + + /// Layout of accumulator matrix C + using LayoutC = LayoutC_; + + /// Shape of the warp in units of thread (concept: MmaLanePolicySimt) + using Policy = Policy_; + + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + + /// Underlying 
architecture tag + using ArchTag = typename Policy::Operator::ArchTag; + + /// Complex transform on A operand + static ComplexTransform const kTransformA = TransformA; + + /// Complex transform on B operand + static ComplexTransform const kTransformB = TransformB; + + /// Indicates class of matrix operator + using OperatorClass = arch::OpClassTensorOp; + + /// Number of threads participating in warp-level matrix product + static int const kThreadCount = 32; + +public: + + /// Iterates over the A operand in memory + using IteratorA = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kA, + ElementA, + LayoutA, + MatrixShape, + Policy::OpDelta::kRow, + 32, + 1 + >; + + /// Storage for A tile + using FragmentA = typename IteratorA::Fragment; + + /// Storage for transformed A tile + using TransformedFragmentA = FragmentA; + + /// Iterates over the B operand in memory + using IteratorB = MmaTensorOpMultiplicandTileIterator< + MatrixShape, + Operand::kB, + ElementB, + LayoutB, + MatrixShape, + Policy::OpDelta::kColumn, + 32, + 1 + >; + + /// Storage for B tile + using FragmentB = typename IteratorB::Fragment; + + /// Storage for transformed B tile + using TransformedFragmentB = FragmentB; + + static_assert( + !(Shape::kM % Policy::Operator::Shape::kM) && + !(Shape::kN % Policy::Operator::Shape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + + /// Number of mma operations performed + using MmaIterations = MatrixShape< + Shape::kM / Policy::Operator::Shape::kM, + Shape::kN / Policy::Operator::Shape::kN + >; + + /// Iterates over the C operand in memory + using IteratorC = MmaTensorOpGaussianComplexAccumulatorTileIterator< + MatrixShape, + ElementC, + LayoutC, + typename Policy::Operator::Shape, + typename Policy::OpDelta>; + + /// Storage for C tile, the accumulator. Note, regardless of multiplicand type, this + /// storage arrangement is to be considered 'gaussian complex' in the sense that the accumulation is + /// done in three parts namely part1, part2, and part3. The parts 1, 2, and 3 are stored consecutively + /// in InteratorC::Frament. This matches the structure of Tensor Cores which are always real-valued matrix multiplies. + using FragmentC = typename IteratorC::Fragment; + + static_assert( + FragmentC::kElements == 3 * MmaIterations::kCount * Policy::Operator::FragmentC::kElements, + "Unexpected gaussian complex fragment length."); + +private: + + // + // Data members + // + + /// Underlying real-valued matrix multiply operator (concept: arch::Mma) + typename Policy::Operator mma; + +public: + + // + // Methods + // + + /// Ctor + CUTLASS_DEVICE + MmaGaussianComplexTensorOp() {} + + /// Performs a warp-level matrix multiply-accumulate operation + CUTLASS_DEVICE + void operator()( + FragmentC &D, + FragmentA const &A, + FragmentB const &B, + FragmentC const &C + ) const { + + // Alias types for underlying real-valued matrix multiply operator + using MmaOperandA = typename Policy::Operator::FragmentA; + using MmaOperandB = typename Policy::Operator::FragmentB; + using MmaOperandC = typename Policy::Operator::FragmentC; + + static_assert(MmaOperandA::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the A operand." + "We can geneneralize later."); + + static_assert(MmaOperandB::kElements == 1, + "This implementation only supports math instructions in which exactly one element is needed for the B operand." 
+ "We can geneneralize later."); + + D = C; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + // mma(accum.part1(), (a.real() + a.imag()), b.real(), accum.part1()); + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Asum; + MmaOperandB operand_Br; + + operand_Asum[0] = A[m].real() + ((kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag()); + operand_Br[0] = B[n].real(); + + // accumulator part1 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow); + + mma(*accum, operand_Asum, operand_Br, *accum); + } + + // mma(accum.part2(), -a.real(), (b.real() - b.imag()), accum.part2()); + CUTLASS_PRAGMA_UNROLL + for (int n = MmaIterations::kColumn - 1; n >= 0; --n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Ar; + MmaOperandB operand_Bdiff; + + operand_Ar[0] = -A[m].real(); + operand_Bdiff[0] = B[n].real() - ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag()); + + // accumulator part2 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + MmaIterations::kCount; + + mma(*accum, operand_Ar, operand_Bdiff, *accum); + } + + // mma(accum.part3(), a.imag(), (b.real() + b.imag()), accum.part3()) + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + // Pack operands together. This may result in actual MOVs + MmaOperandA operand_Ai; + MmaOperandB operand_Bsum; + + operand_Ai[0] = (kTransformA == ComplexTransform::kConjugate) ? -A[m].imag() : +A[m].imag(); + operand_Bsum[0] = B[n].real() + ((kTransformB == ComplexTransform::kConjugate) ? -B[n].imag() : +B[n].imag()); + + // accumulator part3 + MmaOperandC *accum = reinterpret_cast(&D) + + (m + n * MmaIterations::kRow) + 2 * MmaIterations::kCount; + + mma(*accum, operand_Ai, operand_Bsum, *accum); + } + } + } + + /// Transform the mma operands to the required types + CUTLASS_DEVICE + void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, + FragmentA const &A, FragmentB const &B) const { + //TODO: Implement this + dst_A = A; + dst_B = B; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// TODO - partial specializations of real*complex and complex*real + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h new file mode 100644 index 00000000..8d9417b0 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_gaussian_complex_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,384 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores. +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/matrix_shape.h" + +#include "cutlass/arch/memory_sm75.h" +#include "cutlass/gemm/gemm.h" + +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" +#include "cutlass/gemm/warp/mma_complex_tensor_op_tile_iterator_sm80.h" + +#include "cutlass/platform/platform.h" +#include "cutlass/fast_math.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace warp { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions, concept: MatrixShape) + typename OpDelta_> +class MmaTensorOpGaussianComplexAccumulatorTileIterator; + +//////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// +/// +/// Partial specialization for complex +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Data type of underlying field of reals. 
+    typename RealElement,
+    /// Shape of one matrix product operation (concept: MatrixShape)
+    typename InstructionShape_,
+    /// Interval between adjacent *MMA instructions (in units of MMA
+    /// instructions, concept: MatrixShape)
+    typename OpDelta_>
+class MmaTensorOpGaussianComplexAccumulatorTileIterator<
+    Shape_, complex<RealElement>, cutlass::layout::RowMajor, InstructionShape_, OpDelta_> {
+ public:
+
+  /// Shape of tile to load (concept: MatrixShape)
+  using Shape = Shape_;
+
+  /// Operand tag
+  static Operand const kOperand = Operand::kC;
+
+  /// Element type
+  using Element = complex<RealElement>;
+
+  /// Layout of source tile
+  using Layout = cutlass::layout::RowMajor;
+
+  /// Shape of one matrix product operation (concept: MatrixShape)
+  using InstructionShape = InstructionShape_;
+
+  /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape)
+  using OpDelta = OpDelta_;
+
+  /// Number of participating threads
+  static int const kThreads = 32;
+
+  /// TensorRef type for loading element from a tensor
+  using TensorRef = TensorRef<Element, Layout>;
+
+  /// Index type
+  using Index = typename TensorRef::Index;
+
+  /// Long Index type
+  using LongIndex = typename TensorRef::LongIndex;
+
+  /// Coordinate for an element in the tensor
+  using TensorCoord = typename TensorRef::TensorCoord;
+
+  /// Internal structure of iterator - made public to enable introspection
+  struct Policy {
+    static_assert(
+        !(Shape::kRow % InstructionShape::kM) &&
+            !(Shape::kColumn % InstructionShape::kN),
+        "Shape of warp-level Mma must be divisible by operator shape.");
+
+    static_assert(platform::is_same<TensorCoord, MatrixCoord>::value,
+                  "Layouts must be defined for logical MatrixCoord coordinate space.");
+
+    /// Number of mma operations performed
+    using MmaIterations = MatrixShape<Shape::kRow / InstructionShape::kM,
+                                      Shape::kColumn / InstructionShape::kN>;
+  };
+
+private:
+
+  // Assume accumulator tile is an arrangement of 8-by-8 tiles replicated over the entire
+  // shape, with each quad mapped to one row and each thread mapped to 1/4 of the elements
+  // of that row. The accumulators within one row are assumed to be consecutive.
+  static int const kElementsPerAccess = InstructionShape::kN / 4;
+  static int const kRowsPerTile = 8;
+  static int const kAccumulatorRows = InstructionShape::kM / kRowsPerTile;
+
+public:
+
+  //
+  // Derived quantities
+  //
+
+  /// Fragment object holding a thread's part of a tile.
It is assumed that the accumulators + /// are stored in a gaussian complex arrangement with parts 1, 2, and 3 as entirely contiguous + /// arranged as [part1, part2, part3] + using Fragment = Array; + + static int const kPart1Index = (Shape::kCount / kThreads) * 0; + static int const kPart2Index = (Shape::kCount / kThreads) * 1; + static int const kPart3Index = (Shape::kCount / kThreads) * 2; + +private: + + /// Reference to output tensor + TensorRef ref_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator( + TensorRef const &ref, + int lane_id + ): + ref_(ref) { + + int quad = (lane_id >> 2); + int lane_in_quad = (lane_id & 3); + + MatrixCoord lane_offset(quad, lane_in_quad * kElementsPerAccess); + + ref_.add_coord_offset(lane_offset); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator &add_pointer_offset(LongIndex offset) { + ref_.add_pointer_offset(offset); + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + ref_.add_coord_offset(tile_offset * make_Coord(Shape::kRow, Shape::kColumn)); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator++() { + // deliberate no-op + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator--() { + // deliberate no-op + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpGaussianComplexAccumulatorTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
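+  ///
+  /// When an existing complex accumulator z is loaded, it is folded into the
+  /// three-part gaussian form as
+  ///
+  ///   part1 = z.real() + z.imag()
+  ///   part2 = -z.real()
+  ///   part3 =  z.imag()
+  ///
+  /// so that the store-time reconstruction (part1 - part3, part1 + part2)
+  /// yields (z.real(), z.imag()) again.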
+  CUTLASS_HOST_DEVICE
+  void load(Fragment &frag) const {
+    load_with_pointer_offset(frag, 0);
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_pointer_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index pointer_offset) const {               ///< loads a tile with a linear offset
+
+    TensorRef offset_ref(ref_);
+    offset_ref.add_pointer_offset(pointer_offset);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) {
+      CUTLASS_PRAGMA_UNROLL
+      for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) {
+
+        int mma_accum_start = kAccumulatorRows * kElementsPerAccess *
+          (mma_n * Policy::MmaIterations::kRow + mma_m);
+
+        CUTLASS_PRAGMA_UNROLL
+        for (int row = 0; row < kAccumulatorRows; ++row) {
+          CUTLASS_PRAGMA_UNROLL
+          for (int col = 0; col < kElementsPerAccess; ++col) {
+            int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow +
+                          row * kRowsPerTile;
+            int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col;
+
+            Element z = offset_ref.at({accum_m, accum_n});
+
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart1Index] = z.real() + z.imag();
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart2Index] = -z.real();
+            frag[mma_accum_start + row * kElementsPerAccess + col + kPart3Index] = z.imag();
+          }
+        }
+      }
+    }
+  }
+
+  /// Loads a fragment from memory with additional logical offset
+  CUTLASS_DEVICE
+  void load_with_byte_offset(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    Index byte_offset) const {                  ///< loads a tile with a linear offset
+
+    load_with_pointer_offset(frag, byte_offset / sizeof(Element));
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+  CUTLASS_DEVICE
+  void load(
+    Fragment &frag,                             ///< fragment to load from the tensor
+    TensorCoord const &tile_offset) const {     ///< loads a tile with a logical offset in units of whole tiles
+
+    load(frag, tile_offset, 0);
+  }
+
+  /// Loads a fragment from memory with logical offset in units of whole tiles.
+ CUTLASS_DEVICE + void load( + Fragment &frag, ///< fragment to load from the tensor + TensorCoord const &tile_offset, ///< loads a tile with a logical offset in units of whole tiles + Index pointer_offset) const { ///< loads a tile with a logical offset AND a pointer offset + + load_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } + + /// Stores a fragment to memory + CUTLASS_HOST_DEVICE + void store(Fragment const &frag) const { + store_with_pointer_offset(frag, 0); + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_pointer_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index pointer_offset) const { ///< store a tile with a linear offset + + TensorRef offset_ref(ref_); + offset_ref.add_pointer_offset(pointer_offset); + + CUTLASS_PRAGMA_UNROLL + for (int mma_n = 0; mma_n < Policy::MmaIterations::kColumn; ++mma_n) { + CUTLASS_PRAGMA_UNROLL + for (int mma_m = 0; mma_m < Policy::MmaIterations::kRow; ++mma_m) { + + int mma_accum_start = kAccumulatorRows * kElementsPerAccess * + (mma_n * Policy::MmaIterations::kRow + mma_m); + + CUTLASS_PRAGMA_UNROLL + for (int row = 0; row < kAccumulatorRows; ++row) { + CUTLASS_PRAGMA_UNROLL + for (int col = 0; col < kElementsPerAccess; ++col) { + int accum_m = mma_m * InstructionShape::kM * OpDelta::kRow + + row * kRowsPerTile; + int accum_n = mma_n * InstructionShape::kN * OpDelta::kColumn + col; + int idx = mma_accum_start + row * kElementsPerAccess + col; + + Element z(frag[kPart1Index + idx] - frag[kPart3Index + idx], + frag[kPart1Index + idx] + frag[kPart2Index + idx]); + + offset_ref.at({accum_m, accum_n}) = z; + } + } + } + } + } + + /// Stores a fragment to memory with additional pointer offset + CUTLASS_DEVICE + void store_with_byte_offset( + Fragment const &frag, ///< fragment to store from the tensor + Index byte_offset) const { ///< store a tile with a linear offset + + store_with_pointer_offset(byte_offset / sizeof(Element)); + } + + /// Stores a fragment to memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + Fragment &frag, ///< fragment to store to the tensor + TensorCoord const &tile_offset) const { ///< stores a tile with a logical offset in units of whole tiles + + store(frag, tile_offset, 0); + } + + /// Stores a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void store( + /// fragment to store to the tensor + Fragment const &frag, + /// stores a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// stores a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + store_with_pointer_offset(frag, ref_.offset(tile_offset) + pointer_offset); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_simt.h b/include/cutlass/gemm/warp/mma_simt.h index 9166fe7c..1bf23c74 100644 --- a/include/cutlass/gemm/warp/mma_simt.h +++ b/include/cutlass/gemm/warp/mma_simt.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -147,6 +147,9 @@ public: dp4a_type >; + /// Shape of the underlying instruction + using InstructionShape = GemmShape<1,1,use_dp4a ? 4 : 1>; + public: /// Iterates over the A operand in memory diff --git a/include/cutlass/gemm/warp/mma_simt_policy.h b/include/cutlass/gemm/warp/mma_simt_policy.h index 78247433..6abd0bf6 100644 --- a/include/cutlass/gemm/warp/mma_simt_policy.h +++ b/include/cutlass/gemm/warp/mma_simt_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h index 1d47e8f1..ed1e5987 100644 --- a/include/cutlass/gemm/warp/mma_simt_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_simt_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op.h b/include/cutlass/gemm/warp/mma_tensor_op.h index 4e082db1..3eff7b90 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op.h +++ b/include/cutlass/gemm/warp/mma_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -39,12 +39,16 @@ #include "cutlass/arch/memory_sm75.h" #include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + #include "cutlass/gemm/gemm.h" #include "cutlass/gemm/warp/mma.h" #include "cutlass/gemm/warp/mma_tensor_op_policy.h" #include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h" +#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -77,6 +81,27 @@ struct ConvertAndPack { } }; +template +struct ConvertAndPack { + + using Converter = NumericArrayConverter; + + CUTLASS_HOST_DEVICE + Array operator()(Array const &source) { + Converter converter; + + Array tmp; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N; ++i) { + int idx = (((i << 1) & 2) | ((i >> 1) & 1) | (i & 0xfffffffc)); + tmp[i] = source[idx]; + } + + return converter(tmp); + } +}; + template struct ConvertAndPack { @@ -130,8 +155,6 @@ template < /// Store the accumulators in row major or column major. Row major is used /// when output layout is interleaved. 
bool AccumulatorsInRowMajor = false, - /// PartitionsN indicating how many PartitionsN for multiplicand B - int PartitionsN_ = 1, /// Used for partial specialization typename Enable = bool > @@ -167,6 +190,9 @@ public: /// Indicates class of matrix operator using OperatorClass = arch::OpClassTensorOp; + /// Shape of underlying instruction + using InstructionShape = typename Policy::Operator::Shape; + /// Complex transform on A operand static ComplexTransform const kTransformA = ComplexTransform::kNone; @@ -179,9 +205,6 @@ public: /// Number of partitions along K dimension static int const kPartitionsK = PartitionsK_; - /// PartitionsN indicating how many PartitionsN for multiplicand B - static int const kPartitionsN = PartitionsN_; - public: /// Iterates over the A operand in memory @@ -228,9 +251,7 @@ private: /// Number of mma operations performed using MmaIterations = MatrixShape< Shape::kM / Policy::Operator::Shape::kM, - (Shape::kN / Policy::Operator::Shape::kN / kPartitionsN > 0) ? - Shape::kN / Policy::Operator::Shape::kN / kPartitionsN : - 1 + Shape::kN / Policy::Operator::Shape::kN >; public: @@ -254,8 +275,8 @@ public: FragmentC &D, TransformedFragmentA const &A, TransformedFragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) const { + FragmentC const &C + ) const { using MmaOperandA = typename Policy::Operator::FragmentA; using MmaOperandB = typename Policy::Operator::FragmentB; @@ -267,8 +288,7 @@ public: MmaOperandB const *ptr_B = reinterpret_cast(&B); MmaOperandC *ptr_D = reinterpret_cast(&D); - // The offset of multilicand B for current partition - const int n_off = partitionN_idx * FragmentB::kElements / MmaOperandB::kElements / kPartitionsN; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) // Serpentine visitation order maximizing reuse of Rb CUTLASS_PRAGMA_UNROLL for (int n = 0; n < MmaIterations::kColumn; ++n) { @@ -286,24 +306,46 @@ public: ptr_D[n + m_serpentine * MmaIterations::kColumn]); } else { mma( - ptr_D[m_serpentine + (n + n_off) * MmaIterations::kRow], + ptr_D[m_serpentine + n * MmaIterations::kRow], ptr_A[m_serpentine], - ptr_B[n + n_off], - ptr_D[m_serpentine + (n + n_off) * MmaIterations::kRow]); + ptr_B[n], + ptr_D[m_serpentine + n * MmaIterations::kRow]); } } } + #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + // Serpentine visitation order maximizing reuse of Ra + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; ++m) { + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; ++n) { + + int n_serpentine = ((m % 2) ? 
(MmaIterations::kColumn - 1 - n) : n); + + if (AccumulatorsInRowMajor) { // matrix B is reordered + mma( + ptr_D[n_serpentine + m * MmaIterations::kColumn], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[n_serpentine + m * MmaIterations::kColumn]); + } else { + mma(ptr_D[m + n_serpentine * MmaIterations::kRow], + ptr_A[m], + ptr_B[n_serpentine], + ptr_D[m + n_serpentine * MmaIterations::kRow]); + } + } + } + #else + assert(0); + #endif } /// Transform the mma operands to the required types CUTLASS_DEVICE void transform(TransformedFragmentA &dst_A, TransformedFragmentB &dst_B, FragmentA const &A, FragmentB const &B) const { - bool midway_depstage = - !(platform::is_same::value && - platform::is_same::value); // // Define conversions from source type to instruction type @@ -314,6 +356,7 @@ public: FloatRoundStyle const kRoundB = PreferredRoundingMode::kRound; + #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800) detail::ConvertAndPack convert_A; @@ -331,6 +374,26 @@ public: ptr_dst_B[0] = convert_B(ptr_B[0]); ptr_dst_B[1] = convert_B(ptr_B[1]); + #elif defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + detail::ConvertAndPack + convert_A; + NumericArrayConverter + convert_B; + Array const *ptr_A = + reinterpret_cast const *>(&A); + Array * + ptr_dst_A = reinterpret_cast *>(&dst_A); + + dst_B = convert_B(B); + + ptr_dst_A[0] = convert_A(ptr_A[0]); + ptr_dst_A[1] = convert_A(ptr_A[1]); + #else + assert(0); + #endif } }; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h new file mode 100644 index 00000000..85f5009d --- /dev/null +++ b/include/cutlass/gemm/warp/mma_tensor_op_fragment_iterator.h @@ -0,0 +1,428 @@ +/*! \file + \brief This defines a "fragment" iterator for visiting the fragments of a warp tile + that participate in one warp-level mma operation. + + Typically, this is used to access the accumulator tile/fragement of a warp-level mma operation. + The accumulator tile is then partitioned into smaller tiles/fragments that can be fed into + next warp-level mma operation. + + This iterator is necessary to accomplish warp-level mma fusion where the accumulator tile is + reused as multiplicand tile for the next mma. 
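+
+  A typical usage pattern looks like the following sketch, in which warp_mma1, frag_A1,
+  and epilogue_op are placeholders for the second warp-level mma, its A-operand fragment,
+  and the elementwise output operator applied between the two GEMMs:
+
+    MmaTensorOpFragmentIterator frag_iter(accum0);   // accum0: accumulators of the first mma
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int k = 0; k < Policy::kIterations; ++k) {
+      frag_iter.load(frag_A1, epilogue_op);          // apply output op, produce next A operand
+      ++frag_iter;
+      // ... issue the second warp-level mma (warp_mma1) with frag_A1 ...
+    }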
+ +*/ + +#pragma once + +#include "cutlass/cutlass.h" + +#include "cutlass/array.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/layout/matrix.h" +#include "cutlass/layout/tensor.h" +#include "cutlass/numeric_conversion.h" + +namespace cutlass { +namespace gemm { +namespace warp { + + +//////////////////////////////////////////////////////////////////////////////// + +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Size of the accumulation tile shape (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Layout of operand in memory + typename Layout_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on the fragment + typename OutputOp_, + /// Whether beta is zero + bool IsBetaZero_ > +class MmaTensorOpFragmentIterator; + + +// Partial specialization for col-major accumulator tile +// And Element type is the same as Accumulator Element type + +template < + /// Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on fragment + typename OutputOp_> +class MmaTensorOpFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Output operation on fragment + using OutputOp = OutputOp_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = 
AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. + using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, OutputOp output_op) const { + + if (output_op.is_source_needed()) //beta must be zero + assert(0); + + AccessType src_fragment; + src_fragment.clear(); + + + AccessType *frag_ptr = reinterpret_cast(&frag); + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int n = 0; n < MmaIterations::kColumn; n++) { + for (int m = 0; m < MmaIterations::kRow; m++) { + int accumulator_access_offset = + (n + index_n) * AccumulatorIterations::kRow + m + index_m; + + frag_ptr[n * MmaIterations::kRow + m].clear(); + if(!(is_residual_tile_ && index_ >= kResidualIndex)) + //frag_ptr[n * MmaIterations::kRow + m] = accumulators_[accumulator_access_offset]; + frag_ptr[n * MmaIterations::kRow + m] = output_op(accumulators_[accumulator_access_offset], src_fragment); + } + } + } + +}; + +// Partial specialization for row-major accumulator tile + +template < + /// Shape of warp tile to load (concept: MatrixShape) + typename Shape_, + /// Shape of the warp accumulation tile (concept: MatrixShape) + typename AccumulatorShape_, + /// KBlocks columns to compute residual + int KBlocksColumn_, + /// Accumulator Element type + typename ElementAccumulator_, + /// Element type + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Output operation on fragment + typename OutputOp_> +class MmaTensorOpFragmentIterator { + public: + + /// Shape of warp tile to load (concept: MatrixShape) + using Shape = Shape_; + + /// Shape of the warp accumulation tile (concept: MatrixShape) + using AccumulatorShape = AccumulatorShape_; + + /// KBlocks columns to compute residual + static int const kKBlockColumn = KBlocksColumn_; + + /// 
Accumulator Element type + using ElementAccumulator = ElementAccumulator_; + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajor; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Output operation on fragment + using OutputOp = OutputOp_; + + /// Whether beta is zero + static bool const IsBetaZero = true; + + /// Number of participating threads + static int const kThreads = 32; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kRow % InstructionShape::kM) && + !(Shape::kColumn % InstructionShape::kN), + "Shape of warp-level Mma must be divisible by operator shape."); + static_assert( + !(AccumulatorShape::kRow % Shape::kRow) && + !(AccumulatorShape::kColumn % Shape::kColumn), + "Shape of Warp Accumulator must be divisible by warp shape."); + static_assert( + !(kKBlockColumn % Shape::kColumn), + "KBlock size must be divisible by warp shape."); + + /// Number of times this iterator can be incremented + static int const kIterations = AccumulatorShape::kCount / Shape::kCount; + }; + +private: + + static int const kElementsPerAccess = InstructionShape::kM * InstructionShape::kN / kThreads; + + /// Number of mma operations performed by a warp + using MmaIterations = MatrixShape; + /// Number of mma operations performed by the entire accumulator + using AccumulatorIterations = MatrixShape; + + /// Number of K iterations + static int const kKBlockIterations = (AccumulatorShape::kColumn + kKBlockColumn - 1) / kKBlockColumn; + static int const kResidualColumn = AccumulatorShape::kColumn - (kKBlockIterations - 1) * kKBlockColumn; + static int const kKBlockColumnIterations = kKBlockColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + static int const kResidualIndex = kResidualColumn / Shape::kColumn + * (AccumulatorShape::kRow / Shape::kRow); + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + /// This is the fragment size produced by one access of the iterator. 
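+  /// (With a 16-by-8 mma instruction, kElementsPerAccess = 16 * 8 / 32 = 4, so each of
+  /// the 32 threads contributes four accumulator elements per mma operation to this
+  /// fragment.)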
+ using Fragment = Array; + + /// Accumulator Fragment object + using AccumulatorFragment = Array; + + +private: + + /// Internal access type + using AccessType = Array; + using FragmentAccessType = Array; + +private: + // + // Data members + // + + /// Accumulator tile + AccessType const *accumulators_; + + /// Internal index + int index_; + + /// Used to access residual tile first + bool is_residual_tile_; + +public: + /// Constructs an iterator + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator(AccumulatorFragment const &accum) + : accumulators_(reinterpret_cast(&accum)), + index_(0), is_residual_tile_(true) {} + + /// Add offset + CUTLASS_HOST_DEVICE + void add_offset(int index_offset) { + index_ += index_offset; + if(is_residual_tile_ && index_ >= kKBlockColumnIterations) { + index_ = index_ - kKBlockColumnIterations + kResidualIndex; + is_residual_tile_ = false; + } + } + + /// Increments + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator++() { + add_offset(1); + return *this; + } + + /// Decrements + CUTLASS_HOST_DEVICE + MmaTensorOpFragmentIterator &operator--() { + add_offset(-1); + return *this; + } + + /// Loads a fragment from the referenced part of the accumulator tile + CUTLASS_HOST_DEVICE + void load(Fragment &frag, OutputOp output_op) const { + + if (output_op.is_source_needed()) //beta must be zero + assert(0); + + FragmentAccessType src_fragment; + src_fragment.clear(); + + FragmentAccessType *frag_ptr = reinterpret_cast(&frag); +// NumericArrayConverter fragmentConverter; + + int index_m = (index_ * MmaIterations::kRow) % AccumulatorIterations::kRow; + int index_n = (index_ * MmaIterations::kRow) / AccumulatorIterations::kRow + * MmaIterations::kColumn; + + CUTLASS_PRAGMA_UNROLL + for (int m = 0; m < MmaIterations::kRow; m++) { + for (int n = 0; n < MmaIterations::kColumn; n++) { + int accumulator_access_offset = + (m + index_m) * AccumulatorIterations::kColumn + n + index_n; + + frag_ptr[m * MmaIterations::kColumn + n].clear(); + if(!(is_residual_tile_ && index_ >= kResidualIndex)) +// frag_ptr[m * MmaIterations::kColumn + n] = fragmentConverter(accumulators_[accumulator_access_offset]); + frag_ptr[m * MmaIterations::kColumn + n] = output_op(accumulators_[accumulator_access_offset], src_fragment); + } + } + } + +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_tensor_op_policy.h b/include/cutlass/gemm/warp/mma_tensor_op_policy.h index 82386011..68b28bff 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_policy.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_policy.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h index 59515b5b..063c77f9 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -106,6 +106,9 @@ public: /// Architecture tag using ArchTag = arch::Sm70; + /// Underlying instruction shape + using InstructionShape = typename Policy::Operator::Shape; + /// Complex transform on A operand static ComplexTransform const kTransformA = ComplexTransform::kNone; @@ -210,8 +213,7 @@ public: FragmentC &D, FragmentA const &A, FragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) { + FragmentC const &C) { using MmaOperandA = typename Policy::Operator::FragmentA; using MmaOperandB = typename Policy::Operator::FragmentB; diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h index 45048d38..1a8fa4f9 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -229,8 +229,11 @@ public: k_group_idx_(0) { int quad_pair = (lane_id >> 3); + int quad_quad = (lane_id >> 4); int lane_in_quad = (lane_id & 3); int lane_in_quad_pair = (lane_id & 7); + int lane_in_quad_quad = (lane_id & 15); + CUTLASS_PRAGMA_UNROLL for (int i = 0; i < kPointerCount; ++i) { int partition_contiguous_idx = -1; @@ -242,6 +245,24 @@ public: access_contiguous_idx = (quad_pair ^ lane_in_quad); access_strided_idx = lane_in_quad_pair; } + else if (Policy::LdsmShape::kContiguous == 2 && + kOperand == Operand::kA) { + // Matrix multiply 16816 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1)); + access_contiguous_idx = + (((quad_pair & 1) + ((i & 1) << 1)) ^ lane_in_quad); + access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3); + } else if (Policy::LdsmShape::kContiguous == 2 && + kOperand == Operand::kB) { + // Matrix multiply 16816 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = ((lane_in_quad_pair >> 2) ^ (i >> 1)); + access_contiguous_idx = ((quad_quad + ((i & 1) << 1)) ^ lane_in_quad); + access_strided_idx = lane_in_quad_quad; + } int access_contiguous = partition_contiguous_idx * Layout::PartitionShape::kContiguous + access_contiguous_idx; @@ -436,6 +457,364 @@ public: }; //////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread MMA.TF32 NT TensorOps. 
It +/// uses LDS.32 to load from shared memory and therefore must be initialized +/// with a TensorRef to shared memory. +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous<32, 32>, InstructionShape_, + OpDelta_, 32, PartitionsK_> { + public: + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand == Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for " + "A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous<32, 32>; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: + /// MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Internal structure of iterator - made public to enable introspection + struct Policy { + static_assert( + !(Shape::kContiguous % InstructionShape::kContiguous), + "Shape of warp-level Mma must be divisible by operator shape."); + + // Determine number of elements along outer dimension per individual LDS.32 + // op. Every one warp of LDS.32 loads 8x4 elements + static int const kLdsOpInner = Layout::TileShape::kStrided; + static int const kLdsOpOuter = kThreads / kLdsOpInner; + + static_assert(!(Shape::kContiguous % kLdsOpOuter), + "Shape of warp-level mma must be divisible by LDS.32's " + "fundamental tile size."); + + static_assert(!(Shape::kStrided % kLdsOpInner), + "Shape of warp-level mma must be divisible by LDS.32's " + "fundamental tile size."); + + /// Number of LDS.32 instructions needed by one MMA instruction + /// 1684 A 2x1 + /// 1684 B 1x1 + /// 1688 A 2x2 + /// 1688 B 1x2 + static int const LdsShapeContiguous = + InstructionShape::kContiguous / kLdsOpOuter; + static int const LdsShapeStrided = InstructionShape::kStrided / kLdsOpInner; + using LdsShape = + layout::PitchLinearShape; + + /// Number and arrangement of LDS instructions + using LdsIterations = layout::PitchLinearShape< + Shape::kContiguous / LdsShapeContiguous / kLdsOpOuter, 1>; + + /// Number of groups for each tile + static int const kGroupsPerTile = + Shape::kStrided / InstructionShape::kStrided; + }; + + private: + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Number of internal pointers needed to reference shared memory + static int const kPointerCount = Layout::TileShape::kContiguous * + Layout::kElementsPerAccess / + Policy::kLdsOpOuter; + + /// Vectorized access is not used + static int const kElementsPerAccess = 1; + + /// Pointer type used for accesses + using AccessType = Element; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + + public: + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + + private: + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_[kPointerCount]; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + + public: + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() : stride_(0), byte_offset_(0) {} + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator(TensorRef const &ref, int lane_id) + : stride_(ref.stride(0)), byte_offset_(0), k_group_idx_(0) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kPointerCount; ++i) { + int access_strided = lane_id % Policy::kLdsOpInner; + int access_contiguous = (lane_id / Policy::kLdsOpInner) + + (access_strided ^ i) * Policy::kLdsOpOuter; + + pointer_[i] = reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + } + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset( + TensorCoord const &tile_offset) { + int contiguous_offset = tile_offset.contiguous(); + if (Shape::kContiguous == + Layout::TileShape::kContiguous * Layout::kElementsPerAccess / 2) { + if (tile_offset.contiguous() % 2) { + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < kPointerCount / 2; ++i) { + AccessType const *tmp_pointer = pointer_[i]; + pointer_[i] = pointer_[i + kPointerCount / 2]; + pointer_[i + kPointerCount / 2] = tmp_pointer; + } + } + contiguous_offset = (tile_offset.contiguous() >> 1) << 1; + } + + int offset = (tile_offset.strided() * InstructionShape::kStrided) * stride_ + + contiguous_offset * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator++() { + add_tile_offset({0, 1}); + + if (kPartitionsK > 1) { + ++k_group_idx_; + // Jump to next stage + if (k_group_idx_ == Policy::kGroupsPerTile) { + k_group_idx_ = 0; + add_tile_offset( + {0, ((kPartitionsK - 1) * Policy::kGroupsPerTile)}); + } + } + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &operator--() { + byte_offset_ -= stride_ * InstructionShape::kStrided * sizeof(Element) * + kElementsPerAccess; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of + ///< the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator+=( + TensorCoord const &tile_offset) { + 
add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of + ///< the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &operator-=( + TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { load_with_byte_offset(frag, 0); } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + Element *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::LdsIterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::LdsIterations::kContiguous; ++c) { + CUTLASS_PRAGMA_UNROLL + for (int ss = 0; ss < Policy::LdsShape::kStrided; ++ss) { + CUTLASS_PRAGMA_UNROLL + for (int cc = 0; cc < Policy::LdsShape::kContiguous; ++cc) { + int access_idx = + cc + (ss + (c + s * Policy::LdsIterations::kContiguous) * + Policy::LdsShape::kStrided) * + Policy::LdsShape::kContiguous; + int access_idx_contiguous = cc + c * Policy::LdsShape::kContiguous; + int access_idx_strided = + (ss + s * Policy::LdsShape::kStrided) * Policy::kLdsOpInner; + + AccessType const *source_ptr = + pointer_[access_idx_contiguous % kPointerCount] + + Layout::TileShape::kContiguous * Layout::kElementsPerAccess * + (access_idx_contiguous / kPointerCount) + + access_idx_strided * stride_; + + char const *source_byte_ptr = + reinterpret_cast(source_ptr) + byte_offset + + byte_offset_; + + fetch_ptr[access_idx] = + *reinterpret_cast(source_byte_ptr); + } + } + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. 
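+  ///
+  /// The tile offset is converted to a pointer offset in units of AccessType
+  /// (contiguous tiles advance within a row; strided tiles advance whole
+  /// InstructionShape::kStrided rows of the shared-memory stride), which is
+  /// then folded into the byte offset passed to the two-argument overload
+  /// defined above.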
+ CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / + Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + // no op + } +}; + +//////////////////////////////////////////////////////////////////////////////// + /// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared /// memory and therefore must be initialized with a TensorRef to shared memory. /// @@ -1069,7 +1448,6 @@ class MmaTensorOpMultiplicandTileIterator< k_group_idx_(0) { // Warp level iterator at most use double buffer to hide latency. If there // are more than 2 sections, every stage should have more than 1 section. - // TODO: refactor code after every case is implemented // Turing silicon requires all 32 threads in a warp provide valid addresses // even for LDSM.1 and LDSM.2 @@ -1077,6 +1455,8 @@ class MmaTensorOpMultiplicandTileIterator< lane_id = lane_id % (Policy::LdsmShape::kCount * Policy::kLdsmOpInner); #endif + int quad_quad = (lane_id >> 4); + int quad_pair = (lane_id >> 3); int lane_in_pair = (lane_id & 1); int lane_in_quad = (lane_id & 3); int lane_in_quad_pair = (lane_id & 7); @@ -1100,6 +1480,26 @@ class MmaTensorOpMultiplicandTileIterator< (lane_in_quad_quad / Layout::kFactor)); access_strided_idx = lane_id / Layout::kFactor; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Integer matrix multiply 16832 A + partition_contiguous_idx = lane_in_quad / factor_in_partition; + access_strided_idx = lane_in_quad_quad / Layout::kFactor; + access_contiguous_idx = + ((lane_in_pair * factor_in_partition + quad_quad) ^ + access_strided_idx); + } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Integer matrix multiply 16832 B + partition_contiguous_idx = lane_in_quad / factor_in_partition; + access_strided_idx = lane_in_quad_pair / Layout::kFactor + quad_quad * 2; + access_contiguous_idx = + ((lane_in_pair * factor_in_partition + ((lane_id & 8) >> 3)) ^ + access_strided_idx); + } } else if (Layout::kFactor == 2) { // Super Matrix multiply kBlock = 32 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { @@ -1113,6 +1513,28 @@ class MmaTensorOpMultiplicandTileIterator< access_contiguous_idx = (lane_in_quad_pair / Layout::kFactor); access_strided_idx = lane_id / Layout::kFactor; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Matrix multiply 16816|1688.TF32 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = (lane_id % Layout::kFactor); + access_contiguous_idx = + (quad_quad ^ (lane_in_quad_pair / Layout::kFactor)); + access_strided_idx = 
(lane_in_quad_quad / Layout::kFactor); + } else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Matrix multiply 16816|1688.TF32 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = (lane_id % Layout::kFactor); + access_contiguous_idx = + ((quad_pair & 1) ^ (lane_in_quad_pair / Layout::kFactor)); + access_strided_idx = + (lane_in_quad_pair + (lane_id >> 4 << 3)) / Layout::kFactor; + } } else if (Layout::kFactor == 1) { // Super Matrix multiply kBlock = 64 if (Policy::LdsmShape::kStrided == Policy::LdsmShape::kCount) { @@ -1124,6 +1546,25 @@ class MmaTensorOpMultiplicandTileIterator< access_contiguous_idx = lane_in_quad; access_strided_idx = lane_id; } + else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kA) { + // Matrix multiply 16816|1688.TF32 A + // Q0 Q2 + // Q1 Q3 + partition_contiguous_idx = (lane_in_quad_pair >> 2); + access_contiguous_idx = (quad_quad ^ lane_in_quad); + access_strided_idx = lane_in_quad_quad; + } else if (Policy::LdsmShape::kStrided == + (Policy::LdsmShape::kCount / 2) && + kOperand == Operand::kB) { + // Matrix multiply 16816|1688.TF32 B + // Q0 Q1 + // Q2 Q3 + partition_contiguous_idx = (lane_in_quad_pair >> 2); + access_contiguous_idx = ((quad_pair & 1) ^ lane_in_quad); + access_strided_idx = lane_in_quad_pair + (lane_id >> 4 << 3); + } } int access_contiguous = @@ -1161,16 +1602,68 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + + int whole_tiles = tile_offset.contiguous() / Policy::kGroupsPerTile; + int k_groups_delta = tile_offset.contiguous() % Policy::kGroupsPerTile; + if (k_groups_delta < 0) { + whole_tiles -= 1; + k_groups_delta += Policy::kGroupsPerTile; + } + + if ((Policy::kGroupsPerTile / kPartitionsK) >= 2) { + byte_offset_ ^= (k_groups_delta & 1) * Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + if ((Policy::kGroupsPerTile / kPartitionsK) >= 4) { + byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 1)) & 2) * + Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + if ((Policy::kGroupsPerTile / kPartitionsK) == 8) { + byte_offset_ ^= ((k_groups_delta + (k_group_idx_ & 3)) & 4) * + Policy::LdsmShape::kContiguous * + sizeof_bits::value * + Layout::kElementsPerAccess / 8; + } + + k_group_idx_ += k_groups_delta; + whole_tiles += k_group_idx_ / (Policy::kGroupsPerTile / kPartitionsK); + k_group_idx_ = k_group_idx_ % (Policy::kGroupsPerTile / kPartitionsK); + + pointer_ += + tile_offset.strided() * stride_ * Shape::kStrided / Layout::kFactor + + whole_tiles * stride_ / sections_; + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { + // Integer matrix multiply 16832 Interleaved-32 + // NONE + // Integer matrix multiply 16816 Interleaved-32 || Integer matrix multiply 16816 kblock=32 + // Integer matrix multiply 8816 Interleaved-32 // ^1 ^1 + // Matrix multiply 1684.TF32 kblock=16 || Integer matrix multiply 16816 kblock=64 // Matrix multiply 1688 kblock=32 || Integer matrix multiply 8816 kblock=64 // ^1 ^3 ^1 ^3 // Matrix multiply 1688 kblock=64 // ^1 ^3 ^1 ^7 ^1 ^3 ^1 ^7 + + // Matrix multiply 16816 kblock=32 | 1688.TF32 kblock=16 || Integer matrix 
multiply 16832 kblock=64 + // ^2 ^2 + // Matrix multiply 16816 kblock=64 | 1688.TF32 kblock=32 || Integer matrix multiply 16832 kblock=128 + // ^2 ^6 ^2 ^6 + if ((Policy::kGroupsPerTile / kPartitionsK) > 1) { int mask = ((Policy::kGroupsPerTile / kPartitionsK) == 8) ? 3 @@ -1443,6 +1936,16 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + iterator_.add_tile_offset_negative({tile_offset.row(), tile_offset.column()}); + + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { @@ -1673,6 +2176,16 @@ class MmaTensorOpMultiplicandTileIterator< return *this; } + /// Advances an iterator along logical dimensions of matrix in units of whole + /// tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset_negative( + TensorCoord const &tile_offset) { + iterator_.add_tile_offset_negative({tile_offset.column(), tile_offset.row()}); + + return *this; + } + /// Advances the iterator along the advance dimension CUTLASS_HOST_DEVICE MmaTensorOpMultiplicandTileIterator &operator++() { @@ -1782,6 +2295,7 @@ class MmaTensorOpMultiplicandTileIterator< }; //////////////////////////////////////////////////////////////////////////////// + template < /// Size of the matrix to load (concept: MatrixShape) typename Shape_, @@ -2682,6 +3196,7 @@ public: }; //////////////////////////////////////////////////////////////////////////////// + } // namespace warp } // namespace gemm } // namespace cutlass diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h index 51c5ce26..ed6384f0 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h new file mode 100644 index 00000000..e43373b6 --- /dev/null +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_sm80.h @@ -0,0 +1,1579 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Defines iterators used by warp-level matrix multiply operations targeting Tensor Cores.
+*/
+
+#pragma once
+
+#include "cutlass/cutlass.h"
+
+#include "cutlass/array.h"
+#include "cutlass/numeric_types.h"
+#include "cutlass/tensor_ref.h"
+#include "cutlass/matrix_shape.h"
+
+#include "cutlass/arch/memory_sm75.h"
+#include "cutlass/gemm/gemm.h"
+
+#include "cutlass/layout/matrix.h"
+#include "cutlass/layout/tensor.h"
+#include "cutlass/layout/pitch_linear.h"
+#include "cutlass/layout/tensor_op_multiplicand_sm80.h"
+
+#include "cutlass/platform/platform.h"
+#include "cutlass/fast_math.h"
+
+#include "cutlass/gemm/warp/mma_tensor_op_tile_iterator.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace cutlass {
+namespace gemm {
+namespace warp {
+
+////////////////////////////////////////////////////////////////////////////////
+
+/// This tile iterator is specialized for loading 128b vectors of 64b elements.
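+///
+/// Each access is a 128b vector holding two 64b elements, so this
+/// specialization is typically reached through double-precision tensor-core
+/// GEMMs on SM80-class devices.
+///
+/// A minimal instantiation sketch (the shapes are hypothetical and chosen only
+/// to satisfy the divisibility requirements asserted below, not taken from a
+/// shipping kernel):
+///
+///   using WarpIterA = cutlass::gemm::warp::MmaTensorOpMultiplicandTileIterator<
+///       cutlass::layout::PitchLinearShape<32, 16>,   // warp tile: contiguous x strided
+///       cutlass::gemm::Operand::kA, double,
+///       cutlass::layout::TensorOpMultiplicandCongruous64b,
+///       cutlass::layout::PitchLinearShape<8, 4>,     // instruction tile
+///       1, /*Threads*/ 32, /*PartitionsK*/ 1>;
+///
+///   WarpIterA iter_A(ref_A_smem, lane_id);  // ref_A_smem: TensorRef into shared memory
+///   typename WarpIterA::Fragment frag_A;
+///   iter_A.load(frag_A);                    // one or more 128b loads per thread
+///   ++iter_A;                               // advance one instruction step along the strided (K) dimension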
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 16) && !(Shape::kStrided % 4), "Divisibility."); + + static_assert(sizeof_bits::value == 64, "This is specialized for 64b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<8, 4>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + Shape::kContiguous / kElementsPerAccess / Delta::kContiguous, + InstructionShape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
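+  // Illustrative arithmetic, using the hypothetical shapes sketched in the
+  // class comment above: Shape::kContiguous == 32 and
+  // InstructionShape::kStrided == 4 give Policy::Iterations of 2 x 1, so each
+  // load() issues two 128b accesses per thread.
+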
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + + /// Internal counter used to jump to next K partition + int k_group_idx_; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / Policy::Delta::kContiguous; + int access_contiguous = (lane_id % Policy::Delta::kContiguous) ^ access_strided; + + pointer_= reinterpret_cast(ref.data()) + + access_contiguous + access_strided * stride_; + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + byte_offset_ += offset * sizeof(Element); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + int offset = + (tile_offset.strided() * InstructionShape::kStrided) * stride_ * kElementsPerAccess + + tile_offset.contiguous() * Shape::kContiguous; + + add_pointer_offset(offset); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + add_tile_offset({0, 1}); + + return *this; + } + + /// Advances the iterator along the opposite of the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + add_tile_offset({0, -1}); + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
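+  ///
+  /// Equivalent to load_with_byte_offset(frag, 0): every access goes through
+  /// the single XOR-swizzled base pointer computed in the constructor.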
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c + + Policy::Delta::kStrided * s * stride_; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + + Index pointer_offset = + tile_offset.contiguous() * Shape::kContiguous / Layout::kElementsPerAccess + + tile_offset.strided() * InstructionShape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
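+  ///
+  /// For this congruous 64b layout no k-group state affects addressing, so
+  /// the call is a no-op.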
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous64b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance 
dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for 32-thread TensorOps. It uses LDSM to load from shared +/// memory and therefore must be initialized with a TensorRef to shared memory. 
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicandCongruous64b, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in 
units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. + CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// This tile iterator is specialized for loading 128b vectors of 64b elements. 
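+///
+/// In the crosswise arrangement the contiguous dimension of shared memory
+/// corresponds to K, and consecutive k-groups are stored in a permuted order.
+/// The iterator therefore tracks a k-group index: crossing a k-group toggles
+/// the byte offset (the 0x40 XOR below) and, for odd k-groups, the two 64b
+/// halves of each 128b access are swapped after loading.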
+/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: PitchLinearShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: PitchLinearShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::TensorOpMultiplicand64bCrosswise, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + static_assert(!(Shape::kContiguous % 4) && !(Shape::kStrided % 16), "Divisibility."); + + static_assert(sizeof_bits::value == 64, "This is specialized for 64b accesses."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::TensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: GemmShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// Number of partitions along K dimension + static int const kPartitionsK = PartitionsK_; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Load two elements per access + static int const kElementsPerAccess = 2; + + /// Policy defining internal details of tile iterator + struct Policy { + + /// Shape of one access + using Delta = layout::PitchLinearShape<4, 16>; + + /// Number of iterations to load + using Iterations = layout::PitchLinearShape< + InstructionShape::kContiguous / Delta::kContiguous, + Shape::kStrided / Delta::kStrided + >; + + }; + +private: + + /// Not working on this feature at the moment. 
+ static_assert(kOpDelta == 1, + "Alternative arrangements not supported at present."); + + /// Pointer type used for accesses + using AccessType = AlignedArray; + +public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = + Array; + +private: + + /// Layout object storing stride values + Index stride_; + + /// Shared memory base pointers - not advanced + AccessType const *pointer_; + + /// Byte offset incremented as iterator advances + Index byte_offset_; + + /// Internal counter for tracking K-group + Index k_group_idx_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator(): stride_(0), byte_offset_(0) { } + + /// Constructor from TensorRef + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): + stride_(ref.stride(0) / kElementsPerAccess), byte_offset_(0), + k_group_idx_(0) { + + int access_strided = lane_id / 8; + int access_contiguous = (lane_id % 8); + + byte_offset_ = (access_contiguous + access_strided * stride_) * sizeof(AccessType); + + pointer_= reinterpret_cast(ref.data()); + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + pointer_ += offset / kElementsPerAccess; + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + int offset = (tile_offset.contiguous() * InstructionShape::kContiguous) * + stride_ * kElementsPerAccess + + tile_offset.strided() * Shape::kStrided; + + add_pointer_offset(offset); + + int old_k_group_idx = k_group_idx_; + + k_group_idx_ += tile_offset.contiguous(); + + if ((k_group_idx_ & 2) ^ (old_k_group_idx & 2)) { + byte_offset_ ^= 0x40; + } + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + pointer_ += stride_ * InstructionShape::kContiguous; + + if (k_group_idx_ & 0x1) { + // xor ptr + byte_offset_ ^= 0x40; + } + + ++k_group_idx_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(tile_offset); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. 
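+  ///
+  /// Equivalent to load_with_byte_offset(frag, 0).  When the current k-group
+  /// index is odd, the two 64b elements of every 128b access are exchanged
+  /// after loading (the 64b-granularity swap below).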
+ CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + load_with_byte_offset(frag, 0); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset in units of bytes + Index byte_offset) const { + + AccessType *fetch_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < Policy::Iterations::kContiguous; ++c) { + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < Policy::Iterations::kStrided; ++s) { + + int access_idx = c + s * Policy::Iterations::kContiguous; + + AccessType const *source_ptr = pointer_ + + Policy::Delta::kContiguous * c * stride_ + + Policy::Delta::kStrided * s / kElementsPerAccess; + + char const *source_byte_ptr = reinterpret_cast(source_ptr) + byte_offset + byte_offset_; + + AccessType const *source = reinterpret_cast(source_byte_ptr); + + fetch_ptr[access_idx] = *source; + } + } + + Element *exchange_ptr = reinterpret_cast(&frag); + + if (k_group_idx_ & 1) { + // exchange on 64b granularity + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < Fragment::kElements; i += 2) { + Element tmp = exchange_ptr[i]; + exchange_ptr[i] = exchange_ptr[i + 1]; + exchange_ptr[i + 1] = tmp; + } + } + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + + load_with_byte_offset(frag, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + + load_with_byte_offset(frag, tile_offset, 0); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + + load_with_byte_offset(frag, tile_offset, pointer_offset * sizeof(Element)); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + Index pointer_offset = tile_offset.contiguous() * + InstructionShape::kContiguous / + Layout::kElementsPerAccess + + tile_offset.strided() * Shape::kStrided * stride_; + + byte_offset += sizeof(AccessType) * pointer_offset; + + load_with_byte_offset(frag, byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
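+  ///
+  /// Unlike the congruous 64b specialization, this iterator does consult the
+  /// k-group index (for the XOR'd byte offset and the 64b exchange), so
+  /// overriding it with a known constant helps the compiler fold the
+  /// corresponding branches.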
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + k_group_idx_ = k_group; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicand64bCrosswise, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.column(), tile_offset.row()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// Advances the iterator 
along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.column(), tile_offset.row())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.strided(), tile_offset.contiguous()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// +/// +/// Satisfies: +/// ReadableRandomAccessContiguousTileIteratorConcept +/// +template < + /// Size of the matrix to load (concept: MatrixShape) + typename Shape_, + /// Identifies A or B multiplicand + Operand Operand_, + /// Data type of elements + typename Element_, + /// Shape of one matrix product operation (concept: MatrixShape) + typename InstructionShape_, + /// Interval between adjacent *MMA instructions (in units of MMA + /// instructions) + int OpDelta_, + /// Number of partitions along K dimension + int PartitionsK_> +class MmaTensorOpMultiplicandTileIterator< + Shape_, Operand_, Element_, + cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise, + InstructionShape_, OpDelta_, 32, PartitionsK_> { + public: + + /// Shape of tile to load (concept: PitchLinearShape) + using Shape = Shape_; + + /// Operand tag + static Operand const kOperand = Operand_; + + static_assert(kOperand == Operand::kA || kOperand== Operand::kB, + "MmaTensorOpMultiplicandIterator may only be instantiated for A or B operands to warp-level Mma."); + + /// Element type + using Element = Element_; + + /// Layout of source tile + using Layout = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + /// Shape of one matrix product operation (concept: MatrixShape) + using InstructionShape = InstructionShape_; + + /// Delta between *MMA operations (in units of *MMA operations, concept: MatrixShape) + static int const kOpDelta = OpDelta_; + + /// Number of participating threads + static int const kThreads = 32; + + /// TensorRef type for loading element from a tensor + using TensorRef = TensorRef; + + /// Index type + using Index = typename TensorRef::Index; + + /// Long Index type + using LongIndex = typename TensorRef::LongIndex; + + /// Coordinate for an element in the tensor + using TensorCoord = typename TensorRef::TensorCoord; + + /// Underlying tile iterator implementation + using Base = MmaTensorOpMultiplicandTileIterator< + layout::PitchLinearShape, kOperand, Element, + layout::TensorOpMultiplicand64bCrosswise, + layout::PitchLinearShape, + kOpDelta, kThreads, PartitionsK_>; + + public: + + // + // Derived quantities + // + + /// Fragment object holding a thread's part of a tile + using Fragment = typename Base::Fragment; + +private: + + /// Underlying tile iterator + Base iterator_; + +public: + + /// Default ctor constructs null iterator + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator() { } + + /// Constructor from TensorRef + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator( + TensorRef const &ref, + int lane_id + ): iterator_({ref.data(), ref.stride()}, lane_id) { + } + + /// Adds a pointer offset to internal pointer(s) to advance through memory + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_pointer_offset(LongIndex offset) { + + iterator_.add_pointer_offset(offset); + + return *this; + } + + /// Advances an iterator along logical dimensions of matrix in units of whole tiles + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator &add_tile_offset(TensorCoord const &tile_offset) { + + iterator_.add_tile_offset({tile_offset.row(), tile_offset.column()}); + + return *this; + } + + /// Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator++() { + + ++iterator_; + + return *this; + } + + /// 
Advances the iterator along the advance dimension + CUTLASS_HOST_DEVICE + MmaTensorOpMultiplicandTileIterator & operator--() { + + --iterator_; + + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator+=(TensorCoord const &tile_offset) { + add_tile_offset(PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + ///< advances in units of whole tiles along the logical coordinate space of the tensor + CUTLASS_DEVICE + MmaTensorOpMultiplicandTileIterator & operator-=(TensorCoord const &tile_offset) { + add_tile_offset(-PitchLinearCoord(tile_offset.row(), tile_offset.column())); + return *this; + } + + /// Loads a fragment from memory at the location pointed to by the iterator. + CUTLASS_HOST_DEVICE + void load(Fragment &frag) const { + + iterator_.load(frag); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_pointer_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index pointer_offset) const { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory with additional logical offset + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a linear offset + Index byte_offset) const { + iterator_.load_with_byte_offset(frag, byte_offset); + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index pointer_offset) const { + // TODO + } + + /// Loads a fragment from memory with logical offset in units of whole tiles. + CUTLASS_DEVICE + void load_with_byte_offset( + /// fragment to load from the tensor + Fragment &frag, + /// loads a tile with a logical offset in units of whole tiles + TensorCoord const &tile_offset, + /// loads a tile with a logical offset AND a pointer offset + Index byte_offset) const { + iterator_.load_with_byte_offset( + frag, + {tile_offset.contiguous(), tile_offset.strided()}, + byte_offset); + } + + /// Notify the iterator which k-group it is currently pointing to. + /// + /// This does not advance the iterator. Rather, it overrides its internal + /// tracking with constant-valued k-group index to enable the compiler to + /// fold constants and achieve more efficient code. + /// + /// This is used by some nontrivial permuted layouts. 
+ CUTLASS_DEVICE + void set_kgroup_index(int k_group) { + iterator_.set_kgroup_index(k_group); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace warp +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h index 0caf6247..64be6556 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_tile_iterator_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h index fe69867e..824e207d 100644 --- a/include/cutlass/gemm/warp/mma_tensor_op_wmma.h +++ b/include/cutlass/gemm/warp/mma_tensor_op_wmma.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -40,6 +40,8 @@ #include "cutlass/arch/memory_sm75.h" #include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/mma_sm80.h" + #include "cutlass/gemm/gemm.h" #include "cutlass/gemm/warp/mma.h" @@ -75,8 +77,6 @@ template < typename Policy_, ///< Number of partitions along K dimension int PartitionsK_ = 1, - ///< Number of partitions along N dimension - int PartitionsN_ = 1, ///< Used for partial specialization typename Enable = bool > @@ -106,6 +106,9 @@ public: /// Shape of the warp in units of thread (concept: MmaTensorOpPolicy) using Policy = Policy_; + /// Underlying instruction shape + using InstructionShape = typename Policy::Operator::Shape; + /// Underlying architecture tag using ArchTag = typename Policy::Operator::ArchTag; @@ -116,7 +119,7 @@ public: static ComplexTransform const kTransformB = ComplexTransform::kNone; /// Indicates class of matrix operator - using OperatorClass = arch::OpClassTensorOp; + using OperatorClass = arch::OpClassWmmaTensorOp; /// Number of threads participating in warp-level matrix product static int const kThreadCount = 32; @@ -124,9 +127,6 @@ public: /// Number of partitions along K dimension static int const kPartitionsK = PartitionsK_; - /// PartitionsN indicating how many PartitionsN for multiplicand B - static int const kPartitionsN = PartitionsN_; - public: /// Iterates over the A operand in memory @@ -163,9 +163,7 @@ private: /// Number of wmma operations performed using WmmaIterations = MatrixShape< Shape::kM / Policy::Operator::Shape::kM, - (Shape::kN / Policy::Operator::Shape::kN / kPartitionsN > 0) ? 
- Shape::kN / Policy::Operator::Shape::kN / kPartitionsN : - 1 + Shape::kN / Policy::Operator::Shape::kN >; public: @@ -189,8 +187,7 @@ public: FragmentC &D, FragmentA const &A, FragmentB const &B, - FragmentC const &C, - int const &partitionN_idx = 0) const { + FragmentC const &C) const { CUTLASS_PRAGMA_UNROLL for (int n = 0; n < WmmaIterations::kColumn; ++n) { diff --git a/include/cutlass/half.h b/include/cutlass/half.h index 8ac08722..10d00de1 100644 --- a/include/cutlass/half.h +++ b/include/cutlass/half.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/integer_subbyte.h b/include/cutlass/integer_subbyte.h index f6951769..6b97f822 100644 --- a/include/cutlass/integer_subbyte.h +++ b/include/cutlass/integer_subbyte.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/kernel_launch.h b/include/cutlass/kernel_launch.h index b48fd7d0..bd84a357 100644 --- a/include/cutlass/kernel_launch.h +++ b/include/cutlass/kernel_launch.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/layout.h b/include/cutlass/layout/layout.h index ba540e77..775357d1 100644 --- a/include/cutlass/layout/layout.h +++ b/include/cutlass/layout/layout.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/matrix.h b/include/cutlass/layout/matrix.h index 2ab907a5..7c02f8f2 100644 --- a/include/cutlass/layout/matrix.h +++ b/include/cutlass/layout/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/pitch_linear.h b/include/cutlass/layout/pitch_linear.h index 987c2bb8..a6158b32 100644 --- a/include/cutlass/layout/pitch_linear.h +++ b/include/cutlass/layout/pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor.h b/include/cutlass/layout/tensor.h index 2ef4e9d2..20d5bad7 100644 --- a/include/cutlass/layout/tensor.h +++ b/include/cutlass/layout/tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm70.h b/include/cutlass/layout/tensor_op_multiplicand_sm70.h index 26bd427e..03f87db3 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm70.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm75.h b/include/cutlass/layout/tensor_op_multiplicand_sm75.h index b4b35667..00870fb5 100644 --- a/include/cutlass/layout/tensor_op_multiplicand_sm75.h +++ b/include/cutlass/layout/tensor_op_multiplicand_sm75.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/layout/tensor_op_multiplicand_sm80.h b/include/cutlass/layout/tensor_op_multiplicand_sm80.h new file mode 100644 index 00000000..e5963a2a --- /dev/null +++ b/include/cutlass/layout/tensor_op_multiplicand_sm80.h @@ -0,0 +1,1133 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace layout { + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicandCongruous64b { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 64; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. + Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous64b(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous64b(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return TensorOpMultiplicandCongruous64b(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + int tc = coord.contiguous() / 16; + int ts = coord.strided() / 4; + + int c = coord.contiguous() % 16; + int s = coord.strided() % 4; + + + int bank = ((((c & 1) * 4 + (c & 6) / 2)) ^ (s & 1)) * 2 + (c / 8); + int row = (c & 6) / 2; + + bank ^= ((s & 2) * 2); + + LongIndex offset = tc * 16 + bank + (ts * 4 + row) * stride_[0]; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } + + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + return TensorCoord(); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCongruous64b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous64b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCongruous64b(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.contiguous(), coord.strided()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a row-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCongruous64b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous64b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous64b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous64b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCongruous64b packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCongruous64b(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.strided(), coord.contiguous()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). 
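Whether congruous (above) or crosswise (below), each of these layouts is a pure index-mapping functor: construct it from a leading dimension or via packed(), then apply it to a pitch-linear coordinate to obtain the swizzled offset in elements. A brief usage sketch; the extent and coordinate are illustrative only:

  using Layout = cutlass::layout::TensorOpMultiplicandCongruous64b;

  // Tightly packed tile of 64 x 8 64-bit elements.
  Layout layout = Layout::packed(cutlass::layout::PitchLinearCoord(64, 8));

  // Swizzled element offset of logical coordinate (contiguous = 17, strided = 3).
  Layout::LongIndex offset = layout(cutlass::layout::PitchLinearCoord(17, 3));

The swizzle exists so that warp-wide shared-memory accesses spread across banks; TensorRef and the warp-level tile iterators earlier in this patch accept these structs wherever a Layout template parameter is expected.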
+struct TensorOpMultiplicand64bCrosswise { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 64; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. + Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicand64bCrosswise(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicand64bCrosswise(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return TensorOpMultiplicand64bCrosswise(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + int tc = coord.contiguous() / 16; + int ts = coord.strided() / 16; + + int c = coord.contiguous() % 16; + int s = coord.strided() % 16; + + int k_group = c / 4; + int access_s = s / 2; + + int row = access_s % 4; + int bank = ((k_group & 2) << 2) ^ ((s % 2) << 3) + (c % 4) * 2 + (access_s / 4) ^ (k_group & 1); + + int smem_row = (k_group * 4 + row) + tc * 16; + int smem_col = ts * 16 + bank; + + LongIndex offset = smem_row * stride_[0] + smem_col; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct ColumnMajorTensorOpMultiplicand64bCrosswise { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicand64bCrosswise; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicand64bCrosswise(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct RowMajorTensorOpMultiplicand64bCrosswise { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicand64bCrosswise; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicand64bCrosswise(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicand64bCrosswise(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicand64bCrosswise packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicand64bCrosswise(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). +struct TensorOpMultiplicandCongruous128b { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 128; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. 
+ Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous128b(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCongruous128b(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return TensorOpMultiplicandCongruous128b(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + Index tc = coord.contiguous() / 8; + Index ts = coord.strided() / 4; + + Index c = coord.contiguous() % 8; + Index s = coord.strided() % 4; + + Index k_index = (c / 2); + + Index bank = (((c & 1) * 4) | (s ^ k_index)); + + LongIndex offset = tc * 8 + bank + (ts * 4 + k_index) * stride_[0]; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + return TensorCoord(); + } +}; + + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCongruous128b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous128b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCongruous128b(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.contiguous(), coord.strided()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a row-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCongruous128b { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCongruous128b; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous128b(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCongruous128b(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCongruous128b packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCongruous128b(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Inverse of layout function, mapping linear offset to logical coordinate + CUTLASS_HOST_DEVICE + TensorCoord inverse(LongIndex offset) const { + PitchLinearCoord coord = layout_.inverse(offset); + return MatrixCoord(coord.strided(), coord.contiguous()); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template based on element size (in bits) - defined in terms of pitch-linear +/// memory and Crosswise size (in elements). 
+struct TensorOpMultiplicandCrosswise128x4 { + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = PitchLinearCoord; + + /// Stride vector + using Stride = Coord; + + // + // Static constants + // + + static int const kElementSize = 128; + static int const kElementsPerAccess = 1; + + private: + + // + // Data members + // + + /// Stride data member. + Stride stride_; + + public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCrosswise128x4(Index ldm = 0) : stride_(ldm) {} + + /// Ctor + CUTLASS_HOST_DEVICE + TensorOpMultiplicandCrosswise128x4(Stride stride) : stride_(stride) {} + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static TensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return TensorOpMultiplicandCrosswise128x4(extent[0]); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + + Index tc = coord.contiguous() / 8; + Index ts = coord.strided() / 8; + + Index c = coord.contiguous() % 8; + Index s = coord.strided() % 8; + + Index liq = c % 4; + + Index bank = liq + ((s & 1) * 4) ^ (c & 4); + + Index k_index = (c & 4) + (s / 4) * 2 + ((s & 2) / 2); + + LongIndex offset = (tc * 8 + k_index) * stride_[0] + ts * 8 + bank; + + return offset; + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { return stride_; } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride &stride() { return stride_; } + + /// Compute the number of contiguous elements needed to store a tensor with + /// the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return extent[1] * stride_[0]; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a column-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct ColumnMajorTensorOpMultiplicandCrosswise128x4 { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCrosswise128x4; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + ColumnMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static ColumnMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return ColumnMajorTensorOpMultiplicandCrosswise128x4(extent.column()); + } + + /// Returns the offset of a coordinate in linear memory. 
+ /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.row(), coord.column())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.row(), extent.column())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Template mapping a row-major view of pitch-linear memory to +/// TensorOpMultiplicand +struct RowMajorTensorOpMultiplicandCrosswise128x4 { + + /// Logical rank of tensor + static int const kRank = 2; + + /// Rank of stride vector + static int const kStrideRank = 1; + + /// Index type used for coordinates + using Index = int32_t; + + /// Long index type used for offsets + using LongIndex = int64_t; + + /// Logical coordinate + using TensorCoord = MatrixCoord; + + /// Stride vector + using Stride = Coord; + + // + // Invariants + // + + using Base = TensorOpMultiplicandCrosswise128x4; + +private: + + // + // Data members + // + + Base layout_; + +public: + // + // Methods + // + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCrosswise128x4(Index ldm = 0): layout_(ldm) { } + + /// Ctor + CUTLASS_HOST_DEVICE + RowMajorTensorOpMultiplicandCrosswise128x4(Stride stride): layout_(stride) { } + + /// Helper returns a layout to a tightly packed tensor + CUTLASS_HOST_DEVICE + static RowMajorTensorOpMultiplicandCrosswise128x4 packed(TensorCoord const &extent) { + return RowMajorTensorOpMultiplicandCrosswise128x4(extent.row()); + } + + /// Returns the offset of a coordinate in linear memory. + /// Assumes coordinate has convention (contiguous, strided) + CUTLASS_HOST_DEVICE + LongIndex operator()(TensorCoord const &coord) const { + return layout_(PitchLinearCoord(coord.column(), coord.row())); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride stride() const { + return layout_.stride(); + } + + /// Returns the stride of the layout + CUTLASS_HOST_DEVICE + Stride & stride() { + return layout_.stride(); + } + + /// Compute the number of contiguous elements needed to store a tensor with the given size + CUTLASS_HOST_DEVICE + LongIndex capacity(TensorCoord const &extent) const { + return layout_.capacity(PitchLinearCoord(extent.column(), extent.row())); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace layout +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/layout/vector.h b/include/cutlass/layout/vector.h index 0700e587..b54b6b3b 100644 --- a/include/cutlass/layout/vector.h +++ b/include/cutlass/layout/vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_coord.h b/include/cutlass/matrix_coord.h index 8ba61a5e..b432665e 100644 --- a/include/cutlass/matrix_coord.h +++ b/include/cutlass/matrix_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_shape.h b/include/cutlass/matrix_shape.h index 1d0b4820..cb3118c2 100644 --- a/include/cutlass/matrix_shape.h +++ b/include/cutlass/matrix_shape.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/matrix_traits.h b/include/cutlass/matrix_traits.h index 8e7fe330..cf7002a4 100644 --- a/include/cutlass/matrix_traits.h +++ b/include/cutlass/matrix_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/numeric_conversion.h b/include/cutlass/numeric_conversion.h index ef4604cb..78181ce7 100644 --- a/include/cutlass/numeric_conversion.h +++ b/include/cutlass/numeric_conversion.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -45,7 +45,9 @@ enum class FloatRoundStyle { round_toward_zero, ///< round toward zero round_to_nearest, ///< round to nearest even round_toward_infinity, ///< round toward infinity - round_toward_neg_infinity ///< round toward negative infinity + round_toward_neg_infinity, ///< round toward negative infinity + round_half_ulp_truncate, ///< add 0.5ulp to integer representation then round toward zero + round_half_ulp_trunc_dntz ///< like round_half_ulp_truncate, except denorms are rounded *toward* zero }; ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -240,6 +242,232 @@ struct NumericConverter { } }; +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partial specializations for float <=> bfloat16_t +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float <= bfloat16_t +template +struct NumericConverter { + + using result_type = float; + using source_type = bfloat16_t; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + return static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + return static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + uint32_t x32 = reinterpret_cast(s); + + #if defined(__CUDA_ARCH__) + if (::isfinite(s)) { + x32 += 0x8000; + } + #else + if (std::isfinite(s)) { + x32 += 0x8000; + } + #endif + + uint16_t x16 = uint16_t((x32 >> 16) & 0xffff); + return bfloat16_t::bitcast(x16); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = bfloat16_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + uint32_t x32 = reinterpret_cast(s); + uint16_t x16 = uint16_t(x32 >> 16); + + return bfloat16_t::bitcast(x16); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Partial specializations for float <=> tfloat32_t +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for float <= tfloat32_t +template +struct NumericConverter { + + using result_type = float; + using source_type = tfloat32_t; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { 
+ + return static_cast(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + unsigned storage = reinterpret_cast(s); + + if ((storage & 0x7f800000) != 0x7f800000) { + + bool mantissa_bit = ((storage & (1 << 13)) != 0); + bool round_bit = ((storage & (1 << 12)) != 0); + bool sticky_bit = ((storage & ((1 << 12) - 1)) != 0); + + if ((round_bit && sticky_bit) || (round_bit && mantissa_bit)) { + storage += uint32_t(1 << 13); + } + + // Note, the following is intentionally commented out. TF32 + // does not define the low order bits, so they may be left in + // an undefined state. + // + // By not truncating these bit explicitly, we avoid an extra logical + // operation. + // + // TF32 may be implicitly converted to float by performing this + // operation as needed. + // + // storage = (storage & ~0x1fff); + } + else if (storage & ~0xff800000) { + storage = 0x7fffffff; + } + + return tfloat32_t::bitcast(storage); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_truncate; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + return tfloat32_t::round_half_ulp_truncate(s); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +/// This rounding operation is similar to half_ulp_truncate except it rounds denorms toward zero. +/// It avoids predicated code, though it requires a temporary register. 
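Concretely: y masks the input down to its sign and exponent, so d is plus or minus 2^e for an input in the binade [2^e, 2^(e+1)). Dividing by 2^11 gives 2^(e-11), which is half a tfloat32 ulp (tfloat32 keeps 10 explicit mantissa bits, so its ulp at that binade is 2^(e-10)). Adding that before the low 13 bits are dropped rounds the magnitude up across the halfway point; for a denormal input the exponent field is zero, d is plus or minus 0, nothing is added, and truncation rounds toward zero. A short usage sketch of the specialization that follows, with the sample values chosen only for illustration:

  cutlass::NumericConverter<
    cutlass::tfloat32_t,
    float,
    cutlass::FloatRoundStyle::round_half_ulp_trunc_dntz> convert_dntz;

  // 1 + 1.5/1024 sits exactly halfway between two tfloat32 values and rounds up
  // to 1 + 2/1024; a denormal such as 1e-42f is truncated toward zero instead.
  cutlass::tfloat32_t t = convert_dntz(1.0f + 1.5f / 1024.0f);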
+template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_half_ulp_trunc_dntz; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + + unsigned y = reinterpret_cast(s); + y = y & 0xff800000; + float d = reinterpret_cast(y); + float z = d / float(1 << 11) + s; + + return reinterpret_cast(z); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +template <> +struct NumericConverter { + using result_type = tfloat32_t; + using source_type = float; + static FloatRoundStyle const round_style = FloatRoundStyle::round_toward_zero; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & s) { + uint32_t x = reinterpret_cast(s); + return tfloat32_t::bitcast(x & 0xffffe000); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// // // Conversion and Clamp operator for Integers @@ -518,6 +746,77 @@ struct NumericArrayConverter { ///////////////////////////////////////////////////////////////////////////////////////////////// +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Array <= Array, round to nearest +template <> +struct NumericArrayConverter { + + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = FloatRoundStyle::round_to_nearest; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & source) { + + unsigned d; + + asm("cvt.rn.bf16x2.f32 %0, %1, %2;\n" : "=r"(d) : "f"(source[1]), "f"(source[0]) ); + + return reinterpret_cast(d); + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Partial specialization for Array <= Array +template < + int N, + FloatRoundStyle Round +> +struct NumericArrayConverter { + + using result_type = Array; + using source_type = Array; + static FloatRoundStyle const round_style = Round; + + CUTLASS_HOST_DEVICE + static result_type convert(source_type const & source) { + + NumericArrayConverter convert_vector_; + NumericConverter convert_element_; + + result_type result; + + Array *result_ptr = reinterpret_cast *>(&result); + Array const *source_ptr = reinterpret_cast const *>(&source); + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < N / 2; ++i) { + result_ptr[i] = convert_vector_(source_ptr[i]); + } + + if (N % 2) { + result[N - 1] = convert_element_(source[N - 1]); + } + + return result; + } + + CUTLASS_HOST_DEVICE + result_type operator()(source_type const &s) { + return convert(s); + } +}; + +#endif // if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) + ///////////////////////////////////////////////////////////////////////////////////////////////// // Conditional guards to enable partial specialization for packed integers @@ -843,6 +1142,12 @@ struct PreferredRoundingMode { static FloatRoundStyle const kRound = FloatRoundStyle::round_to_nearest; }; +/// Defines preferred rounding mode for a pair of types +template <> +struct PreferredRoundingMode { + static FloatRoundStyle const kRound = FloatRoundStyle::round_half_ulp_truncate; +}; + 
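These specializations are invoked like any other NumericConverter / NumericArrayConverter. A minimal sketch, assuming an 8-element fragment and arbitrary sample values:

  #include "cutlass/array.h"
  #include "cutlass/numeric_conversion.h"

  cutlass::Array<float, 8> src;
  src.fill(1.25f);

  // Vectorized float -> bfloat16_t conversion; when compiled for __CUDA_ARCH__ >= 800
  // the two-element specialization above lowers to cvt.rn.bf16x2.f32.
  cutlass::NumericArrayConverter<
    cutlass::bfloat16_t, float, 8,
    cutlass::FloatRoundStyle::round_to_nearest> convert_bf16;

  cutlass::Array<cutlass::bfloat16_t, 8> dst = convert_bf16(src);

  // Scalar float -> tfloat32_t conversion using the rounding mode registered
  // immediately above (round_half_ulp_truncate).
  cutlass::NumericConverter<
    cutlass::tfloat32_t, float,
    cutlass::PreferredRoundingMode<cutlass::tfloat32_t, float>::kRound> convert_tf32;

  cutlass::tfloat32_t t = convert_tf32(3.14159f);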
///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace cutlass diff --git a/include/cutlass/numeric_types.h b/include/cutlass/numeric_types.h index 2282e43e..9479ccb0 100644 --- a/include/cutlass/numeric_types.h +++ b/include/cutlass/numeric_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,6 +69,10 @@ struct sizeof_bits { ///////////////////////////////////////////////////////////////////////////////////////////////// #include "cutlass/integer_subbyte.h" + #include "cutlass/half.h" +#include "cutlass/bfloat16.h" +#include "cutlass/tfloat32.h" ///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/include/cutlass/platform/platform.h b/include/cutlass/platform/platform.h index 36d290bb..826b3977 100644 --- a/include/cutlass/platform/platform.h +++ b/include/cutlass/platform/platform.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/predicate_vector.h b/include/cutlass/predicate_vector.h index ac4f0278..92936962 100644 --- a/include/cutlass/predicate_vector.h +++ b/include/cutlass/predicate_vector.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/real.h b/include/cutlass/real.h index 8fa4d710..45ab1864 100644 --- a/include/cutlass/real.h +++ b/include/cutlass/real.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ template struct RealType { using Type = T; +CUTLASS_HOST_DEVICE static T from_real(double x) { return static_cast(x); } diff --git a/include/cutlass/reduction/batched_reduction.h b/include/cutlass/reduction/batched_reduction.h index 83324ec0..16132a02 100644 --- a/include/cutlass/reduction/batched_reduction.h +++ b/include/cutlass/reduction/batched_reduction.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/batched_reduction_traits.h b/include/cutlass/reduction/batched_reduction_traits.h index c44238e1..46157dc7 100644 --- a/include/cutlass/reduction/batched_reduction_traits.h +++ b/include/cutlass/reduction/batched_reduction_traits.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/device/reduce_split_k.h b/include/cutlass/reduction/device/reduce_split_k.h new file mode 100644 index 00000000..e3626f88 --- /dev/null +++ b/include/cutlass/reduction/device/reduce_split_k.h @@ -0,0 +1,215 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Kernel performing a reduction over densely packed tensors in global memory +*/ + +#pragma once + +#include "cutlass/device_kernel.h" +#include "cutlass/reduction/kernel/reduce_split_k.h" +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reduction { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ReductionKernel_ +> +class ReduceSplitK { +public: + using ReductionKernel = ReductionKernel_; + + using Shape = typename ReductionKernel::Shape; + using ReductionOp = typename ReductionKernel::ReductionOp; + using OutputOp = typename ReductionKernel::OutputOp; + + using ElementWorkspace = typename ReductionKernel::ElementWorkspace; + using ElementAccumulator = typename ReductionKernel::ElementAccumulator; + using ElementOutput = typename ReductionKernel::ElementOutput; + + using WorkspaceTensorRef = typename ReductionKernel::WorkspaceTensorRef; + using OutputTensorRef = typename ReductionKernel::OutputTensorRef; + + /// Argument structure + struct Arguments { + + // + // Data members + // + + MatrixCoord problem_size; + int partitions; + size_t partition_stride; + WorkspaceTensorRef workspace; + OutputTensorRef destination; + OutputTensorRef source; + typename OutputOp::Params output; + typename ReductionOp::Params reduction; + + // + // Methods + // + + /// Default ctor + CUTLASS_HOST_DEVICE + Arguments() : + problem_size(0, 0), + partitions(1), + partition_stride(0) { } + + CUTLASS_HOST_DEVICE + Arguments( + MatrixCoord const & problem_size + ): + problem_size(problem_size) { } + + CUTLASS_HOST_DEVICE + Arguments( + MatrixCoord problem_size_, + int partitions_, + size_t partition_stride_, + WorkspaceTensorRef workspace_, + OutputTensorRef destination_, + OutputTensorRef source_, + typename OutputOp::Params output_ = typename OutputOp::Params(), + typename ReductionOp::Params reduction_ = typename ReductionOp::Params() + ): + problem_size(problem_size_), + partitions(partitions_), + partition_stride(partition_stride_), + workspace(workspace_), + destination(destination_), + source(source_), + output(output_), + reduction(reduction_) + { + + } + + }; + +private: + /// Kernel parameters object + typename ReductionKernel::Params params_; + +public: + /// Constructs Reduction SplitK + ReduceSplitK() { } + + /// Determines whether the ReduceSplitK can execute the given problem. + static Status can_implement(Arguments const &args) { + + return Status::kSuccess; + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const &args) { + // needs no additional workspace + return 0; + } + + /// Initializes Reduction state from arguments. + Status initialize( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + // initialize the params structure from the arguments + params_ = typename ReductionKernel::Params( + args.problem_size, + args.partitions, + args.partition_stride, + args.workspace, + args.destination, + args.source, + args.output, + args.reduction + ); + + return Status::kSuccess; + + } + + /// Initializes Reduction kernel state from arguments. 
+ Status update(Arguments const &args, void *workspace = nullptr) { + + // update the params structure from the arguments + params_.workspace.reset(args.workspace.non_const_ref().data()); + params_.destination.reset(args.destination.non_const_ref().data()); + params_.source.reset(args.source.non_const_ref().data()); + params_.output = args.output; + params_.reduction = args.reduction; + + return Status::kSuccess; + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + // + // Launch reduction kernel + // + dim3 block = ReductionKernel::block_shape(); + dim3 grid = ReductionKernel::grid_shape(params_.problem_size); + + Kernel<<< grid, block, 0, stream >>>(params_); + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { + return run(stream); + } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } + +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace reduction +} // namespace cutlass diff --git a/include/cutlass/reduction/kernel/reduce_split_k.h b/include/cutlass/reduction/kernel/reduce_split_k.h index 1869102f..586c90d8 100644 --- a/include/cutlass/reduction/kernel/reduce_split_k.h +++ b/include/cutlass/reduction/kernel/reduce_split_k.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -129,8 +129,8 @@ public: cutlass::MatrixCoord problem_size) { return dim3( - (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn, - (problem_size.row() + Shape::kRow -1) / Shape::kRow); + (problem_size.row() + Shape::kRow - 1) / Shape::kRow, + (problem_size.column() + Shape::kColumn - 1) / Shape::kColumn); } /// Determines the threadblock shape @@ -145,8 +145,8 @@ public: // Determine CTA position MatrixCoord thread_offset( - int(blockIdx.y) * Shape::kRow + threadIdx.y, - int(blockIdx.x) * Shape::kColumn + threadIdx.x * kElementsPerAccess + int(blockIdx.x) * Shape::kRow + threadIdx.y, + int(blockIdx.y) * Shape::kColumn + threadIdx.x * kElementsPerAccess ); // One guard conditional diff --git a/include/cutlass/reduction/thread/reduce.h b/include/cutlass/reduction/thread/reduce.h index ae03c821..698b174f 100644 --- a/include/cutlass/reduction/thread/reduce.h +++ b/include/cutlass/reduction/thread/reduce.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
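
The device-level ReduceSplitK wrapper above follows the same Arguments / initialize / run pattern as the other device operators, and the accompanying kernel change makes blockIdx.x walk row tiles and blockIdx.y walk column tiles of the output. A hedged host-side sketch of driving the reduction; the kernel instantiation, tensor refs, and scalar values below are assumptions chosen for illustration, not names defined by this patch:

#include "cutlass/cutlass.h"
#include "cutlass/matrix_coord.h"
#include "cutlass/reduction/device/reduce_split_k.h"

// Hypothetical helper: ReductionKernel_ stands for whichever
// cutlass::reduction::kernel::ReduceSplitK instantiation the GEMM selected.
template <typename ReductionKernel_>
cutlass::Status reduce_partials(
    cutlass::MatrixCoord problem_size,                        // extent of the output (rows, columns)
    int split_k_slices,                                       // number of partial-sum partitions
    size_t partition_stride,                                  // elements between partitions in the workspace
    typename ReductionKernel_::WorkspaceTensorRef workspace,  // partial sums produced by the GEMM
    typename ReductionKernel_::OutputTensorRef destination,   // final output D
    typename ReductionKernel_::OutputTensorRef source,        // source operand C
    cudaStream_t stream = nullptr) {

  using ReductionDevice = cutlass::reduction::device::ReduceSplitK<ReductionKernel_>;

  typename ReductionDevice::Arguments args(
      problem_size, split_k_slices, partition_stride,
      workspace, destination, source);                        // default OutputOp / ReductionOp params

  ReductionDevice reduction_op;
  cutlass::Status status = reduction_op.initialize(args);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }
  return reduction_op(stream);  // launches the reduction kernel over a grid covering problem_size
}
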
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/thread/reduction_operators.h b/include/cutlass/reduction/thread/reduction_operators.h index 3eed6209..6f9aeb6f 100644 --- a/include/cutlass/reduction/thread/reduction_operators.h +++ b/include/cutlass/reduction/thread/reduction_operators.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/reduction/threadblock_swizzle.h b/include/cutlass/reduction/threadblock_swizzle.h index 6e42cada..2419cdf6 100644 --- a/include/cutlass/reduction/threadblock_swizzle.h +++ b/include/cutlass/reduction/threadblock_swizzle.h @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/relatively_equal.h b/include/cutlass/relatively_equal.h index cb6d68ca..5714fbd2 100644 --- a/include/cutlass/relatively_equal.h +++ b/include/cutlass/relatively_equal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -145,6 +145,28 @@ bool relatively_equal(half_t a, half_t b, half_t epsilon, half_t nonzero return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); } +template <> +CUTLASS_HOST_DEVICE +bool relatively_equal( + bfloat16_t a, + bfloat16_t b, + bfloat16_t epsilon, + bfloat16_t nonzero_floor) { + + return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); +} + +template <> +CUTLASS_HOST_DEVICE +bool relatively_equal( + tfloat32_t a, + tfloat32_t b, + tfloat32_t epsilon, + tfloat32_t nonzero_floor) { + + return detail::relatively_equal_float(a, b, epsilon, nonzero_floor); +} + template <> CUTLASS_HOST_DEVICE bool relatively_equal(float a, float b, float epsilon, float nonzero_floor) { diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index 94b2eace..dc5523dc 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
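
With these specializations, bfloat16_t and tfloat32_t comparisons reuse the same relative-error test already applied to half_t and float. A small sketch of a verification predicate built on top of it; the tolerance values are illustrative assumptions:

#include "cutlass/relatively_equal.h"
#include "cutlass/tfloat32.h"

// True when two TF32 values agree to within a relative tolerance. Epsilon is
// picked near the TF32 rounding unit (the type keeps 10 explicit mantissa bits);
// the nonzero floor keeps tiny values from failing on relative error alone.
CUTLASS_HOST_DEVICE
bool tf32_close(cutlass::tfloat32_t a, cutlass::tfloat32_t b) {
  cutlass::tfloat32_t epsilon(0.0005f);                 // roughly 2^-11
  cutlass::tfloat32_t nonzero_floor(1.0f / (1 << 20));  // roughly 1e-6
  return cutlass::relatively_equal(a, b, epsilon, nonzero_floor);
}
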
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,11 @@ public: CUTLASS_DEVICE void fetch() { if (wait_thread) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + #else asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock)); + #endif } } @@ -94,7 +98,11 @@ public: __syncthreads(); if (wait_thread) { + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); + #else asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); + #endif } } }; diff --git a/include/cutlass/subbyte_reference.h b/include/cutlass/subbyte_reference.h index 9ce52901..6f7aab2c 100644 --- a/include/cutlass/subbyte_reference.h +++ b/include/cutlass/subbyte_reference.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_coord.h b/include/cutlass/tensor_coord.h index 043f7a56..d7a6d0df 100644 --- a/include/cutlass/tensor_coord.h +++ b/include/cutlass/tensor_coord.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_ref.h b/include/cutlass/tensor_ref.h index 6567fe81..a805107c 100644 --- a/include/cutlass/tensor_ref.h +++ b/include/cutlass/tensor_ref.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/tensor_view.h b/include/cutlass/tensor_view.h index 3efb16a5..a9cf569d 100644 --- a/include/cutlass/tensor_view.h +++ b/include/cutlass/tensor_view.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
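
On SM70 and newer, the semaphore now reads the lock with acquire semantics and publishes it with release semantics, while older architectures keep the cache-global (.cg) forms. A simplified sketch of the wait/signal pairing a serialized split-K consumer performs; the function, loop, and parameter names are illustrative:

// Spin until the producer has published `expected`, do the dependent work,
// then publish `next` for the following consumer. Mirrors the guarded PTX above.
__device__ void wait_then_signal(int *lock, int expected, int next) {
  int state = -1;
  while (state != expected) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
    asm volatile ("ld.global.acquire.gpu.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#else
    asm volatile ("ld.global.cg.b32 %0, [%1];\n" : "=r"(state) : "l"(lock));
#endif
  }

  // ... accumulate the partial results guarded by the lock ...

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  asm volatile ("st.global.release.gpu.b32 [%0], %1;\n" : : "l"(lock), "r"(next));
#else
  asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(next));
#endif
}
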
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -151,14 +151,20 @@ class TensorView : public TensorRef { /// Updates the pointer and layout object CUTLASS_HOST_DEVICE - void reset(Element* ptr, Layout const &layout, TensorCoord size) { + void reset(Element* ptr, Layout const &layout, TensorCoord const &extent) { Base::reset(ptr, layout); - this->resize(extent_); + this->resize(extent); + } + + /// Updates the pointer + CUTLASS_HOST_DEVICE + void reset(Element* ptr) { + Base::reset(ptr); } /// Changes the size of the view without affecting pointer or layout CUTLASS_HOST_DEVICE - void resize(TensorCoord extent) { + void resize(TensorCoord const &extent) { this->extent_ = extent; } diff --git a/include/cutlass/tfloat32.h b/include/cutlass/tfloat32.h new file mode 100644 index 00000000..64dc3914 --- /dev/null +++ b/include/cutlass/tfloat32.h @@ -0,0 +1,453 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! + \file + \brief Defines a proxy class for storing Tensor Float 32 data type. 
+*/ +#pragma once + +#if !defined(__CUDACC_RTC__) +#include +#include +#include +#endif + +#include "cutlass/cutlass.h" + +namespace cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tensor Float 32 data type +struct alignas(4) tfloat32_t { + + // + // Data members + // + + /// Storage type + uint32_t storage; + + // + // Methods + // + + /// Constructs from an unsigned int + CUTLASS_HOST_DEVICE + static tfloat32_t bitcast(uint32_t x) { + tfloat32_t h; + h.storage = x; + return h; + } + + /// Emulated rounding is fast in device code + CUTLASS_HOST_DEVICE + static tfloat32_t round_half_ulp_truncate(float const &s) { + uint32_t x = reinterpret_cast(s); + + #if defined(__CUDA_ARCH__) + if (::isfinite(s)) { + x += 0x1000u; + } + #else + if (std::isfinite(s)) { + x += 0x1000u; + } + #endif + + return tfloat32_t::bitcast(x); + } + + /// Default constructor + CUTLASS_HOST_DEVICE + tfloat32_t() { } + + /// Floating-point conversion - round toward nearest even + CUTLASS_HOST_DEVICE + explicit tfloat32_t(float x): storage(round_half_ulp_truncate(x).storage) { } + + /// Floating-point conversion - round toward nearest even + CUTLASS_HOST_DEVICE + explicit tfloat32_t(double x): tfloat32_t(float(x)) { + + } + + /// Integer conversion - round toward zero + CUTLASS_HOST_DEVICE + explicit tfloat32_t(int x) { + float flt = static_cast(x); + storage = reinterpret_cast(flt); + } + + /// Converts to float + CUTLASS_HOST_DEVICE + operator float() const { + + // Conversions to IEEE single-precision requires clearing dont-care bits + // of the mantissa. + unsigned bits = (storage & ~0x1fffu); + + return reinterpret_cast(bits); + } + + /// Converts to float + CUTLASS_HOST_DEVICE + operator double() const { + return double(float(*this)); + } + + /// Converts to int + CUTLASS_HOST_DEVICE + explicit operator int() const { + return int(float(*this)); + } + + /// Casts to bool + CUTLASS_HOST_DEVICE + operator bool() const { + return (float(*this) != 0.0f); + } + + /// Obtains raw bits + CUTLASS_HOST_DEVICE + uint32_t raw() const { + return storage; + } + + /// Returns the sign bit + CUTLASS_HOST_DEVICE + bool signbit() const { + return ((raw() & 0x80000000) != 0); + } + + /// Returns the biased exponent + CUTLASS_HOST_DEVICE + int exponent_biased() const { + return int((raw() >> 23) & 0x0ff); + } + + /// Returns the unbiased exponent + CUTLASS_HOST_DEVICE + int exponent() const { + return exponent_biased() - 127; + } + + /// Returns the mantissa + CUTLASS_HOST_DEVICE + int mantissa() const { + return int(raw() & 0x7fffff); + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool signbit(cutlass::tfloat32_t const& h) { + return h.signbit(); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t abs(cutlass::tfloat32_t const& h) { + return cutlass::tfloat32_t::bitcast(h.raw() & 0x7fffffff); +} + +CUTLASS_HOST_DEVICE +bool isnan(cutlass::tfloat32_t const& h) { + return (h.exponent_biased() == 0x0ff) && h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isfinite(cutlass::tfloat32_t const& h) { + return (h.exponent_biased() != 0x0ff); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t nan_tf32(const char*) { + // NVIDIA canonical NaN + return cutlass::tfloat32_t::bitcast(0x7fffffff); +} + +CUTLASS_HOST_DEVICE +bool isinf(cutlass::tfloat32_t const& h) { + return (h.exponent_biased() == 0x0ff) && !h.mantissa(); +} + +CUTLASS_HOST_DEVICE +bool isnormal(cutlass::tfloat32_t const& h) { + return 
h.exponent_biased() && h.exponent_biased() != 0x0ff; +} + +CUTLASS_HOST_DEVICE +int fpclassify(cutlass::tfloat32_t const& h) { + int exp = h.exponent_biased(); + int mantissa = h.mantissa(); + if (exp == 0x0ff) { + if (mantissa) { + return FP_NAN; + } + else { + return FP_INFINITE; + } + } + else if (!exp) { + if (mantissa) { + return FP_SUBNORMAL; + } + else { + return FP_ZERO; + } + } + return FP_NORMAL; +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t sqrt(cutlass::tfloat32_t const& h) { +#if defined(__CUDACC_RTC__) + return cutlass::tfloat32_t(sqrtf(float(h))); +#else + return cutlass::tfloat32_t(std::sqrt(float(h))); +#endif +} + +CUTLASS_HOST_DEVICE +tfloat32_t copysign(tfloat32_t const& a, tfloat32_t const& b) { + + uint32_t a_mag = (reinterpret_cast(a) & 0x7fffffff); + uint32_t b_sign = (reinterpret_cast(b) & 0x80000000); + uint32_t result = (a_mag | b_sign); + + return reinterpret_cast(result); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Standard Library operations and definitions +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace std { + +#if !defined(__CUDACC_RTC__) +/// Numeric limits +template <> +struct numeric_limits { + static bool const is_specialized = true; + static bool const is_signed = true; + static bool const is_integer = false; + static bool const is_exact = false; + static bool const has_infinity = true; + static bool const has_quiet_NaN = true; + static bool const has_signaling_NaN = false; + static std::float_denorm_style const has_denorm = std::denorm_present; + static bool const has_denorm_loss = true; + static std::float_round_style const round_style = std::round_to_nearest; + static bool const is_iec559 = false; + static bool const is_bounded = true; + static bool const is_modulo = false; + static int const digits = 19; + + /// Least positive value + static cutlass::tfloat32_t min() { return cutlass::tfloat32_t::bitcast(0x01); } + + /// Minimum finite value + static cutlass::tfloat32_t lowest() { return cutlass::tfloat32_t::bitcast(0xff7fffff); } + + /// Maximum finite value + static cutlass::tfloat32_t max() { return cutlass::tfloat32_t::bitcast(0x7f7fffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t epsilon() { return cutlass::tfloat32_t::bitcast(0x1000); } + + /// Returns smallest finite value + static cutlass::tfloat32_t round_error() { return cutlass::tfloat32_t(0.5f); } + + /// Returns smallest finite value + static cutlass::tfloat32_t infinity() { return cutlass::tfloat32_t::bitcast(0x7f800000); } + + /// Returns smallest finite value + static cutlass::tfloat32_t quiet_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t signaling_NaN() { return cutlass::tfloat32_t::bitcast(0x7fffffff); } + + /// Returns smallest finite value + static cutlass::tfloat32_t denorm_min() { return cutlass::tfloat32_t::bitcast(0x1); } +}; +#endif + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace std + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// +// Arithmetic operators +// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace 
cutlass { + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +CUTLASS_HOST_DEVICE +bool operator==(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) == float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator!=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) != float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) < float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator<=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) <= float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) > float(rhs); +} + +CUTLASS_HOST_DEVICE +bool operator>=(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return float(lhs) >= float(rhs); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator+(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) + float(rhs)); +} + + +CUTLASS_HOST_DEVICE +tfloat32_t operator-(tfloat32_t const& lhs) { + float x = -reinterpret_cast(lhs); + return reinterpret_cast(x); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator-(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) - float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator*(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) * float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator/(tfloat32_t const& lhs, tfloat32_t const& rhs) { + return tfloat32_t(float(lhs) / float(rhs)); +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator+=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) + float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator-=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) - float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator*=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) * float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator/=(tfloat32_t & lhs, tfloat32_t const& rhs) { + lhs = tfloat32_t(float(lhs) / float(rhs)); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator++(tfloat32_t & lhs) { + float tmp(lhs); + ++tmp; + lhs = tfloat32_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t& operator--(tfloat32_t & lhs) { + float tmp(lhs); + --tmp; + lhs = tfloat32_t(tmp); + return lhs; +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator++(tfloat32_t & lhs, int) { + tfloat32_t ret(lhs); + float tmp(lhs); + tmp++; + lhs = tfloat32_t(tmp); + return ret; +} + +CUTLASS_HOST_DEVICE +tfloat32_t operator--(tfloat32_t & lhs, int) { + tfloat32_t ret(lhs); + float tmp(lhs); + tmp--; + lhs = tfloat32_t(tmp); + return ret; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// +// User-defined literals +// + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t operator "" _tf32(long double x) { + return cutlass::tfloat32_t(float(x)); +} + +CUTLASS_HOST_DEVICE +cutlass::tfloat32_t operator "" _tf32(unsigned long long int x) { + return cutlass::tfloat32_t(int(x)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/thread/matrix.h b/include/cutlass/thread/matrix.h index 1e1f3eeb..a54b3471 100644 --- a/include/cutlass/thread/matrix.h +++ 
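
tfloat32_t rounds on construction by adding half a unit in the last kept mantissa place (0x1000) and then treats the low 13 mantissa bits as don't-care when converting back to float. A worked example with a tiny host-side check (illustrative; the printed digits depend on the platform):

// 0.1f is 0x3DCCCCCD as an IEEE-754 float.
//   round_half_ulp_truncate: 0x3DCCCCCD + 0x1000  = 0x3DCCDCCD   (stored bits)
//   operator float clears the 13 don't-care bits:
//                            0x3DCCDCCD & ~0x1FFF = 0x3DCCC000
//   0x3DCCC000 is 0.0999755859375, the closest TF32 value to 0.1f.
#include <cstdio>
#include "cutlass/tfloat32.h"

int main() {
  cutlass::tfloat32_t x(0.1f);
  std::printf("raw storage : 0x%08x\n", unsigned(x.raw()));   // 0x3dccdccd
  std::printf("as float    : %.10f\n", float(x));             // 0.0999755859...
  return 0;
}
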
b/include/cutlass/thread/matrix.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/pitch_linear_thread_map.h b/include/cutlass/transform/pitch_linear_thread_map.h index 71edb936..812dbd77 100644 --- a/include/cutlass/transform/pitch_linear_thread_map.h +++ b/include/cutlass/transform/pitch_linear_thread_map.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/transpose.h b/include/cutlass/transform/thread/transpose.h index 552295d8..268e6481 100644 --- a/include/cutlass/transform/thread/transpose.h +++ b/include/cutlass/transform/thread/transpose.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/thread/unaryOp.h b/include/cutlass/transform/thread/unaryOp.h new file mode 100644 index 00000000..de4f79b9 --- /dev/null +++ b/include/cutlass/transform/thread/unaryOp.h @@ -0,0 +1,101 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/complex.h" + +namespace cutlass { +namespace transform { +namespace thread { + +namespace UnaryTransform { + struct Identity; ///< None (i.e., identity) + struct Conjugate; ///< Complex conjugate +} + +/// Element-wise unary operator that transforms one element of a fragment at a time +template< + typename FragmentIn, ///< Input Fragment + typename FragmentOut,///< Output Fragment + typename Transform> ///< Unary transform operator +class UnaryOp +{ + public: + CUTLASS_DEVICE + static FragmentOut execute(FragmentIn &in) + { + static_assert(FragmentIn::kElements == FragmentOut::kElements, "Number of elements must match."); + static_assert(std::is_same::value || + std::is_same::value, + "Unary Operator not supported."); + + FragmentOut out; + if( std::is_same::value ) + { + CUTLASS_PRAGMA_UNROLL + for(int i=0; i < FragmentIn::kElements; ++i){ + out[i] = static_cast(in[i]); + } + } + else if( std::is_same::value ) + { + for(int i=0; i < FragmentIn::kElements; ++i){ + out[i] = conj(static_cast(in[i])); + } + } + return out; + } +}; + +template +class UnaryOp +{ + public: + CUTLASS_DEVICE + static FragmentIn execute(FragmentIn &in) + { + static_assert(std::is_same::value || + std::is_same::value, + "Unary Operator not supported."); + + if( std::is_same::value ) + { + return in; + } + else if( std::is_same::value ) + { + for(int i=0; i < FragmentIn::kElements; ++i){ + in[i] = conj(in[i]); + } + } + return in; + } +}; +} +} +} + + diff --git a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h index 2ab40add..c77a09ff 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/predicated_tile_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
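
A device-side usage sketch of the fragment-level transform defined above; the fragment width and element type are arbitrary choices for illustration:

#include "cutlass/array.h"
#include "cutlass/complex.h"
#include "cutlass/transform/thread/unaryOp.h"

using Fragment = cutlass::Array<cutlass::complex<float>, 8>;

// Conjugates every element of a register-resident fragment; substituting
// UnaryTransform::Identity makes the same call a plain pass-through.
__device__ Fragment conjugate_fragment(Fragment frag) {
  return cutlass::transform::thread::UnaryOp<
      Fragment,
      Fragment,
      cutlass::transform::thread::UnaryTransform::Conjugate>::execute(frag);
}
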
* * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: @@ -216,6 +216,7 @@ class PredicatedTileAccessIterator(byte_ptr); - if (address_iterator_.valid()) { - frag_ptr[idx] = *access_ptr; - } + cutlass::arch::global_load( + frag_ptr[idx], access_ptr, address_iterator_.valid()); + ++address_iterator_; } } diff --git a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h index 58d2f7e3..0342a434 100644 --- a/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/predicated_tile_iterator_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h index 2047723d..0d775dff 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h index 73174b57..31f529e0 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h index 6230b7a7..6eef4b52 100644 --- a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
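
The iterator's load path above now funnels through arch::global_load together with the validity predicate, rather than branching around a raw dereference. Conceptually the predicate behaves like the simplification below (a sketch of the semantics, not the library's implementation, which emits predicated PTX):

// Model of a guarded vector load: when the predicate is false, the destination
// fragment is left unmodified and no global-memory access is issued.
template <typename AccessType>
__device__ void guarded_global_load(AccessType &dst, void const *ptr, bool pred) {
  if (pred) {
    dst = *reinterpret_cast<AccessType const *>(ptr);
  }
}
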
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -798,6 +798,269 @@ class RegularTileAccessIterator +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. + static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128bs"); + }; + + private: + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : byte_offset_(0) { + layout::PitchLinearCoord thread_offset_base = + ThreadMap::initial_offset(thread_id); + + // initialize pointer + pointer_ = reinterpret_cast( + ref.data() + ref.offset(thread_offset_base)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + AccessType *access_ptr = pointer_; + + int access_offset = + (iteration_strided_ * ThreadMap::Delta::kStrided * Layout::kInterleavedK + + iteration_contiguous_ * ThreadMap::Delta::kContiguous) / ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_strided_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + add_pointer_offset(coord.contiguous() * Shape::kCount); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for k interleaved arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// + +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + cutlass::MatrixShape, + Element, + layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, + (kAdvanceRank == 1 ? 0 : 1), + ThreadMap + >; + + private: + + /// Element type per access + using AccessType = Array; + + private: + + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + iterator_.set_iteration_index(index); + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return iterator_.get(); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h new file mode 100644 index 00000000..5a0c74fd --- /dev/null +++ b/include/cutlass/transform/threadblock/regular_tile_access_iterator_tensor_op_sm80.h @@ -0,0 +1,1522 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Templates implementing computing the addresses of storing of tiles + from pitch-linear rank=2 tensors. 
+*/ + +#pragma once + +#include "cutlass/array.h" +#include "cutlass/cutlass.h" +#include "cutlass/layout/pitch_linear.h" +#include "cutlass/layout/tensor_op_multiplicand_sm75.h" +#include "cutlass/layout/tensor_op_multiplicand_sm80.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/matrix_shape.h" +#include "cutlass/tensor_ref.h" +#include "cutlass/transform/threadblock/regular_tile_access_iterator.h" + +//////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace transform { +namespace threadblock { + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCongruous64b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 64; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 64b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType *access_ptr = pointer_; + + int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous / + ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous + + coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCongruous64b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous64b, + (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCongruous64b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous64b, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for crosswise arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicand64bCrosswise, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. + static int const kAccessSizeInBits = 64; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 64b"); + + ///< Number of pointers - two pointers are needed if making more than 4 iterations along + ///< strided dimension + static int const kPointerCount = (ThreadMap::Iterations::kStrided > 4 ? 
2 : 1); + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_[Detail::kPointerCount]; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / ThreadMap::kElementsPerAccess) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data()); + + byte_offset_[0] = ref.offset(thread_offset_in_threadblock_tile) * sizeof(Element); + + if (Detail::kPointerCount == 2) { + byte_offset_[1] = byte_offset_[0] ^ 8; + } + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + pointer_ += pointer_offset / ThreadMap::kElementsPerAccess; + } + + /// Returns a pointer + CUTLASS_DEVICE + AccessType *get() const { + + // Map the logical contiguous and strided access to the internal swizzled structure. + int uniform_offset = (iteration_strided_ & 0x3) * stride_ + (iteration_strided_ >> 3) * 16; + + char *access_byte_ptr = reinterpret_cast(pointer_ + uniform_offset); + + int byte_offset; + + // This iterator may require two byte offsets if it must load more than 8 rows (or 2 iterations) + // in the strided dimension + if (Detail::kPointerCount == 2 && (iteration_strided_ & 0x4)) { + byte_offset = byte_offset_[1]; + } + else { + byte_offset = byte_offset_[0]; + } + + return reinterpret_cast(access_byte_ptr + byte_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset(coord.strided() * Shape::kStrided + coord.contiguous() * Shape::kContiguous * stride_); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major crosswise TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicand64bCrosswise, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicand64bCrosswise, + (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major crosswise TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicand64bCrosswise; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicand64bCrosswise, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCongruous128b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. + static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType 
*access_ptr = pointer_; + + int access_offset = iteration_strided_ * ThreadMap::Delta::kStrided * stride_ + + iteration_contiguous_ * ThreadMap::Delta::kContiguous / + ThreadMap::kElementsPerAccess; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous + + coord.strided() * Shape::kStrided * stride_ * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCongruous128b, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous128b, + (kAdvanceRank == 0 ? 
0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCongruous128b; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCongruous128b, + (kAdvanceRank == 0 ? 
1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for congruous arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::TensorOpMultiplicandCrosswise128x4, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::TensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + static_assert(ThreadMap::kThreads / 32 > 1, + "This tile iterator requires at least two warps."); + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * + ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128b"); + + ///< Number of pointers + static int const kPointerCount = 1; + }; + + + static_assert(!(ThreadMap::Iterations::kStrided % 2), "This iterator requires at least two iterations along the strided dimension"); + + /// Element type per access + using AccessType = Array; + + private: + // + // Data members + // + + /// Stride value + Index stride_; + + /// Internal pointer to first access of tile + AccessType *pointer_; + + /// Internal byte offset + Index byte_offset_; + + /// Iteration in the contiguous dimension + int iteration_contiguous_; + + /// Iteration in the strided dimension + int iteration_strided_; + + public: + + /// Construct a TileIterator with zero threadblock offset + CUTLASS_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + stride_(ref.stride(0) / Layout::kElementsPerAccess), + byte_offset_(0) { + + layout::PitchLinearCoord thread_offset_base = ThreadMap::initial_offset(thread_id); + + // This is the offset of a thread within a threadblock tile for a specific + // pointer (units of elements) + layout::PitchLinearCoord thread_offset_in_threadblock_tile = thread_offset_base; + + // initialize pointer + pointer_ = reinterpret_cast(ref.data() + ref.offset(thread_offset_in_threadblock_tile)); + + set_iteration_index(0); + } + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { + + iteration_contiguous_ = index % ThreadMap::Iterations::kContiguous; + iteration_strided_ = index / ThreadMap::Iterations::kContiguous; + } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + + byte_offset_ += pointer_offset * sizeof(Element); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + + AccessType *access_ptr = pointer_; + + int offset_c = (iteration_contiguous_ * ThreadMap::Delta::kContiguous + (iteration_strided_ & 1) * 2); + int offset_s = (iteration_strided_ / 2) * 8; + + int access_offset = offset_c * stride_ + offset_s; + + char *access_byte_ptr = + reinterpret_cast(access_ptr + access_offset); + + return reinterpret_cast(access_byte_ptr + byte_offset_); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iteration_contiguous_; + + if (iteration_contiguous_ < ThreadMap::Iterations::kContiguous) + return *this; + + // Enter here only if (iteration_contiguous_ == + // ThreadMap::Iteration::kContiguous) + iteration_contiguous_ = 0; + ++iteration_strided_; + + if (iteration_strided_ < ThreadMap::Iterations::kStrided) { + return *this; + } + + // Enter here only if (iteration_stride_ == ThreadMap::Iteration::kStrided) + // which means we enter the next tile. + iteration_strided_ = 0; + + return *this; + } + + /// Advances to the next tile in memory. 
+ CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + + RegularTileAccessIterator prev(*this); + + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + + add_pointer_offset( + coord.contiguous() * Shape::kContiguous * stride_ + + coord.strided() * Shape::kStrided * Layout::kElementsPerAccess); + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for column-major congruous TensorOp formats. +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator< + Shape_, Element_, + layout::ColumnMajorTensorOpMultiplicandCrosswise128x4, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for column-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCrosswise128x4, + (kAdvanceRank == 0 ? 0 : 1), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.row(), coord.column()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile Iterator specialized for row-major congruous TensorOp formats. 
+/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// +template +class RegularTileAccessIterator { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for row-major iterator may along advance along the " + "columns(rank=0) or rows(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = layout::RowMajorTensorOpMultiplicandCrosswise128x4; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileAccessIterator< + layout::PitchLinearShape, Element, + layout::TensorOpMultiplicandCrosswise128x4, + (kAdvanceRank == 0 ? 1 : 0), ThreadMap_>; + + using AccessType = typename UnderlyingIterator::AccessType; + + private: + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileAccessIterator( + TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ): + iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Overrides the internal iteration index + CUTLASS_HOST_DEVICE + void set_iteration_index(int index) { iterator_.set_iteration_index(index); } + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Returns a pointer + CUTLASS_HOST_DEVICE + AccessType *get() const { + return reinterpret_cast(iterator_.get()); + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.column(), coord.row()}); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileAccessIterator operator++(int) { + RegularTileAccessIterator prev(*this); + ++iterator_; + + return prev; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace threadblock +} // namespace transform +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator.h b/include/cutlass/transform/threadblock/regular_tile_iterator.h index 8445b836..d7928ac0 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h index 93849c65..c3f0b524 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h index 4ea47293..85d702fe 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear_2dthreadtile.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h index 21176880..c7f06907 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -831,6 +831,269 @@ class RegularTileIterator +class RegularTileIterator< + Shape_, Element_, + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may along advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandRowMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Internal details made public to facilitate introspection + struct Detail { + /// This iterator is specialized for an access size that is 128 bits in + /// length. 
+ static int const kAccessSizeInBits = 128; + + static_assert(sizeof_bits::value * ThreadMap::kElementsPerAccess == + kAccessSizeInBits, + "This iterator requires a policy whose access size is 128b"); + }; + + private: + + /// Element type per access + using AccessType = Array; + + public: + /// Fragment object to be loaded or stored + using Fragment = + Array; + + /// Underlying iterator to compute the addresses + using TileAccessIterator = RegularTileAccessIterator; + + private: + // + // Data members + // + + /// Data member to the tile access iterator + TileAccessIterator address_iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : address_iterator_(ref, thread_id) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + address_iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator &operator++() { + address_iterator_.add_pointer_offset(Shape::kCount); + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator operator++(int) { + RegularTileIterator prev(*this); + this->operator++(); + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + address_iterator_.add_pointer_offset(coord.contiguous() * Shape::kCount); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + address_iterator_.set_iteration_index(0); + AccessType *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + int access_idx = c + s * ThreadMap::Iterations::kContiguous; + frag_ptr[access_idx] = *(address_iterator_.get() + pointer_offset); + ++address_iterator_; + } + } + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + AccessType const *frag_ptr = reinterpret_cast(&frag); + + CUTLASS_PRAGMA_UNROLL + for (int s = 0; s < ThreadMap::Iterations::kStrided; ++s) { + CUTLASS_PRAGMA_UNROLL + for (int c = 0; c < ThreadMap::Iterations::kContiguous; ++c) { + int access_idx = c + s * ThreadMap::Iterations::kContiguous; + *(address_iterator_.get() + pointer_offset) = frag_ptr[access_idx]; + ++address_iterator_; + } + } + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); } +}; + +//////////////////////////////////////////////////////////////////////////////// + +/// Tile iterator specialized for k interleaved arrangements for TensorOps +/// +/// +/// Satisfies: ForwardTileIteratorConcept | +/// ReadableContiguousTileIteratorConcept | +/// WriteableContiguousTileIteratorConcept +/// + +template +class RegularTileIterator< + Shape_, Element_, + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>, + AdvanceRank, ThreadMap_, Alignment> { + + public: + static_assert( + AdvanceRank == 0 || AdvanceRank == 1, + "Specialization for pitch-linear iterator may
advance along the " + "contiguous(rank=0) or strided(rank=1) dimension."); + + using Shape = Shape_; + using Element = Element_; + using Layout = + layout::TensorOpMultiplicandColumnMajorInterleaved::value, + InterleavedK>; + static int const kAdvanceRank = AdvanceRank; + static int const kAlignment = Alignment; + + using Index = typename Layout::Index; + using LongIndex = typename Layout::LongIndex; + + using TensorRef = TensorRef; + using TensorCoord = typename Layout::TensorCoord; + + using ThreadMap = ThreadMap_; + + /// Underlying iterator type + using UnderlyingIterator = RegularTileIterator< + cutlass::MatrixShape, + Element, + layout::TensorOpMultiplicandRowMajorInterleaved::value, InterleavedK>, + (kAdvanceRank == 1 ? 0 : 1), + ThreadMap + >; + + public: + /// Fragment object to be loaded or stored + using Fragment = Array; + + private: + + /// Underlying iterator + UnderlyingIterator iterator_; + + public: + /// Construct a TileIterator with zero threadblock offset + CUTLASS_HOST_DEVICE + RegularTileIterator(TensorRef ref, ///< Pointer to start of tensor + int thread_id ///< ID of each participating thread + ) + : iterator_({ref.data(), ref.stride()}, thread_id) {} + + /// Adds a pointer offset in units of Element + CUTLASS_HOST_DEVICE + void add_pointer_offset(LongIndex pointer_offset) { + iterator_.add_pointer_offset(pointer_offset); + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator &operator++() { + ++iterator_; + return *this; + } + + /// Advances to the next tile in memory. + CUTLASS_HOST_DEVICE + RegularTileIterator operator++(int) { + RegularTileIterator prev(*this); + ++iterator_; + + return prev; + } + + /// Adds a tile offset + CUTLASS_DEVICE + void add_tile_offset(TensorCoord const &coord) { + iterator_.add_tile_offset({coord.strided(), coord.contiguous()}); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load_with_pointer_offset(Fragment &frag, Index pointer_offset) { + iterator_.load_with_pointer_offset(frag, pointer_offset); + } + + /// Loads a fragment from memory + CUTLASS_DEVICE + void load(Fragment &frag) { load_with_pointer_offset(frag, 0); } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store_with_pointer_offset(Fragment const &frag, Index pointer_offset) { + iterator_.store_with_pointer_offset(frag, pointer_offset); + } + + /// Store a fragment to memory + CUTLASS_DEVICE + void store(Fragment const &frag) { store_with_pointer_offset(frag, 0); } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + } // namespace threadblock } // namespace transform } // namespace cutlass diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h index ff5f0b45..82c8842e 100644 --- a/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h +++ b/include/cutlass/transform/threadblock/regular_tile_iterator_tensor_op_sm70.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/include/cutlass/util/debug.h b/include/cutlass/util/debug.h deleted file mode 100644 index 9941b41a..00000000 --- a/include/cutlass/util/debug.h +++ /dev/null @@ -1,122 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without modification, are permitted - * provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright notice, this list of - * conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright notice, this list of - * conditions and the following disclaimer in the documentation and/or other materials - * provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used - * to endorse or promote products derived from this software without specific prior written - * permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR - * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND - * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - **************************************************************************************************/ - -#pragma once - -/** - * \file - * \brief Debugging and logging functionality - */ - -#include - -namespace cutlass { - -/****************************************************************************** - * Debug and logging macros - ******************************************************************************/ - -/** - * Formats and prints the given message to stdout - */ -#if !defined(CUDA_LOG) -#if !defined(__CUDA_ARCH__) -#define CUDA_LOG(format, ...) printf(format, __VA_ARGS__) -#else -#define CUDA_LOG(format, ...) \ - printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, \ - blockIdx.x, \ - blockIdx.y, \ - blockIdx.z, \ - threadIdx.x, \ - threadIdx.y, \ - threadIdx.z, \ - __VA_ARGS__); -#endif -#endif - -/** - * Formats and prints the given message to stdout only if DEBUG is defined - */ -#if !defined(CUDA_LOG_DEBUG) -#ifdef DEBUG -#define CUDA_LOG_DEBUG(format, ...) CUDA_LOG(format, __VA_ARGS__) -#else -#define CUDA_LOG_DEBUG(format, ...) -#endif -#endif - -/** - * \brief The corresponding error message is printed to \p stderr (or \p stdout in device code) - * along with the supplied source context. - * - * \return The CUDA error. 
- */ -__host__ CUTLASS_DEVICE cudaError_t cuda_perror_impl(cudaError_t error, - const char* filename, - int line) { - (void)filename; - (void)line; - if (error) { -#if !defined(__CUDA_ARCH__) - fprintf( - stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); - fflush(stderr); -#else - printf("CUDA error %d [%s, %d]\n", error, filename, line); -#endif - } - return error; -} - -/** - * \brief Perror macro - */ -#ifndef CUDA_PERROR -#define CUDA_PERROR(e) cuda_perror_impl((cudaError_t)(e), __FILE__, __LINE__) -#endif - -/** - * \brief Perror macro with exit - */ -#ifndef CUDA_PERROR_EXIT -#define CUDA_PERROR_EXIT(e) \ - if (cuda_perror_impl((cudaError_t)(e), __FILE__, __LINE__)) { \ - exit(1); \ - } -#endif - -/** - * \brief Perror macro only if DEBUG is defined - */ -#ifndef CUDA_PERROR_DEBUG -#ifdef DEBUG -#define CUDA_PERROR_DEBUG(e) CUDA_PERROR(e) -#else -#define CUDA_PERROR_DEBUG(e) (e) -#endif -#endif - -} // namespace cutlass diff --git a/include/cutlass/wmma_array.h b/include/cutlass/wmma_array.h index 7758309e..e8096139 100644 --- a/include/cutlass/wmma_array.h +++ b/include/cutlass/wmma_array.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/media/docs/code_organization.md b/media/docs/code_organization.md index ffab354e..9a00d305 100644 --- a/media/docs/code_organization.md +++ b/media/docs/code_organization.md @@ -88,6 +88,7 @@ tools/ cutlass/ library/ # header files for CUTLASS Deliverables Library (in cutlass::library:: namespace) + handle.h # implements a host-side API for launching kernels, similar to cuBLAS library.h # defines enums and structs to describe the tiled structure of operator instances manifest.h # collection of all instances @@ -175,6 +176,14 @@ examples/ 07_volta_tensorop_gemm/ # example demonstrating mixed precision GEMM using Volta Tensor Cores 08_turing_tensorop_gemm/ # example demonstrating integer GEMM using Turing Tensor Cores + + 10_planar_complex/ # example demonstrating planar complex GEMM kernels + + 11_planar_complex_array/ # example demonstrating planar complex kernels with batch-specific problem sizes + + 12_gemm_bias_relu/ # example demonstrating GEMM fused with bias and relu + + 13_fused_two_gemms/ # example demonstrating two GEMms fused in one kernel ``` ## Media @@ -211,7 +220,7 @@ of tests run may vary over time as more are added. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/doxygen_mainpage.md b/media/docs/doxygen_mainpage.md index 6b8e09dd..15656d25 100644 --- a/media/docs/doxygen_mainpage.md +++ b/media/docs/doxygen_mainpage.md @@ -120,7 +120,7 @@ cudaError_t cutlass_sgemm_nn( # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/efficient_gemm.md b/media/docs/efficient_gemm.md index d601ff5a..7a1a6ae7 100644 --- a/media/docs/efficient_gemm.md +++ b/media/docs/efficient_gemm.md @@ -216,6 +216,7 @@ participating warps - since each warp now owns a partial sum (since they compute The following additional resources describe design and implementation details of GEMMs targeting NVIDIA GPUs. +- [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100.](https://www.nvidia.com/en-us/gtc) (SR 21745) - [CUTLASS: Fast Linear Algebra in CUDA C++](https://devblogs.nvidia.com/cutlass-linear-algebra-cuda/) - [CUTLASS: SOFTWARE PRIMITIVES FOR DENSE LINEAR ALGEBRA AT ALL LEVELS AND SCALES WITHIN CUDA](https://on-demand-gtc.gputechconf.com/gtcnew/sessionview.php?sessionName=s8854-cutlass%3a+software+primitives+for+dense+linear+algebra+at+all+levels+and+scales+within+cuda) - [Programming Tensor Cores: NATIVE VOLTA TENSOR CORES WITH CUTLASS](https://developer.download.nvidia.com/video/gputechconf/gtc/2019/presentation/s9593-cutensor-high-performance-tensor-operations-in-cuda-v2.pdf) @@ -224,7 +225,7 @@ targeting NVIDIA GPUs. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/functionality.md b/media/docs/functionality.md index de8da82d..465fae7d 100644 --- a/media/docs/functionality.md +++ b/media/docs/functionality.md @@ -27,7 +27,16 @@ Hyperlinks to relevant unit tests demonstrate how specific template instances ma | **TensorOp** | 75 | 10.2+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu) | | **TensorOp** | 75 | 10.2+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu) | | **TensorOp** | 75 | 10.2+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu) | - +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f16 => f16` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f16 * f16 + f32 => {f16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `bf16 * bf16 + f32 => {bf16, f32}`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_bf16n_bf16t_bf16t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `tf32 * tf32 + f32 => f32`| {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s8 * s8 + s32 => {s32, s8}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `s4 * s4 + s32 => {s32, s4}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `b1 ^ b1 + s32 => {s32, b1}` | { T } x { N } => {N,T} | [example](/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `f64 * f64 + f64 => f64` | {N,T} x {N,T} => {N,T} | 
[example](/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `cf32 * cf32 + cf32 => cf32` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu) | +| **TensorOp** | 80 | 11.0+ | `cf64 * cf64 + cf64 => cf64` | {N,T} x {N,T} => {N,T} | [example](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu), [Gaussian 3m](/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu) | ## Warp-level Matrix Multiply with Tensor Cores @@ -37,9 +46,13 @@ The following table summarizes supported warp level shapes for each TensorOp ins |-----------------|-----------------------|--------------------------------------------| | **TensorOp** | 8-by-8-by-4 | 32x32x4, 32x64x4, 64x32x4, 64x64x4 | | **TensorOp** | 16-by-8-by-8 | 32x32x8, 32x64x8, 64x32x8, 64x64x8 | +| **TensorOp** | 16-by-8-by-16 | 32x32x16, 32x64x16, 64x32x16, 64x64x16 | | **TensorOp** | 8-by-8-by-16 | 32x32x16, 32x64x16, 64x32x16, 64x64x16 | | **TensorOp** | 8-by-8-by-32 | 32x32x32, 32x64x32, 64x32x32, 64x64x32 | +| **TensorOp** | 16-by-8-by-32 | 32x32x32, 32x64x32, 64x32x32, 64x64x32 | +| **TensorOp** | 16-by-8-by-64 | 32x32x64, 32x64x64, 64x32x64, 64x64x64 | | **TensorOp** | 8-by-8-by-128 | 32x32x128, 32x64x128, 64x32x128, 64x64x128 | +| **TensorOp** | 16-by-8-by-256 | 32x32x256, 32x64x256, 64x32x256, 64x64x256 | TensorOp instructions depend on a permuted shared memory layout that can be efficiently loaded from. The following tables summarize the destination shared memory layout that @@ -68,6 +81,38 @@ from global memory with layout specified in the column "GMEM Layout." | **C** | `half_t` | `RowMajor` | `RowMajor` | | **C** | `float` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-8.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `tfloat32_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<32>` | +| **A** | `tfloat32_t` | `RowMajor` | `RowMajorTensorOpCrosswise<32>` | +| **B** | `tfloat32_t` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<32>` | +| **B** | `tfloat32_t` | `RowMajor` | `RowMajorTensorOpCongruous<32>` | +| **C** | `float` | `RowMajor` | `RowMajor` | + + +**TensorOp 16-by-8-by-16.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `half_t`, `bfloat16_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<16>` | +| **A** | `half_t`, `bfloat16_t` | `RowMajor` | `RowMajorTensorOpCrosswise<16>` | +| **B** | `half_t`, `bfloat16_t` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<16>` | +| **B** | `half_t`, `bfloat16_t` | `RowMajor` | `RowMajorTensorOpCongruous<16>` | +| **C** | `half_t` | `RowMajor` | `RowMajor` | +| **C** | `float` | `RowMajor` | `RowMajor` | + +**TensorOp 8-by-8-by-4.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `double` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<64>` | +| **A** | `double` | `RowMajor` | `RowMajorTensorOpCrosswise<64>` | +| **B** | `double` | `ColumnMajor` | `ColumnMajorTensorOpCrosswise<64>` | +| **B** | `double` | `RowMajor` | `RowMajorTensorOpCongruous<64>` | +| **C** | `double` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-16.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -76,6 +121,14 @@ 
from global memory with layout specified in the column "GMEM Layout." | **B** | `int8_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<8>` | | **C** | `int32_t` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-32.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `int8_t` | `RowMajor` | `RowMajorTensorOpCrosswise<8>` | +| **B** | `int8_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<8>` | +| **C** | `int32_t` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-32.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -84,6 +137,14 @@ from global memory with layout specified in the column "GMEM Layout." | **B** | `int4b_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<4>` | | **C** | `int32_t` | `RowMajor` | `RowMajor` | +**TensorOp 16-by-8-by-64.** + +|**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | +|-----------|--------------|-----------------|------------------------------------| +| **A** | `int4b_t` | `RowMajor` | `RowMajorTensorOpCrosswise<4>` | +| **B** | `int4b_t` | `ColumnMajor` | `ColumnMajorTensorOpCongruous<4>` | +| **C** | `int32_t` | `RowMajor` | `RowMajor` | + **TensorOp 8-by-8-by-128.** |**Operand**|**Element** | **GMEM Layout** | **SMEM Layout** | @@ -119,7 +180,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/fundamental_types.md b/media/docs/fundamental_types.md index 98374022..7556cd45 100644 --- a/media/docs/fundamental_types.md +++ b/media/docs/fundamental_types.md @@ -16,6 +16,8 @@ Most types in CUTLASS are usable in both host code and device code. Moreover, th CUTLASS defines classes for the following numeric data types. * `half_t`: IEEE half-precision floating point (exponent: 5b, mantissa: 10b; literal suffix `_hf`) +* `bfloat16_t`: BFloat16 data type (exponent: 8b, mantissa: 7b; literal suffix `_bf16`) +* `tfloat32_t`: Tensor Float 32 data type (exponent: 8b, mantissa: 10b; literal suffix `_tf32`) * `int4_t`, `uint4_t`: 4b signed and unsigned integer (literal suffx `_s4`, `_u4`) * `bin1_t`: 1b binary numeric type (literal suffix `_b1`) * `complex`: defines complex-valued data type based on the supplied real-valued numeric type @@ -182,6 +184,39 @@ AlignedArray *ptr = reinterpret_cast *>(smem_ AlignedArray x = ptr[threadIdx.x]; // 128b shared memory load ``` +### Numeric Conversion + +CUTLASS defines procedures for performing numeric conversion between data types in `cutlass/numeric_conversion.h`. +Where possible, these target hardware acceleration on the target architecture and support multiple rounding modes. + +```c++ +#include "cutlass/numeric_conversion.h" +#include "cutlass/numeric_types.h" + +NumericConverter convert_f32_to_f16; +NumericConverter convert_f32_to_tf32; + +half_t x = convert_f32_to_f16(3.14159f); +tfloat32_t y = convert_f32_to_tf32(3.14159f); +``` + +Recent GPU architectures such as NVIDIA Turing and Ampere combine numeric conversion with efficient packing +into bit vectors. Consequently, CUTLASS defines conversion on both scalars and `Array<>` objects to implement +the optimal code sequence on all architectures. 
+ +```c++ +// +// Example: convert and pack 32b signed integers to a vector of packed signed 8-bit integers. +// +int const kN = 16; +Array<int8_t, kN> destination; +Array<int, kN> source; + +NumericConverter convert; + +destination = convert(source); +``` + (A self-contained form of this packed conversion is sketched below.) ### Coord ```c++ @@ -311,7 +346,7 @@ support on current and future NVIDIA GPUs. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/gemm_api.md b/media/docs/gemm_api.md index 0d58cd36..759b1cd4 100644 --- a/media/docs/gemm_api.md +++ b/media/docs/gemm_api.md @@ -514,7 +514,7 @@ to inline PTX. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/layout.md b/media/docs/layout.md index fc36a276..bacec0e4 100644 --- a/media/docs/layout.md +++ b/media/docs/layout.md @@ -267,7 +267,7 @@ Permuted Shared Memory Layouts: # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/profiler.md b/media/docs/profiler.md index 34051651..ad4c58ab 100644 --- a/media/docs/profiler.md +++ b/media/docs/profiler.md @@ -15,7 +15,7 @@ $ make cutlass_profiler -j To limit compilation time, only one tile size (128x128) is instantiated for each data type, math instruction, and layout. To instantiate all sizes, set the following environment variable when running CMake from an empty `build/` directory. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=all ... $ make cutlass_profiler -j ``` @@ -102,7 +102,7 @@ Report: --verbose= If true (default), prints human-readable text to stdout. About: - --version CUTLASS 2.0.0 built on Nov 19 2019 at 13:01:00 + --version CUTLASS 2.2.0 built on Jun 8 2020 at 07:59:33 Operations: --operation= Specifies a particular operation to run or print the usage statement.
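Completing the packed `Array<>` conversion example from `fundamental_types.md` above: a minimal sketch, assuming the `NumericArrayConverter<Destination, Source, N>` template declared in `cutlass/numeric_conversion.h`; the helper name `to_packed_int8` is illustrative and not part of the original example.

```c++
#include <cstdint>

#include "cutlass/array.h"
#include "cutlass/numeric_conversion.h"

// Convert and pack a vector of 32b signed integers into packed signed 8-bit
// integers, mirroring the fundamental_types.md example above. A FloatRoundStyle
// template parameter (defaulting to round_to_nearest) selects the rounding mode
// when the source type is floating point.
cutlass::Array<int8_t, 16> to_packed_int8(cutlass::Array<int, 16> const &source) {

  cutlass::NumericArrayConverter<int8_t, int, 16> convert;

  return convert(source);
}
```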
@@ -191,29 +191,34 @@ Test your changes to gemm kernels with a quick functional test and save results Example command line for profiling SGEMM kernels is as follows: ```bash -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=4352 --n=4096 --k=4096 +$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 + + ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_simt_sgemm_128x128_nn + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 \ - --split_k_slices=1 --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 \ - --stages=2 --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 \ - --max_cc=1024 + cuBLAS: Passed - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - Runtime: 10.5424 ms - Memory: 4.63158 GiB/s + Bytes: 180355072 bytes + FLOPs: 115992428544 flops - Math: 13854.9 GFLOP/s + Runtime: 6.73655 ms + Memory: 24.934 GiB/s + + Math: 17218.4 GFLOP/s ``` Note, the arguments which appear in the output may be used as command line parameters for subsequent invocations. @@ -224,31 +229,34 @@ Note, the arguments which appear in the output may be used as command line param To execute kernels targeting Tensor Core operations, supply the flag `--op_class=tensorop` in the command line. 
```bash -$ ./tools/profiler/cutlass_profiler --op_class=tensorop +$ ./tools/profiler/cutlass_profiler --op_class=tensorop --m=3456 --n=4096 --k=8192 + + ============================= Problem ID: 1 - Provider: CUTLASS - Operation: cutlass_turing_h1688gemm_128x128_nt + Provider: CUTLASS + OperationKind: gemm + Operation: cutlass_tensorop_s16816gemm_f16_256x128_32x3_nn_align8 - Disposition: Passed - Status: Success + Status: Success + Verification: ON + Disposition: Passed - Arguments: --m=4352 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f16:column --alpha=1 --beta=0 \ - --op_class=tensorop --accum=f16 --cta_m=128 --cta_n=128 --cta_k=32 --stages=2 \ - --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 \ - --min_cc=75 --max_cc=1024 + cuBLAS: Passed + Arguments: --m=3456 --n=4096 --k=8192 --A=f16:column --B=f16:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ + --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128 --cta_k=32 --stages=3 --warps_m=4 \ + --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 - Bytes: 52428800 bytes - FLOPs: 146064539648 flops + Bytes: 180355072 bytes + FLOPs: 231956545536 flops - Runtime: 1.51255 ms - Memory: 32.2821 GiB/s - - Math: 96568.7 GFLOP/s + Runtime: 0.98647 ms + Memory: 170.272 GiB/s + Math: 235138 GFLOP/s ``` ## Covering the problem space @@ -271,7 +279,7 @@ with the `--output=` command line option as shown: ```bash $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn \ - --m=4352 --n=4096 --k=8:4096:8 --output=report.csv + --m=3456 --n=4096 --k=8:4096:8 --output=report.csv ``` To faclitate generation of pivot tables and charts, additional columns may be prepended with the @@ -279,13 +287,13 @@ To faclitate generation of pivot tables and charts, additional columns may be pr ```bash $ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sgemm_128x128_nn \ - --m=4352 --n=4096 --k=8:4096:8 --output=report.csv \ - --tags=cutlass:2.0,date:2019-11-19 + --m=3456 --n=4096 --k=8:4096:8 --output=report.csv \ + --tags=cutlass:2.2,date:2020-06-08 ``` # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/programming_guidelines.md b/media/docs/programming_guidelines.md index 5ce16af1..0cf7ea25 100644 --- a/media/docs/programming_guidelines.md +++ b/media/docs/programming_guidelines.md @@ -104,6 +104,14 @@ for (int idx = 0; idx < kN; ++idx) { // Loop has constant number of iterati ## Style +### C++ Style + +CUTLASS source code follows the +[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with exceptions and extensions. + +Design choices should be consistent with the +[CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) recommendations by Stroustrup and Sutter. + ### CUDA Built-in Variables Avoid direct access to CUDA built-in variables `threadIdx`, `blockIdx`, `blockDim`, and `gridDim` within @@ -132,14 +140,6 @@ In particular, be sure to use: Avoid defining alternative implementations of the same functionality. Instead, prefer to enhance or extend additional components where it makes sense. -### C++ Style - -CUTLASS source code follows the -[Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with exceptions and extensions. 
- -Design choices should be consistent with the -[CppCoreGuidelines](https://github.com/isocpp/CppCoreGuidelines/blob/master/CppCoreGuidelines.md) recommendations by Stroustrup and Sutter. - ### Classes and Structs Type names use `CapitalLetters` except when implementations are a _perfect_ drop-in replacement for @@ -178,9 +178,10 @@ Members within classes and structures should be organized as follows: 3. Constructors 4. Other methods -This convention follows the [CUB library](https://nvlabs.github.io/cub/), -and it also approximates the usual order of Systems and Controls textbooks. That is, they start by -(1.) identifying relevant constants, (2.) define a state-space representation of the dynamical system +This convention follows the [CUB library](https://nvlabs.github.io/cub/) and is also described by +[Howard Hinnant](https://howardhinnant.github.io/classdecl.html). Unsurprisingly, it approximates +the usual ordering of chapters in a typical Systems and Controls textbook. That is, +(1.) identify relevant constants, (2.) define a state-space representation of the dynamical system under study (i.e. the data members), and (3.) devote subsequent chapters to definining dynamical behavior of the system (i.e. the methods). @@ -291,7 +292,7 @@ Github's pretty printer. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/quickstart.md b/media/docs/quickstart.md index 5f459223..4587b7d2 100644 --- a/media/docs/quickstart.md +++ b/media/docs/quickstart.md @@ -7,7 +7,7 @@ ## Prerequisites CUTLASS requires: -- NVIDIA CUDA Toolkit (9.2 or later required, 10.2 recommended) +- NVIDIA CUDA Toolkit (9.2 or later required, [11.0](https://developer.nvidia.com/cuda-toolkit) recommended) - CMake 3.12+ - host compiler supporting C++11 or greater (g++ 7.3.0 or Microsoft Visual Studio 2015 recommended) - Python 3.6+ @@ -20,23 +20,7 @@ $ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc $ mkdir build && cd build -$ cmake .. -DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA's Turing GPU architecture -``` - -## Clang - -For experimental purposes, CUTLASS may be compiled with -[clang 8.0](https://github.com/llvm/llvm-project/releases/download/llvmorg-8.0.1/clang+llvm-8.0.1-amd64-unknown-freebsd11.tar.xz) using the -[CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-10.0-download-archive). -At this time, compiling with clang enables the CUTLASS SIMT GEMM kernels (sgemm, dgemm, hgemm, igemm) -but does not enable TensorCores. - -```bash -$ mkdir build && cd build - -$ cmake -DCUDA_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. - -$ make test_unit -j +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture ``` ## Build and run the CUTLASS Profiler @@ -120,6 +104,53 @@ $ make test_unit_gemm_warp -j [100%] Built target test_unit_gemm_warp ``` +## Building for Multiple Architectures + +To minimize compilation time, specific GPU architectures can be enabled via the CMake command, +selected by [CUDA Compute Capability.](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities) + +**NVIDIA Ampere Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA Ampere GPU architecture +``` + +**NVIDIA Turing Architecture.** +```bash +$ cmake .. 
-DCUTLASS_NVCC_ARCHS=75 # compiles for NVIDIA Turing GPU architecture +``` + +**NVIDIA Volta Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS=70 # compiles for NVIDIA Volta GPU architecture +``` + +**NVIDIA Pascal Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS="60;61" # compiles for NVIDIA Pascal GPU architecture +``` + +**NVIDIA Maxwell Architecture.** +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS="50;53" # compiles for NVIDIA Maxwell GPU architecture +``` + +## Clang + +For experimental purposes, CUTLASS may be compiled with +[clang 8.0](https://github.com/llvm/llvm-project/releases/download/llvmorg-8.0.1/clang+llvm-8.0.1-amd64-unknown-freebsd11.tar.xz) using the +[CUDA 10.0 Toolkit](https://developer.nvidia.com/cuda-10.0-download-archive). +At this time, compiling with clang enables the CUTLASS SIMT GEMM kernels (sgemm, dgemm, hgemm, igemm) +but does not enable TensorCores. + +```bash +$ mkdir build && cd build + +$ cmake -DCUDA_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + +$ make test_unit -j +``` + + ## Using CUTLASS within other applications Applications should list [`/include`](/include) within their include paths. They must be @@ -143,10 +174,10 @@ int main() { ## Launching a GEMM kernel in CUDA -**Example:** launch a mixed-precision GEMM targeting Volta Tensor Cores. +**Example:** launch a mixed-precision GEMM targeting Turing Tensor Cores. ```c++ #include -#include +#include #include int main() { @@ -161,7 +192,7 @@ int main() { cutlass::layout::ColumnMajor, // LayoutOutput float, // ElementAccumulator cutlass::arch::OpClassTensorOp, // tag indicating Tensor Cores - cutlass::arch::Sm70 // tag indicating target GPU compute architecture + cutlass::arch::Sm75 // tag indicating target GPU compute architecture >; Gemm gemm_op; @@ -193,7 +224,7 @@ int main() { int lda = A.device_ref().stride(0); int ldb = B.device_ref().stride(0); int ldc = C.device_ref().stride(0); - int ldd = D.device_ref().stride(0); + int ldd = C.device_ref().stride(0); // // Launch GEMM on the device // @@ -372,9 +403,14 @@ To instantiate kernels of all tile sizes, data types, and alignment constraints, Several recipes are defined below for convenience. They may be combined as a comma-delimited list. -**Example.** All kernels for Volta and Turing architectures. +**Example.** All GEMM kernels targeting NVIDIA Ampere Tensor Cores. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=all +$ cmake .. -DCUTLASS_NVCC_ARCHS=80 -DCUTLASS_LIBRARY_KERNELS=tensorop*gemm +``` + +**Example.** All kernels for NVIDIA Volta, Turing, and Ampere architectures. +```bash +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=all ``` **Example.** All GEMM kernels targeting Turing Tensor Cores. @@ -384,17 +420,17 @@ $ cmake .. -DCUTLASS_NVCC_ARCHS=75 -DCUTLASS_LIBRARY_KERNELS=tensorop*gemm **Example.** All GEMM kernels with single-precision accumulation. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=s*gemm +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=s*gemm ``` **Example.** All kernels which expect A and B to be column-major. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=gemm*nn +$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=gemm*nn ``` **Example.** All planar complex GEMM variants. ```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS="70;75" -DCUTLASS_LIBRARY_KERNELS=planar_complex +$ cmake .. 
-DCUTLASS_NVCC_ARCHS="70;75;80" -DCUTLASS_LIBRARY_KERNELS=planar_complex ``` diff --git a/media/docs/terminology.md b/media/docs/terminology.md index 1ef0b383..07464143 100644 --- a/media/docs/terminology.md +++ b/media/docs/terminology.md @@ -74,7 +74,7 @@ contiguous and strided dimensions of a tile. # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/tile_iterator_concept.md b/media/docs/tile_iterator_concept.md index 4fd068f8..061ff907 100644 --- a/media/docs/tile_iterator_concept.md +++ b/media/docs/tile_iterator_concept.md @@ -466,7 +466,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. ``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/docs/utilities.md b/media/docs/utilities.md index e3d2a52c..b9ddc79a 100644 --- a/media/docs/utilities.md +++ b/media/docs/utilities.md @@ -111,8 +111,8 @@ std::cout << tensor.host_view() << std::endl; ## Device Allocations -To strictly allocate memory on the device using the smart pointers to manage allocation and deallocation, -use `cutlass::device_memory::allocation<>`. +To strictly allocate memory on the device using the smart pointer pattern to manage allocation and deallocation, +use `cutlass::DeviceAllocation<>`. **Example:** allocating an array in device memory. ```c++ @@ -128,7 +128,7 @@ int main() { size_t N = 1024; - cutlass::device_memory::allocation device_alloc(N); + cutlass::DeviceAllocation device_alloc(N); // Call a CUDA kernel passing device memory as a pointer argument kernel<<< grid, block >>>(alloc.get()); @@ -340,8 +340,9 @@ used throughout the unit tests. ```c++ #include #include -#include + #include +#include int main() { @@ -378,7 +379,7 @@ int main() { # Copyright -Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
``` Redistribution and use in source and binary forms, with or without modification, are permitted diff --git a/media/images/cutlass-performance-plot.png b/media/images/cutlass-performance-plot.png index 1d76a7e64ba3c0c1aac16286fd4282f62b0649ed..9caf0223492535a717ab4a2a1e4bed04a5bc74f4 100644 GIT binary patch [binary image data omitted: updated plot literal 69902, previous plot literal 98106]
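As a follow-up to the `utilities.md` hunk above, which switches the device-allocation example to `cutlass::DeviceAllocation<>`: a minimal, self-contained sketch of that pattern, assuming the type is provided by `cutlass/util/device_memory.h` and using an illustrative `scale_kernel` in place of the `kernel` referenced in the documentation.

```c++
#include <cuda_runtime.h>

#include "cutlass/util/device_memory.h"

// Illustrative kernel standing in for the "kernel" referenced in utilities.md.
__global__ void scale_kernel(float *ptr, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    ptr[i] *= 2.0f;
  }
}

int main() {

  size_t N = 1024;

  // Smart-pointer-style device allocation; the memory is released automatically
  // when device_alloc goes out of scope.
  cutlass::DeviceAllocation<float> device_alloc(N);

  dim3 block(256);
  dim3 grid(static_cast<unsigned>((N + block.x - 1) / block.x));

  // Pass the raw device pointer to the CUDA kernel, as in the documentation example.
  scale_kernel<<<grid, block>>>(device_alloc.get(), static_cast<int>(N));

  return cudaDeviceSynchronize() == cudaSuccess ? 0 : -1;
}
```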
ztH&c9oZ^tGW4Uvadzv(2G~MtaH&~x-EPZI?MMttHhl(O-Y;WuEiR&&Ixt*L{<10hNPg@S&|*kMU1>OE98S>) z$g926k=oJiP**%A{4T7m#n_T~xbk*tvE#1cPl}x+=2+}{NA2yrOo-Z)F~B8<(&hB6 zZ5VMupWO&e_bqx+5*nYG<+psDf%kUP!~yTlL4D5L4{JUPI$`5wB#27+s+olBO@G*M zFrx^C-l$4uKR&5?)|YUBH0Xb%T}v>kq4nF`V9oR=#kRHV>8N7X7()Enzd>@7Dcf$G z;kc*G)9NSj2YRok2VSrqO3Sh#+-v!;^Zv5v>v5hMQ96cTm*yCs&yAz!e&`F7vhy8h zr-;^0Y5pqXxjK5BBjHoRT3=QIF@?SQJn#I6k!Md)@gr#&lVVg+sE0Ny95CT$7B}X- zV<0^V_F{Wv;I*Qr?x&HYlshW=XMWD92lbpty33l*de{AQ91Mw+Nb$OdkB(Tp$tuqcJ6RS`XTv3{^2A01qwf>evt>b zU38h@GcT_*^*t9;>dZd=&e6+W`r|U6c0WR)=~1HMd3BMV=4Ql*SIhTj*Q$3S9j%z% z6$%4i5x)(|>#ZcQ`_e=8o3aL3NnhLDv$C5HR>0JpF?4Q=oH%M#5@mpMD6HKVVb<%;MSnCyFtc@f(F!1QW7QZji3FJ^CV%}DZt%0L`7_Yu7 zlYQmXCL6(Lj_by(yG&7(lHqO9TL$I5>Td-dP#{KF8QDCN2PpidOzG+@n|r%AXqbH5 z8R&m+_hW@4eff2G!}7WqHvH~2FW^*xQGPS$ho42JXAcD@x@L5jxl#5|yQ2M@qjX?R zZ39 zn(B^vI?KhTp-&h)cpurRUiUmN9r&m*<)-O;YO3L|9k*kglb$l{ToekI$uG~Hj?qpw z91Nm5ChFyDHYd3;C$E++acJ?-v3nGu`tV|G`>p~uviOrG1 zEdj$2UcEcKQS_FH>(rY4Z=Dd8!Xi;O~Hz&U+*8b(@9R~v&3itZ_u2uO1l`X~_*9Ok_>GJ24(dE|bw%Y=J za?xOEdo`{Tm9-}4j;GAt-ix+NtP(n8X+~hzch0LSt_I=l05k@0G1^8xIZX~`v7eFM z_%y_?=!i|+wXRx#hC6*+l+|hE_E->c3SlSq+#dJ}lOTVhj-~t^D%_FJ9m(1&ncWs?w?OislZ-AKChsMTPx=A)bbf^OQ3ZS!`XnZPQ8p z@zUFOzLLFaIVAlTgbMW?;vzNAqlDE#LDJ=-E&klhJ`?rO6mmf;0mTFmGoJaSn;yV^~JEY6mxsO-<4U2zr z-O0MBs5{B7`7uNt@8G!8?pn*|@w5KReJm z4`Zh9+H@H;%$tRKqDi{I8?_Q;-k4?hAfWMSc-@Z)sV(Ot zGR?`FQesO#d6KPz1UL5gi;pCQd=|s!_0+^E6K=8X4OQDhOXhsXyyyb9 zH@};m$u;Q^G(NAP9L&X3KzIgcxIOZBTr(ay3Id_NYS1J*z@1F5{6){*opfYn+f z5JwXYyGputBRo2(yH@#Z2fIIob&V|X6K@3>1gIar);!SCT^}`_eRozmo7U)~O z2(C3`FcWfmJMvs;8*Jfpk^=D#tv8HCB87eDa)eoA`grzYz!ARWDD}x^3t-(PP^VhX zKJn1SZe*oV-6)kg}Wy>tm@2{f<^cAz#AjZ@x_c8!Tj5hgGP{mgOF?P7jSd#OU5u5m)bhu;tK4jGj=W z-kF~Vx?WVrcsZbQ2fRc$sQd^l` zTt!>Jh;Z*GRG?4AUv4^Ex!Gfzafd*Q!{~UJM$!nBuQK+efY?32&MB|zdHJY!tLGO0 zghFJQ_6ocB9}NxS6BD+4p=ik|-0L6VFrIImy9>Kqc=~j7y-82cEb9h+a_f&c;>qlm z)!Cw(yG#XceLqhQTkqyHkXoZlQmlLCE~MJ|%M-b*jF44W8r774 zrD{_(FML(JU<*e7OQ&l{E4Ll|15l-NbE}GBK$sdeU(kCMwmuGd>gzqlsM1G&dnZPfeH) zwIUKir+NL_2)aw(&)llU|6F7aODUO^*LQ)e2YOkkJ;m4emYP?gXq&6Iv10U3`6xUc z!xbjLeb&458h&<%dAmwW-8nHKbWkeFG?7Z2+?26WCoyA}(!vGGw|me9V>DPO1sfG< z!u*U>u@<*C^nz-@Kz6l)YT|WOW37UiULN5`{=Y&y!6mG2u%AxUVFmpORW`#o$xB#N zcvu?@`y7>x;Sg%(q{nj?=BLk8IxAo{)uumJEg072)U?u*HH+u)<}v!HB*b)NfgW;Z z&!O6@YBr-d#Ju@fj4;(nu)(>yQ&}a5aEUA=;$Z8^GY!)aqC&4IyFNU-G%2GUAN85x zAXT+4wUC{9D{4dfquo7KWF_XsH5v|-c<;|QW~PXh|aTGOi&($N)ZVp=0o&e)+61dUQaue2tH9%mv6 zV^w>qU+MCBoPpu*;e;k?F6tGkk%dfMNTHGS+)c%@{p^VNSS~Tj4C~)JzYXR@tg%=)S#% zLX{xbzS}*g?$oPRh}ZdL_`ZFz2b>hOovB$q-C)LliOg1ZE z`Z}KHbc%W7^;gHQo72e6xMyZ?$XqUV=qQ$ov^N@I z9t4j|#YWE6ffK}Eb;uUr34vD=@`CQr7E%2( zCd^Mv>INtp6;1bDX8BxulWl2hA8Y^m1n(reM&2{(s#C8m73wf3GzOkLYF_JBM&s9$ z=R4$y%DS9lGfJy0F9m`3fq4X5jF^#1tibO2C(WF-p50)0Q8akLwP+s1L7ci}EeRXR zu2k_4PA{sDWSAVY>rh?iPG{dlxxQ9zT1KS88emuGI$jDhlrjYz$jKv|p1rYdovEFd zCq1qq>WYfc(2bt*%B~(f<4d1gshX$!sI2RG#&=i&!)ZbK6f!#tVVI6z$C1D4yRPpI zB6SlV2{*Lkmn#^xzf#&9Xyup;p;^?5h?gJ-R(L9T~eV3anTkbTUQbh6S8LywuF#%)u5IjDGae`^bSc|YR1(Z7ebld|Bj zYZT_?BPM6#U~IS&@LUx8Sj62t2!J4ICXqHrpe{=6HM%a^!cIFAaR zEZN!S|0q)Y^1Z*RBK&Nrv=%S_FWHQIJHGxFN*lCR{yF!ylK&r+OGR0vI4@IbKd@=-!#HzF z?RP3cZ@Jimhq@d;H@XEFUzc`gitW%$VtM}_2^_(Xb3PdS$c=7$n4+l5IZSDVYA)c| z$&n_C3YdaUT@H5oE-WxlvI5In?KpL9SY6L?hT=qT8Fv!L`E1vDwD#b8?_~@+3X{ar zq}W|e#RFOWOyWB+ZCT{sP&` zji2jsw&tSbA{grgXahL@xV0ec%b6@&wNZ;sFoByrYtgkIX0$vxu3<23LEX`d%qR$~3X3 zFn8;6cIa}9bv;3_e8+n)YF>_1?>j(ow9FoZz(yhFcQUm4_o8&3;BWo>yG^aSwIFbL z6+PXj_|oJ^^|v6{_=`med`*cwxb)T+iTGUV)@2tv7eNLP%_{Hghx=$wMVJ{O@!y`^ z{DE9r!osD&^`Ct6#-4)Aff4x>pg+U?n6r5~xm+y(&P12zN_mFAxkM

&o^;zdZ7 zXml7s+ts*@&^6V%2Eei`MR>2B$|O>?Nh?FM9xJwz+DvoXc1gO)5e_Lug!k&}HSGE- zbxnojM7A98n=t+&Vt^2lgJB0t#M;4aTQks4W*%{$BgSElM|0VKevei^-a=ues$}oU z#9jV1?4iR5l^c8RL0*pmy= za8jwXXP1Ky03CO_W@T=)s~f$4KC(33{xh9=j6{sAj`E;GV~BrRI>TQk0F@7WkliP7 zq?7?MS;A*fc5@ueD30!yO1Q|DhktFsP{cC5yW7E7MGu($e=&%kEG0J_fmYbNyZa9e zTrfQj6%zBmb_Gnh(E&6pF%ZM^l!&R3Hj4~4lY|*`Jw`Yk@}SteaI)M51~*k33KPxv z*#&HPD`J*4IJJYQ>P*@=Pjs_yHQxduN+$h=mWDzqKvZAzx9~X6gKnnDWPyYcTL_4?j@eQ_q+KU3-n(x7Dz0&l!+6N6#5vq0`3yjnc^ z0z5;P9I1fb{|4a#vLa0aUbJ44hc#vmGZV#NS|TAQ!SGDd0n$Vzr{y}SC?p9q8ghFQ zDB5YS17SB#)CA`Q;r&EntN`5<;Gw%p2==w&;_4+%(lqD=ZYa|Lr@ zt8jfXfth7eH(KqBTWDy6=$Xyzp>)lb{_ticPw?cyPA9`|_UyofCbPE%-xVpA$4+4w z%)qn1j1A+scl(9ePL8?DpB?`TW>w9Pg!G_L5X?@0lJ0kYaK$xQzGuC{M2gMwb; z*Z-iJR#joJkzh#s6Zmy~ciLD;%^^eGp%&HqQNuTk&v{TZ;7CbW`fJ;h#mkj8XD4Y- zmSrElcRViL;av~Hlv2Yb#A_bz(ohQmV{2Am!O12&?B?Kw1DLWC?~cLMR#*{7dR&i<)va;Rp0SiiY*n9{!9y1uh?lH(?6ZD~C_NE>vZSiNc+_ zm!-Er%HWo>(TMmOaTmQSXndl7R^ZKZ^p&o56v@hkscYICml=t?laq@*?S1AS&>y89 zv2})yuUF01$6C$gwxsCKPu)M~`JB=mhp8xye{E;7TIzxOVTp?2nfR+V?q%h_0}**M ziiAtUwwx;+#!X9D)Id;2BSHH1RXe_F6NoHt0SlUmufNvUEg%wcop_@Gg}#crr*Qi2 zPV+t=!~fErp}+>A>3eQq6lj~FJ9_sdtOxWL>`mA69&k5iZZjxFcnG_x;zQGtPA0x7 zfUuG01{E27kq0sHkq$030CQ!D7IsQAQ(5R;M$<8$rk$G}_lf9CiXZj13=&>Ew$YG4 z6-~g`813nvNIC0CcdkY**xRt4Zcnx@XV*ZrNa7kn=p_mx9YCTB9-4K1n){w+crM-JmZb@PTUbHmX-4Ip|3Xq9LOmBvDgU>6Uihq&FV2VKkU zIwkBEvpKl%`wQy>r)I)lW>bSOXh;Gahu@-$E^FPl$aABoA1Y!xi%dC*k)$SH0-7{= zuBgrx5yKr-F`2)W`fmmuPmD?eLSeQgg3w%L!oQXJ0fUKmn!KjHMTdEV!-G<10?ZU6 z>-_19e^Y`!y(q#Z4Z+tE6iQsZKMcT7Bty0l=TrO~n9~nsHzE(7#17RKiJ{3qZn1w2V3E!^e6Yq64coWmx?HW`@uVCOdO)7RthL=&a2B}yO-=YCDl z=kVsPmTs0wgc5UT61s3xKMG2RWNFLNsP97ZU+!rW1csvR&oi zn{p#se}=m!4 zy8|9L7iSU~Mz860gXs^#)Z!;2)@X@IpH?+lpaj3dh{836qP4;%MVSlJe3>v&Lshppf zmOLhLKX6l)j$=a(q5C_!-pJ3XWEo+#OUVLeU;GyEkRsQE;~#(I1R7$+_PUE zhn?xb#a97zJP#EvAop1a9iXTH{inf~+tEWa4}5BhL54)~aHFH#=pnK{;J|7Z=oi=A z{{bo%UKd2!{(79d8_-}*<5U5Gc`beQ^qHCMFH&LC|7C@qpa=r#pI+4#msps1h*%v} zdLJd_!XBSELM%O&L7B}-?JiXT@K(8 zy&{SQGrK#}9lC?pbjB%~Y~x(UcFCAQDaf3YdS_ArlgU{*6;uhN&qCr5O%kVS^X2KkYUaFZPRcUUA<*<34e?vCH2oO{A=PM?WZeYBy&iZsxdg%i z<=)g^kcHb7h53fXjv?T7Ug5)EyvkecKbg9q9^i$Y+ti)cX{ADG?O)pB+il4O!~w-+ z3XpYNx(P@iTP=U5qs$B?TYPTxqVg77UvFB-ue9~Y3wd3B7RQ47Y{mGg)0Sv;NtgAN zcA(;yxj-=j5R%;lAdyVJ^JGc84-cv^t)NCoHQ@8Cpzxn9D<5QD@w=AdD=-8VXM5^< z9Z4Q8?jZAo+!fR0XOByRnW%vSr8AGbBa@`!VGxWDP)tdkHTOX~@NI*9-_p??cughB zf~)?KbzcHl(|?Jq2UUj*@?<$By zs#h;{lD}p_3trcbY>8%r)+^{T$W-}mTXEr2B-80xs$T4_HWt)W{{xf!D;MNMNgp>6 z)jib^vt*HPinoKT)kA7%)c}!ocq#VDyi+(@JoGqFN!HPknasOA>;>!{pT|xSK_Nc? z8OlRIEcc7
  • mWm7rK@-)V&m@A;f2JmLFb-RPR8gOrlmkf5vnc{pgQCOt$4b!VM)FXp(}u^!>i{P#uh_ zy%r&whSyw3Z!MvI&L4T#f~mQK)DpEM7kL-lUMofDm^m+U1$_2M*ZX1M&l~8guux8( z$4VvM@~ZV%DR_RoUGRP=|8r7UG!FoKDUs@;R=CK9fMx{0%Er=5#jIUt|Gp(GvP%!2 z;H6zS=^&uz;MB0Gu|>QXzJ-dqg#v%SJ^(-dIS^TbsBeeOziv$n;{E3uOQY2Px8?D0 zSTgmd<@UCppPHuTa>W)D=C=!6t}LL){sP{;$OwX?*Gw4HJclBqA$C49%mcj)ltvOV zx%M!j4^4QJ;bp!kmsTbk5#h`5Q8JiElzK zB!gAps&7unP!0_D2&c&r7fv?Ac{4fK_LiF{A9a<5Dgyld4*UHE#+O`a@#?a(GJKaI z!s1=u=580EO}1p$*X08^diz8s6JX?9Z@oh^zuCasw|%u92$?(8;85K52b{#4kNo9j z@B~!|F+>?KPPc$S@MgI(IxSFohr=X;xhKdD{?IIzoZ$FKVf|fWX9i|mrE`Ix7v>J- zQGdWP{(+q>x!^OJ>8u5VNqWW$e;DmFp#)O}Lr24bI-)!T82dU4T0}S~IbIalQlft$ zioab({az_xgdmADF@A8q)k%V=218zeUs@Plc<%s%%$)zbO}}$?>2!hOYnNKG%eUQ3 z_K`PdmyQ_x0SLd!nPq6S6q1$>Ap9ATFHUzXE>%W>zf%Xiv&A>PgZGK&9|8Qne0WF4 zK->RVz$-@bl_>skf!Bff|9uPycp535nHWg&3}zMNHR8}jXvlTG3wm^(dcy?F>?W7@ z)|8EWyOcfpeLQ{>s}_&m#%|C@);9|9vPEbt=jVDqDtqd0`Ry$9&C9UL5J0lyPfzc6 zZhJHO`M<%!^n4U<+5E#x^3KP2fO)1&9_jB0(7`|i#?WLj z>c|rGnCx} zG<7+EDX%Xy5&7*{)eAjO)s0x%P&)kAC=OZfVDR|(bIJta3R=(qfrxkeB>UbI-wzY#7tdRz2MPAtuhfH@tbXk2%$t`sPR!n^QB6bIVcvv)Oo zI%b`_oGs`ylmZJ5B$3)DyME#C`~cC_Fyt>I^) zAs=`VN-B_XakvPUt&gL6ivsa54cYQoXMynsiE#Ze4KdrKuN^bAz&Ae@6lIR6j_rgt z56Hh^+xdf->yhFY;3~`%?BD>IzCbz(KoPMJYz#Co9mS0Sd%gvrv8{r|fAl#7+N1WhZE+PJjS0c`0rrqol1IUVZgRi~w0iuh2#3QzE-O+(Z>?Q)HdJLh* zNMAHHK+yWgxbs-}{8r~WBsApb$j?nm0GzP8)dI7vj*y%LWl?ITq+oHCjpE;V0Wy0u*Zy1H{!Xy&IQ!bov=sdP?OE>V$D1MPiyvQ*H26kOPdk7W9lqk-(4 zeP-%poI90TJ0KG>29_b4T;UH9qyYrzGlMY4Q_w8KjHtQSTUbIuNuQ}Yop>{|py^H@ z7^4Z4j?tX1G77Zkvh_K4Z^Ecg7alDNFrSWYF`aszDpj(A=x)GyEzpk`Aw_>-`r-Bb z2Q_SDIfGfD|CS5t=}Mqciq{+?JUHlnljz?N6aA{!56ok9f$x6pq$J)Yefi;64k(*7 zVa`ANQkUa{X-Rr(93l_lTYd>_(Wmxp{d@geWPYCRT=%MbuBw7vSh;NO$NX$VyDCB_*(^v%FNWFk z#ur78Fy2Bgv|u{2WkPx3hKsrsAC}qTypvP_*5t00aUtx6s<6pXssLP zVp@=VVR9OUC)Q96B3!V$M_J!gzu(0BDR{E$zVaZI23Yq+_CAG2Q4E6z4ZVlh|DH!_ zU2^ITR}X#*It9i7bie@AjH#zv02sZi?XHHtCm?a;?oQc5V4tW6(*jpg+|TY^pz!-z zq3XDNqbOoyNVIbpslfRocW!8a5D+v)|;f=pl zU4aKhGu8f1ru!ZSYmv8jf4lN`RnglI;ryB2+`HN7lBXr!f6pZr_^x;UAqn*di;;&i z=QsKC(nkzryL)Gg;O=ib@H=P!dUNt00YKt^UA_FJCi(m9<1hEe|FM97T;LxJ_y+|4 zi30zG!Nt_+-ACj8$pc<9&`Xl9e-feBli0tjj{eC6mpqXF2kG}4$a!`2PbT;WvwH7K zcHikN{Qs^P0=*pZNLzMNvk-71llb9kh>{$z(f69QE9V}HGJqo@?AJN0S~@NM*CrR* zoOg5NVH)vK3>Cmg%D%EIgFP`&55CcHftFO(_pvo4pZhyib$0ZO!;1m0t~g>5`=SHr zun6>Zge5}`5jCh}cUwg=_$H{OW#E#06nMsKr+9Vr%3B~_jjzv{I5KkR8x5a4rb=>@ zu&;oZsR7@`q?#P54T(bM0d8`N6%jr2^^$q;g(;D@M-twU@vn|vp9S4JNo2TGBL3nl ztBBO;^~n(PS(aZ`5%$NA&HgX!b)*?Qoy%pFje!4SDvx#%T_<}C7JLDlds)vJ=sUsY z{~l%A+6Nj!#YE|*=+FVnr?+)sPvG_jz6q>AU|T%=7pxiT!q5 zzx7>^+zt^M12!B`d2)Q%V17W-BBElCL~f8i$gN078r+}sHEBq+>&@#eBn>b1A); z(m@6SClJ>|8VqLcs2P~LfL2%M5|Wu2D@)n)$`YuGEY`0O6$}acCU+(ockypR{22(r zGxN31J#@SXXnkgAY7iZnybl8N%!q_u5R7V}YXAgcfr7Q5y*8VQfjI?qLp(+q%r3OV zDIe@jdYH@pZ3OP1YnYhk@>OR>mA*~5H))uTxYsMD$`*p3rqd1lzDK8}N*EmVm(8We z2~E^YCu6$sS#&PUDm-2xJ}|8oZ%{$tgey$Re3nU3cGf>^hT*`SUM5ghwJkDk6z3I$ zGqN>@Sq~jAK%EDaSV|!trn|a;Qqt8vD8o?%&Vg8+T(~E=9Wb!9!AT3atuQtM@DBC8 z^>m6FAL-*Oi?bvfn0-3br&6C z14pc^5cimX$aJ)!-pm=jKG+PSg#Ku`8v+9N^&+%iZ6VGNLj2m)YJw1c#+#uJfxxFk z?sL>!z)YZ%#0VrgsA0G+14be*k6MAMvC#s=$^=n$ICzji+HRbRh6kbf;0c}76J{K< z2A`_oy73e(1_03D8N<^<5RjF~<(BurK$a~_z7EuVsJ!^|Xa6ocm_hS4+?C0XW;p|O z;fmln*c~m#Y1Ff?Vh9J>eCb*zmYOODNy{J<-}4E~g$_o?)#VT|y7n{IK2&PU3N4GXy9x5HA4G za@+ih%#r}M)C?L6i9gWDG8-Wb2SZ$JJTw|?-A(KJ$0VgOfbY!Dr9M6G3eRzzK&QIk zCbeK&V>m8%Vg?7lN~&C`6UeRw$X;)~ATRnc>2PdlJI4tGo;*?hr%L=^z>0uMY(gB( zup-1V29#ll9gN_VV5K%qRkM?O!g$&p-(l-=*P z`*d8pB@Jk-%I8ob24j+s8HJ-d7*YrwaCjwnpVR76tSdDmZI^XjURbE(Rn&a9A^+)BY-k`?+8++9#Ree3Ab{ z68m;+xj*SZ(ZH@_bpaVme!w)$vRCcy3^+&gHtm}5-Js#MnO&xBQbhCUXeW 
za1i@&Y!k#&{mecZcO3uNf>l=0x3ndtT17dD6sj;m8MVJdNRfyYf9` z%;Tg#C%^ueuQQ*?IH==bnaT9`dqTzqr0)peP(XIx=UytGd*z(C0-Cr28o3^Q?({x! z2R3mBHW3hDA?-SG3kqKhjJ_Rs{<3TIrxO7Iqt$80aIH2G6EmW^JEmjoL{yZe@ao8R zKLG}sS*d?_Q8}>lR#iT)4oif}zcHpw^Y9mNx zm9VnRX2H=MAPfJOA*-XD)^jhn51##MLkeL!0=8rBRi9aAKrp0#VM!v${z<0mHqHd%ghHsY|nH6TgGY z49KYYC#rTaHQD9(xTfPR8&Bbv%&NYhHmemh`8KKCr8~Psb0st5%vJZ9Gdl5Atnp!| z5OYxVOfyIch55{bI?l;~^tJ+|77-HU3s_b)~?B1}>$N zT$0B*DkR-SVF!2FF&xcdw(9ixvXcq#2i7|pR-FE!M{bvvlvIix7o-e^<=m{Kw~0f6 zVG7E#8sT)-zEHl%)mA926aDAfA=_n`-g?$oV8`3&jXGY6c`=qrQqOT$KSle7Q^ z1gx9Um_aI`4ijK>(~oi)QNZyqQbd%)54@QVg;O;l?SxNWXrAEgY*8yVKtHf3L!;oQ|s?J1EBwB@r@>@6- z8pIju2QYf&O1xuta+Y~EtmsUbd6!0MKWaf3w8|)f|9p|5`_ITYlK{OGQlO<5&!r^8KS%JcUKyp29xGyb= zpcp+%A+=VjWIa32qQ+QZjlj(v2(n!3>fLB!x_SypTYdp!Ptfsk^I>M~dHER+=2c(; zvKW&>?~t7FVI2UJ1~9{fRFz5Ax7Nwnz0{u}AKF=a6Oh&ro!}h$&>0w9!o>_CUDQ8e zEJBbyT^YiM%-|UcA@X5`_+eQ%tPm{Vk^Lb}T00(>sX$OnlAebk$AVRJ=$$mZo?)cH zTvcIzA`E`M5Sd?p&NSR=rtj4x02m6FpvnvGW}kuZLZ^MIRw_wBjTp^AcG46L3p0;U z6Dh^bK`1~&9+sYl6%`TH-NXs45BBBbW{2|7fki7?sD~P|-1mPGHG8r7(@rG~*Q{ zys8%vo*AcR@uKAAJzWSStuRmbEfbH?AmHgu^DP!e!sJpdc8~}0iOY)}nOYdi9b_9M z08a(WKS`ip=7GSAHYQFgmnrF#0?lEPnUm@#Bt~`v%N?dNiWkYsS87MG8H0rp(}&>D z2QyJwHwYq17*`yr=|jVWM*W6+MYDl2@t(O(2L7zxM>#1wQ7{YfcW@y@YeWc&7J9|M zV?3H@jid0#a}U$O-tvG3DfYYDtBU`%T)I*+l~#YZsj%WIRv6_S-8+6RBP{(8;Ydju zyi_yWTl~VQjBnYHuHGwSg~di50#wGnMNc@d1LcRXyECxj?rQ;2 zg0$Zf=+WJmy#!3A(p?f!+ZV(Se3gDxgpwkvya5NPo1$%bxda<13l)-!%Jl#L&3~pj zH_r#D6z_4;Yu?d6y&2!SKKYT?w(jhrZ^XB}XNJoJaYOv;6-_M9QOWbhDl;OCX}L02 z4yb!T86s!|f2;^@e)4ODco9|LP(DuHBuJ3AOn_(3|5>yo5dZpLiSYin0ez)3@0njk zWE-YQv)xIG99g}iO$SN#EhjWyQaYJ5KE;wK4(fXdgUlB3MrE$}I|kF|7qD=~Y+4t4>8!tz^ zF*ESpkE|5m2|K$!p82nH3)Wu`mhvbWnX0Mflm0#T*Hb&Pw{2L|wSUR3$P>k{->wOm zv#ex6{(;ZSexvw*o%Jn4ko8u#-K3Uifn9r2wE;yO$4yrDsRi z<;K%mpg&pJl$Du!slwc~<%Eo@mKEA!F zs6Q-ybfItwra~Q-htB*p01p%+pKqVJbkGLNLPs|%;3pb0;P>q_mp;+yiJ6`-?};@% z;m9VuH`v(Bo65wN^+%_GiG3P5-k6UM6Ni&OI`92Idm?kV9Kco%iruLdx_SC_9(a$Mv|&Sp9d0%g4p zdpn|LpsT!TApX>*CJ;gn9iJ_auIAJBU$3)+4`}3;&2jF#T+=pRO$nlN`6+I)jwW&S z7XMQm*~;fk^tltR*~1O*c;?eQ=ND~LE-%=B=4swBa&)b7J|@E}wdiV!FT=0)jC-vx zJ)=1E2|(|T;pY3cCD`GqK!I1cs?;uxJaAv{0VpT=njtq^a?Y&1@2QzGi^xBPgl)W z3}h4O$~E030W$B7TQnw$Kr9D|i=Th%`Lj`Z{#9zQAb>%C-4U~r&3&7}!4`RuH3tQG zK;v?31yU|I_`S=xw5V$Ekw*R~i&y<~$S&^?W?qqhM@9BUWw$D4LB_|G4hJ#zs;{@P zEPymt)%Gf4%A!zzVtIQA2kX32n8_6@h@C5(-9Z*x=ZVabrBCA>uNW8zI=|tnov_#cJx}~u{}_K|TR4N=$Ew*d>b{Nj!VD=VX52PuQX6!UeLRr*7Y6$TE|EtnkoAkQz=sDRa2- z?rc?1ts(GxW=iW_8&Y04Jv(8jt1gq|mnfkE%9Zh2Re{Kcl-m1P;Q1t~VrkVieUH#s zQ&8%+k6(2xQ`JCfxmJB3l&T<(Zb2Ns+JC z3#GGrKqi2oyVFTWnK!r3TXmhSldw;3^BDeWX<%vxpViMR`-HRrR4achh-G-5@=fe< zWtD~ux8xiA;!}qcd$`7LQ}$KZtCqH0Vcx+K7Us`sqU$?SGV5E`a41ImQ?oGA;kL9d zDBI}C&lMWJ_i(Xjj~u6K)n#5Q>ds2K7(ngwOPPh~$sXJxs$6gA0bM1lxrF1?O}7F>$@zn`#ca#qCKG$koQG9?IREfgVcs zzvK|gm+AV_z^HhgP8PF`@|=f)PdJ8J(0^N?e_7-zGnFCVWhkb|Hk3_SDafUEqbY++ zMTe~-`*po>ap55U6L+bI6B}$uM?VRABBwDo654K_^?&&*7ibExl zHYub^;_+YYuqVn;lDMVp#-yeT0j>=qx0Tw@k1dy0RU27PrL_`Fs&|<#UnILVAezF| zs9QXr?&4%*ur?CDl{xgl^9OP>`G4<-=i~JR<6et7%xCnzH?WIi5YW{XRR=RwGf45* zHXaOR#D&|Pv$9tW>ZQCzBeO&9j4XTETL4r>gV~f+`E7w|+B>*NDc=*w9E+*X#7wJ= z;ZT~&*bx?0Ui691vJd5>hkBtIEG**N9}7AksC^t9{YGqVfbjvzP-#)dQ93bTM@tOH z&`Q^KS{NyYRqodU8tx-&!PN@BK`e4QUGVlZIZZ!Trau*QcMD0vY>45ymPY~u#>VV} zdz4fzv#C1hOi^Dn<`oGfiCQQp>?4;ow(e4@o<+UBk?>95P%}MVaShY+`T%IXRx2!q z%?C$~KuVxI-2_T-tgci%v?II{TMmqIvhaF!Oxw*l^h|p5x@R|O`amb07=Fl!#~^>> z^9xuLNz`YaU;#j8=OY#^YtE&%t;J*a#BG;VI}rnlO2M~k>BiciQecSf>XUZSP|@hv ze*datwvu*rN0PCv0D34HBgliyZ4Mo$;fF_wY_ypuT}#z5=iW*u-kYVp$z9tw18u4BC$4jXfs=R`my)iybU?a@USL0T>~3N=YD*J6(_ 
zSzbl;xjgz(_J?)GTHC?fNo7&&l~QY5vwnIeZ092;WxV497WU^H>>jT|XefhD1(KB3 zRbO|KvRG}~O-?|tYvFWI5$eKouCAGQhFS5usg{^3TB4g-(@73 z{7qb2M@tqO_L5T_>DtKs*r9>&s3Jh+j!KmWq2aekBUztqYHex6xLhhk8r~Q)frWvd8n$@^iB7d%WbkHfLnI8XolyP4$ox z47#+GSxya}3I_SWU^g(@^=M194tu%^;%*+Fm< zQ0)F&E`36H&LS{|;!_Wz=o>4(olUCfI}%9gUHVYz-;g`ko#7%q=kOWNEB1QU=5U=z zwewnl(>qa4>;Oi^fb^myauKO z?=Jn}VB^V~v9um$``C|@wxO%G)$u)PK64b2DTj0LJtu)*ZjWtI-fsejh})%loswA; zMeKW9A$c1dA~4_pXWRC*ON^{L&hQ^e)d6L>yf;yqJ=R0#??&U@1FlmUOzefUn|1sZcOF&aLmjPao7?vS-l>cCl$s4l7cnD>@bFL?o|oEqg@ z@i-UXSb9sr^J7%VZ*90%wx8SSBhl5h*pfhcSpX~F9iUfsfq=}QwVd8_0TmB&NqhNA zmP?TjX|$HLoWyehgBE|DYEzal^5?+4eDCGbkkrKb<*LNiw<~nbLFXmg7XTl7@b0dy zlmoF;N#&~gZyURg-E=A{{pKvIEqrcoc>p&-bSvv3*UR_&UKVsjfmT!n9pRAthc?V3I4-tsKzvJx;oiE(IYS*~ND zHJYw__u+FG*Z=gr@VO=WcJZ+Ak6;%BKbBj&marO=JY~rGjQP$+ zs|0DOUc?e$bmS%VqwvgKt5jzRrxzwMR41O=sYGe|Vnxo(Oh$3jt2QLgiy6>VQlsl74k;QeMuUi!`@ySNPp%6TWn$CDK{L(+Ny z$!7=SWNhAGcB2iYIb*@yF0jw6;RqIw^~%GEfAH$1YlNwc6`f+2>HgpJ7Wzc6_A~s| zakKpdBs>gy|Jh~1=mkFkDolV1 zrg)0cd(G%O)d^4m@wlTG4_IQBH-n>H(?poZNbC(TE$C=D`YwAoXvYjOOa$~G2ba-y zG!e!0+!=LZ%ldD(EUA^K1iYe4+`IwIsEDL{5trgq=cN`kdr0sr$k#$Ympm+PGXTzy z&6uwU;+H(i1=^@y_*B+uP;MP#m64Vlv}K1+L5{q4@7P1_qzPd*P2}&Rb|x?>5&%O~ zU7DLpJzg1Nmjzi8%;KF9II;@RTCecw|t`S&>VU(94s}Nr;F)Rp6}r`E4GcN z-JB*rY2n?acTRf$O7qsgtiUhzb@%^9|Bjras~yK7Roe$H$+NF*j0m#50;D|E*Tt#^nyG}XqGqGyN{os`MyQ7MRe(IY{ z{b}gq@w|r!z@@p{srfn5#WrOGZ1gHoD2F92fnE<(s6cWXxX-Bl9{p+D`4`e75sevJ z!+jG+FC(-lG$lfD$iSKNJaPt_`3_0{ufEDdY=t`WY}AnLoAehlisZGwc*v;N0(lWcvWrpAEC|ieOza}sSqsI}3DmB~KaD@&H;$Ht{0r*_Q_DjWiB6n%jld9ROL%gprE=Ps=Zd+w! zLQPYG?j8BzaRj!a$^m2-vT)1JM(=%mQXOj#2TP9$;9%Khgv_3OVxm_4b@Tie(t)ZE z`!o1G-8Q5aMLBaHfSH1voGCwpxu26AK=*DVc^k`>ibfxgA)nV$d~vyxoiq_+z8M?t zl<)1X{BMe~mP;@5^5Ht$brpvh0a~}f-fYj!&~@!X6q@XPM(nS;ns3~=>rur){ia&F z_aOcEho(ORrbize%N(pwubTXjYr zoKpWx`Nd{nr43biT(n%erFI94TSx|QMJ=x)w(g4OH-}I_OfRM5r*DF3YqAqC(+(o+!}Kd^I+52;SDjU&*`PpOIKO?#5dbK|aD zIXk-ia+X3{tzdUX@g=9a3eOgW{ygpl5q&aoWmijsb+}LPbn%WaxqfTKf&u_La#hOD zWO0OPyj)%&)3JCjKcpv-MDD&jR~-I`$Qmue(Dt$g!tRk|olZy>qN0!uWNyic|O6C6!CKaR%q` z6Js5MfCt}$rDot7*2eqc`N_^W?HfJ$Q3j>ZE>yqe_L%SgX{)Wi<%cAL*s0>9G;iM` zZo{{ffc5C}MmMRdmoTDc%B!yL)c&oF)iLG&Q-0(=$5fwlK>GE-Oygu?f3-?Q>M zd#F3-p{XUNNE5U%3q)k#`AFme61JjB!DVbCSlejzcz}Lh8x9XhW=NOz>;iDSm7o~J zH4L|>f+|l?(C-RiX3VF#Pn%)}%@zd1=!(LN-3>-_M za_WV~+|XqlAR{P$HkxmaCz(SR>h%8qhQ3aR;|nr$5sS`Q_dWsvLbFc}{(Hc|O2frE7-=|VU(qJcV?1fmc?Obrqj zqV0xbA7uJeD$ub81F=;3yY#zX0!RvV=k1EzcAXt#l{_i7&aQ736~O zi4>g16|&4?bcI;t+5-o{@MPLxfTd4K;_6qfaMC```3Ru?MQr17WNwsp1UN$FnU$ab ziT80LN5G*So~XVsc)sOCO{M=2jv%^4b(0F>Z`y9^BJ@w3;ZP3aPy#Cu2K|YYC=`kV zfO^yb2x&=B*6L2!!7(v1om#dNF*O`9051B1v#<~eKQRosZiWMA)a{F4#9ywynA8Ks zX;Hg5LJ-=}W?>(5lDG-oT?^?V`Af4^npq+Ry+LzfBAVm>_h=4~#2aK?ZT|NvaHKpd zie&S^8&G*@>21yYwvh8!d$n%=(gan$UoUz%)B+05__G)cC~CtTobjjO9GqW(tV`rf zR^L}$V8d1?(#R*tdlcakS+1cu-^ZVku8j*}7mfAIWE0&v!BvZM{`Ko%jk3fLBdC`F4BeOV5wPMnG4f4SKGTunS=s( zCxgOJM-H?9Q$U`!nqcs4JGWLD8{f^$x)|(JGE23}C_sg91ZhOUV?2 zxr=grSrz-GGDv&XQ`-1o9e?^b3VaCNdjTuIrxrA8F`fPTGi5dTHP@qFQ2~+kv4QK0 zpPjbRCSIk#XOK4L2h~j;YiLC1%z;VvDhWtFJ;pHh38pHv6#mlKMu);oKo}Dc2K;{b zd?eLqPcQ*t{I?Ls1OPn&K*PJ(@2(<8A66z%_X*T}BGh11Sc>_8KXEvjIGjuzPKFQk z|G%8btit#JHm68P1p)o^|p)(KuLL)ccc@T0OMV&PeyHa`Rd-W~n9E@+E zIyO~#$Y}>E>@2&i1oOt2dB^rs90n+SF%)7Gxen!2!n@(slOMV&@OWLY^2LHgwGhLcVn8(lNN=Znl{BiV zE)yeOWIy)Xjf2$ywGD-X@vnx_3oktbaROC43dITe%v?P%dO(o(Z5l`M1vKWi?n*)W zZ_Z-0v#z`_fZ^mwVPP144Kq;OazeK1ld+;!uk)zKt6cI~zMPD+n?zZK z+ZlG>;>QQZtqcwM@8!xN^0sNDOoUu3YCj2}mS%j8VoU;^8;#GYI*g^-x?+Hsr6YKZ z8bESO0VD?lc!1D7XeM-*2br)KaS|nv5O>#JMb%{>XbjG{Kjq!^_9~;CX`)s(QTq<< zivqw`rjaZ21DsuFpr38Xs_UL&uVU*lXb4dPgk0f}tn43=0J}uP^iFTJXvZ46=8G2q 
z&8aah0HfNW1bKO?=RD;0mK%OdYNALtenWZ%WxkYY6u)MmPP);5a~aE>x6zWQolXdmA2tbQe+djc2?z6WMHT>QDi@|P8w_;4CDAqoT?Ut8 zB~Su7G{pOc(N{WZVBB%ienO}D*@^U_XZOei;$zs2mKIOa)-#9-B$~7iCZrsP+C6= z0u_fkQVdnY#xb;^>{V}&YgMT=Gy=Q1DT<^{tF1s4!r)|&j4DFN_xd|X37YRAmT=5s zXYP7jGYo?!+Qv1(g#Ne;$^nGdP41(E@r+qa-N{xRMqosPnSqN-PvY+97!KPqiGbY8 zTt;m_)B;VuC)MC-(h~5&^5LSNRT360LtRZ4r=Sf-VGF^ah*hAKRv@=>S6~?gX3{l% z1OUDx0stW6Z2`^O)9(Z~W1vWo%j3?ZQ|}J|*x6ey3U9&NXxs8z&@TZ-uqB=gF|xG` zzrB34f(@huhGBfN;V7W|4toZ~GBk&VdCDOC1HnlL)jkNQ8pGp(cJTeACL>aVM}a7V zDhI!PWC@cx>RWEk0*qup+21)r_SRNsxo(0H$$<<@DnCgbS;kHTpu4M3fkEUakXrSP zE%dviwI)(fWh5ma$6H{`nb<}>LJcQzOb$cW?8<^k{JLu0672;@{Nna{tbqz?T|`h5 zxz1PtQv=onJoCfmVC^cy>VX)*1(~q{p+oEHU^+CQ>%-a+0MJij7Ld9?^-1KuCYYNq zQb=5TuzJvVk&{jZbMccT7wAtAVl`^O!366+#DJ$j;k9(C0#<^cXxfm$NXm%g!IRyu zd*Ted8(2<%O~^hBNaw(12q8&{!=(O-!XI3UH-If9`Jy9RnGN)IK>Xlmwek{S2nVIT zZJo`FV;qhRYgce8HD8KfDCydL>M9Bc8>zgoKIzmZqs*v<^pET>1mJ?`jpa?TB`R=; zYRt;v)p~=F<xgd@MEm2BcsW+yjt_22M4=$NPx=WWFRErCHmk4v8CXo z`!@{Y(Qq5!8zaQxnGj}|#w&p^)lUUwZNx$lIl3E9FZ>D@Ge6p{w^290O-yH94;usm zA2|Uy_vO-=b<}y8nIZfX8;Gckz;_6(*&BQ~f&u;|+nJc|`2{DVr=wN5xjs36C9#m! z&79_qKYQD)F2sSI`x%}6nJVa}7B$xJbYNPygAX(2YnIwgk@|XhyfE~S(_*WwOm>cR zMlI-8cE{nNbV2>h4b>f5m`5BYd0jdV5h)M_b`zk0l*UcPPE-FHhO!I_9UY*?`Naw( zJx7myxr~Dr;6C+HuE*G6&`-w^Va|DH9Kpgi@e`efJk)4Q8m+B78dLt8nZDqhX+3EY z)$y~BS8pASnEozeck#rm|KSDpU###6Q~$$jO_OQ;0c-&_bfEEQsLCdO+T&dD zr*WxbI+fBWJL|TXJ_sWl{#BL_HMv7HlD@pnR&ChP97=Q#ibgm<0 zr?7b(LcCzqV$o8COWC8?yeXa5=zVc8F1V`63=95~zMI1R>sKzz33?vBb|?v8yi3d* zjA*NB;Av*|RmQ%pJXE4li|N9qFn7GPEJzqR_IZHhp_?u&!3fU``nvv_VXqQwaS7#1 zB2y}wK(^XZjtVr=9B~4CxJ5|$=r~#cWhTv5HQp_!d^ptDoA^NzlO%n7X=6c?3Hq95c?&{xUyB84Pec_J_R|p zCGUD4v6H!*HDeoD(!(t&)McPgV)kUvABT+@?HJu)=*+Y=~=9Hga<1g*U$ z@BlZuQQA=E%cM=6a&>#cfGx++D>K7$rfbRTY;~D`d^s#t`Yj|h3X#}&RBb&m3Q|=) zZCJL_Tdx%=qY%Ia7VE|7(*R00nnkDNl}#lAqMD`TGONG-`R8!z?wr+vQ4+3N1zS^> z>_fdAVC*BYi57Sd*Kh!a#k4FSCJIm>L~|Fz?niV`g+`i%k}JSIKxLtPyzz#;1qQ&< zi2?iwuDj!frPVP|4`*Xg{dG_;FQT=2fDdfvP%08&f>mP*ifNMQHS}SIM{1@QYEb-C zcayFY6qGiZ#hryl5nP|8xHDPJ?lmk2a?4aPg7rXp&QpP&GmO}vbAq_K_N zusSc^SZ52+Dp>5y$jHIA(dYw{(x@`7PGo`H`k;zn zd{0YJ0opF40FFQ~iV_&v5*qVyYzr~m{$PaKcB70HU@^CK&Q`%d-ngd)NFc+L1y}iK z0nEH}iW?lIz1zJE#^k8ayp6jp%DBTVUkq#B_HoGoDml!&gGd59;{-6Mm^Ikz8azzr zJdBY+qRWO8gOn;0_ERoKRCry>3*a8D7Yx!MC0^YlY9qv5)dLM(Dx&SorU=n~X7z7= zCQQ3y>-7M%f)HBk$#pLG2d)QBCB zWOb!8-WMYIC30QKp+%*@*0wkqdcoQmY)=Y#&3}JNVqGp5`ysitB32m7!J-4>40UG= z1WK_pr8!%h9e|6wu65ouS^}QjdbEsci~V|`u%r?{UQkAz6ajihhC&;F(samiy7msi1q2WrrKh6z zPzxYI`=NVPATgdPcHnju0`x&sKWq=gR>VzF2lT-8Kzi^=)L2+TnGk0d^uCr7?Wza% zcg>b!f`=*ki!dZg)x`@B*G}~eADCltek2pzWRnezo54?NHUCINR(_`dHPM``CsKRit z^u%S{sCV3n%eaZlxc|{*+(h_;`8LQ>Dc5{oWAZB_7h4mc*#u~2&i6fm6iy(86Y%H+ zJo<+s9HUW(i4)m>eFdNYQ)3~KS^Eq^|+9{OfKMr&d;Fz8tbFfkjha=H5KLW9h{_^Jp!Czq|&ni08Sp+>wRoc9ipgSFJ=R0=6k*cKpixdPVq zG8zbEM3oVNQhbbiQr;a+qQU8vE^DF0rk0o-BQEOs=X^j8*PjqQ&tXYOgN|VsXs>tr zmXFp6AmvN59%w!rLBd!NeqrYK)iF%@5eL};7CAwajNhcZ3tBEuXyT32TpDU&=Swa=OD zk=^S)TU7)>Fv;RUV+>rMCt3$6j4--`#~B!9=;M9>BrHMFa|*SOT-L%Gq~L)h34vV@ z@)H0uG7TJveFmlGBp9A~L{|XR%UA&};GyK4Mx`yl9t#4DPQo%o_K~f(!odl0sT!k> zg`hi~GAOkpAjvfu2PT{38IK!T8x|Km$Tcb;a#3?KI_Ld^H~nKl+OMTaj^0n2i2C_J zm^@>VaK(>Z6*jR-MH%77O`5ki8G_GDxqZpK3Iw9k$P5oH7^1$R zjD2=ENw71D2}5k_1=>XjqLeMqz{QAxSup#M?5s{ip*=t{b^RNRLBkn1bqGVys399e z0VNEPaSfzDfq8hUK`8AkNW{rDv?1!G`WtDW1Zb8oxc0)yT`R0uraYV&bO{U&R-lHs zCU_X9y$wUIPR0j>h08J^`XHmWBDfd=egx$Jz!9XP`)|RV#`KXaoC{o~aM1(pWe7A} zDDd+sSeG$a%VC3mKw??4LOZxUhE)(PHjaz-x3Iv0U_gc&tsIQi zNPFv@Mg=a=V}k~5-vK>igEYM`w3scAWDmi7Q28=Ng*3?>%ERhJR2zrUUwRBk&k8c3 z!a}XX>aDKj zvVtVaw}X#>4lI4hYz6Dlg>d13{0S@@()H~+%7zC6PS=a%1`eS&>-PcmTY6l-D`E&I 
zFjBY$c$5y%seuC;(%p9kcq8COQ;k3%*p$HcsSC90_Wl!w`Syu}>fUd_yn&-OKe;DP zNLGK?^P=TDNl$X=;IK&q@Q+Vpu)FL!HiwD*fvn18q?0t?G#-?V@A!ZXbf8aXVK=nY zt99@5QaZYcqL3MK?U?f(E`F8)w)n5Q)+aeGN~=wUk{!B&@SpsW*f2h+D4p){Eq_?( z`VzQ2ALh{fRW#G{FEYLRI2l(xF0=ZUO58SA)a3I1$tp^c@knfMy*PX4c4;lH$bPs!sE*|;3Qa4YR; z`6c#E%(3s7*sbJ7lw=yo{oos_kH?Obf!N4OwE(;fi8@Frr>V^$z3yE1riQoq*tYC%pHB6s8$$pV+4-_UVazdN`zQ^iKMJQ<41v zxJCw|M*>aZ3svoG8Kg0%bSglO&;sc|VFrNmq!Dd$QAL%9OaDMu9kHm(YEjztaomJp z|99YREkY35yqO%Z3%g9!D9PkydJ+3~3-YjgnBgl!o~)n}#)q^)sY`6CxoD#?X9jW>LfQ{V*+ zYiN8JUy_hOm(@v56&~U)llf;c$I)Ed-vTfV*2UZ%N0qVXR1N5Lai;C^&%)LIg3mbc zV76*^3(AjigQ6Me5}e8&yIj?upfoQ{g8p}dwh+q0CbSWXg21bL#E&F4Ad_3b-a=O zFgJtQh>`uJ4Q_*fgbazm5@? z{NzE;;tzOoTiilrUOuhq;xz(d&Np?y^LT#C2fyA+;}XF?d+YEh0xy@T-g$_SVNFz& z9NENHY&3b(V>I#{!hjvCd{O!lr)w-WT^KB2l{Wrsuu*53*QHSuC1o_G?onoV^vl&vKDM%j zXQ<-(!Wlzf2#NCqad7iMV?dhB$2C|zQQO^`zD6M{FKu|4sW{t_F8;8W>&a~&e;vtP zr}6UK{FSp+5zP;AL;cqeb^|%bDv#O{jj^cM)W5rS#fFqX07edcv6H+{5uTMmc=AJ9 z`Xmoh!!MlWU{F(1sXZq>=v6z({@_zyJ!sk<=NmN{Zf(1uNjTIgBzB}bBj4rKr*UgG zf0yPRa{K1&>tLj7*oym}m}1SN&X#QgBDM~E?b%|RCQoiqCyvxnBVOu*I2ztsbvny6OwjWZ-M!V+^&ETEQ!x3| z8<`Jup2D`5ZKZ_+0p{0<4;3JN&UwL%e({9Nb0*Rlw zsTk>Qv4rLw1FXcU&v!Z8a1MQqRiyG_O_}%8x_Qmge-t<7PuWzVyE!MicNy48dY`$* zQL@fl#rwfZB!h)*da?jDE43Y!Mip0vE$aKgcw)sdsJUSW{MJXU1t!=m^^NbT49!*L`k#XZRO_?yB?k|g{!WvCXgLe_rPG$k8M1_ ztzoO$uQ$Z}od0Z64l9RPR2k;*C~SD@Z$$CuI^QTKFI@^xV{+4ur>&TC@gS40Hy{kHHJ98re&{`3(p4zJLt8bU`>^K8G7u#fLSfy|;yVvjjM2Mz$Len!mD zmS?-*^;kKe%$`$kgUq?js*gAct+Vb z;~GvfTg%2mZEMQdO4}z=`aU&(cmGuc_vpwGf9GG zQr%iJ4{Xdh(o*ysoQ0i$r`MJ)LTY@&DufSy>oh&h1wDkGwphQGWy4R8l>IkP|L<(c z>feu4m1vnvL^5uiFA##xcgT;=D}frA=jDjC@Fu&7x~H4XuKqO5yW}Qh#tW02+U73@ zYpO%uH#_OT42K)_JU5xUuOs^Qusi=`&eZ@+_`vibT=p*J(YWbMGHMaBk;{c|X6_<3oS@{?;kCZU6bQ z$D3~{QRwiuKKSY7Pv5!Hm=&B8qaFB%BIS!Ssw=)8a#hc%P3wd&Kl7CQdUmScstlHc z9bOpmT4U~=rcCjQpts-OOHYZK4*t}*bzsqVto+(ft8y;}zQm6J-`sZMxNkW*_TTCa zA8gV-zM2-Jjjpa!UEf0$(X=+!^U&Oi8ADS0%v}iuc^NBd9<#v;B zEAdyj>`(U4HpzYSmmDp+;-!6(xd;9%PBY^pp>Oc+J^Za1Zua>xY4`IPaG9GrRz4y$ ze70|v`^n&B8^7{R^2UcKSIInq;nMxNWb0pKSC=ha7J7tx2J~f9M!Y_VauK(x7@`V8>HQ^O^nrvm*_WPmLZ9NIgrl0Nq&%KB6cPYhn zc~_qKO(F-U|9mQ-sthAcJX+)we*K3~KJ;Yd*}Nlf7I3Rd4?}+p{m|U*{@#~!Mc@g~ z?Ds(XwXM|_lf+3Mo~2{eQ~d;FP4v!k{r=4NcL&bEqtB-9GQ9bD-T#5g|J9;sYL4DH zYnEzGe01^259O-0y5P^-5`I&*Kz$T2ktwweo;Nb5pqb`LT?QugDr8!=Z_x`?M9BO& zDO(-8;K4d^HegHK3}`s5KIC3PjFXe{7M=CV146^BN@)IW-d!l5{L&x#E&U9zN%R-Q z4;5*p z29ghQ@sUjtLA&{pQ=hbM`|WlU#%RzosC~N2UkTxgvXAae1xrN#*!$*u*TZvu*Cu3w zpthVJT8P*;oprANQ`om0(@jll+mZ*CEd8=qi0Qld4^YVZ>t7c7A*K=#Il%OyWVT2<>>r=Fn>+Qe8^H?IfA(YkE(w$x#I*fqv~1Lzli%c z7!3bk`WF_!?X@m)9qG)u{(pcKT=r3iFJd-!m%Y-E&ddP)1ViNVMcn6zo!shKkc9Zv zd)(YZZPTGyfB9taIk+>RN;$GNekD=Tni3bxqlt?Rp>xn#Zg}i}_%e-W*;UKhp!OL2JN#Fg`$?L#(`8 zU{U9xwm@j;e70tMh3rK2%#n^<|AKf7GTkHhw;-N_4eU&r9`zN4wrFd{F3=P9`Jo>W z>&hbenEcU__c=3;BwYhGi~ExJIWjyxgD%K4|9-tN296u#XyZZS^}XeYaWPg|y20)# z!+YbSpQo0|yPCibnnhapuKo2G?bj6+uES?=yJ=A_+Y`h;ZhZr|R$R+2c=duJPqTk` zgxDPTn_&DX_awTlko$_+b~k?{iwq5bQbXy-)oFwnP1Ifr(qT zefjU<-VghjS@}z)cVF2B{2L=Y_Xs?;4i=sR{Bd}#bY%!TuZ!;A%N5To;AC$-T;yZW zsYTuy4V*sPTHr~nzqt?5{T3O~XF0e!Un=TAR0DwFpg{FZ#XSK znYkLa4pPV6Kl2i#b-3>TN!vk@i!efTvGUFSh%Ir6h{N+Sk4o^y>Ri7W-%>7)iIaL(i6^!^I(DGB9SDrdG6W&(;a>dIAS>%3#(kv(BdGWEjZOb}j<}1W5&;TtC zY6LUXQMDk6fA3C{=Le{*8K2ORIlSwjQg>eQ6iBc&h!R52pn2Cw;NRg)`;5*S~g@Pgy(_9IB-IC#mn{ zzSKUKGv%kGsrEPC+RdNTe{bx)1vaiGUZnnw&UZL>)4w31e*7>gX7WthRMNEQ6|iH6 z{6FHGmxeaqWrMdeE=S0w>K`mv6eaoR!Q1Gn^Vb@ELJBC)7MutWxnK68uk%0~4I=9k z&{n|y89dum7F;Lr%kSmV(GbZ;Gt%@Y==G)m(f0>H!xkSkN zV*P7x{`SvRabdJ!Xi z2|Np~W@MC;&N=@axKM>|Dl%kmM^|a$h#hZ?Y>0guuGwMywIP+5%a1njNOtdC`fdB! 
z;uROkKgHhEuEZEt2Rys8f`k`qnpUgcz>$>ua_Db7a(dYlTBi?f_7yWNp>IHvM z`fo!g;upNbVJ-Bw6MFegGv0fq|7{wzAoRZO%G|Bz%HHFIXWo%IQ<+iGdE1Qie^7Tm z?yeC=1-+m2eV3D5mOJEnYKLQ~OB=K3YQ=N{Uf&~(pIYW{(z%(@|ClQ4(aSb!D?C** zrk-f?E*7r{RwFQHQTQQej|6ko(`#aP9|L^dr?#i8; z94d#ELuD*C9h|Kk5|y)@$teVICi%n`VnDjR z_-Xz-H!jCFH#8W1Qe)Tr!wlt$sn_pCNH5@#fl^MchfKdrcoF#!hA^r)c2p)PjkNdU zEcaO^3s$+rYEbXqbC3KKkkJ8?%T-&Tsq0Aamb`f?{5M8zCi#tH_2NjAStsohJu`H& zzC~-yP{KO_acSST3~JR+WuL>VUr^9htuio=2#6El$S&o*eS6aCuNQrmr|&oFs7_x? zZR~mUr}iI9DWn;phOPO71ow`q**+`utxYI z#AMKI-<8o1m3M~};Dy}no)md-Z-L__>ButLU23Q((^+|^)LV|GmLdDtwfx!pJoyd1 zOqoE{q|ZPYXgKq=4e#_^R4v2QzfCV!c&krX2+?W5kS)mQ-KDRbs?b%9A%=9llr9xP z+7pb3sQ5rzBA~}WPZwVdR^7Gnj?3Ee*Nd5SH`hBbw3=Ga7w~{Z^eynMJ)V@JE{VaK z&2qsDE~ka*ru8C;%!kLW-irEUlkBa0r{2sk+w%Aohu&(5=1NrEOTkE>vVd635>ei1 z%fksst7C*W37~MpaUn7nU7U7eO|AiN*=f*cq^Of$6k9|PF`_V3UMk zbMd%I`h9WbN*p!z+Biscr7wLxb03k4sJt`YC^v_3}3bVOS* z^ZBjPv|kSkV`I5bP$f5m$vy|BWCO{|L_s*$AEGM&1)^P>6}I4daKEn9O?80FpT!p}K>Axy3DS-6G zg>)(i%q?dRZb029%Z$S4j6sIy=@p-rhDmf04$ivjmtwnDJjkHZ#wdAW)GsKlmP&H7 zDiAu}Pp#gUn&Utke`qr8fh2s6yU27k9XBfb#<92EvSs>xb{-e|(it5Rp7`N3>dZ84 zH@ra4uPBzxgNv)7i+wgzitoK#UZ9LwjGQV=ugj?=&LO4g4Ll*f3zO=%Sy4zNAART> zMq*k|$#MPjn{Q6A0i4EKT_HU@P?&hjV&jHdqRhj0YZ;g^^aX29V!c-S^rzz~CPE&c zc*8>W2$+*O|FgbeQ%}=TRAIl`^2;|EyL{37!6!1Da}_^fKks!hOK_~wsN2_0-+n1` zY*Am4;S8Bf$1bE9P0pwhCTA9zrmLu>IXh6D9p(hVx`Z7TcMU6Uu{mMK?=8LQk-7`4 zH7+Av6Y;Q)VGjLd3-4|}&z<^;G2xhdT|H+XA|m!A;a*XJcqc#&wqsf!eA4{HknNDS z0nBCO51cVor7;pYBgSbW^jZG8KyTQhI^CT(>Wyt&iGd*DPhPrrs~vDS3RJ>gAmwB> z*&Wd=vg4z-xiMwD4j-yfz7WwwH4koa347r-Ez}&E{gig>hK-2!e8qpQMxsO8R~$M$os3p zTNpV-O3gi2*tWby_jRW( z80l-&F&z6H_9J`0vv(aMJe?9AER6tztkGv3X4AEa!S71Jw!TVwD1-O5Jikm&O8#7l z-lx-X&b}8iWIW<$sxRkf#mXec_~q46-Gv=Xh0oohWWe2wlM5tsI&fW7@tOOM;&;I1 z24RY)kO!nlTP$DB575(39yUgDyU~{{8lC6%Oai?gASAr30gwgt5MoJU!?YuFI?LVqHul}ic1 z#Mi0m%6O7T03$t6KAIkM{A(YWs=DjA-Uo8?Qeg$p!WX&LqJJGdqbI5dJWop{XKuTo ze~s>t2-PKlEX^^552j8VoLQA=D{qEtHJ=JW$o20KPdWFt<1 zf*bdLT%%y3^L~Jr5&xqd$E*?(agU|9?(Y5U;?F78p9Q6(%GMdUbUckyLsryy8h#Cw zC+doi*QA?i)>3?mi1fW%hhLO~KN4BuoTg)cogLYs(IP7RsX^xYGWfmio-X~?P-z) zJrPG$r+bARElJT#D;uAnZqRxyVtxEk$1cGXE`=shhAbqp*QUGlBI>FDLS48q9ZHdO zO!l2yzQS~*D}s$JH3Xmom!AYkQg_|WlapenNm!P$Rg%a_9ZyQ2wSde#MsTN$jGis; zTE;$@rzmKS3PzqA->D+3;9aAT7cH5pooC2RAI+Kl_#U#Y2iW$nHPKd}i8{75vIMnc zAiXE34O8OpCwilsmX#Cc-ymA5#l>;|##zYo-{DomI>q%{OI z-x=+xiV@(9b$~X@^G=i^KOu+8I$bto?(`HHOU%6bw1Bd$#-8mSLry-@QNJr`s-pxf zn@!&7dpp>iMn~4$f$aTebOP-Gf=1%>z-@hl4}-jBUwSjyRlo>=c2<~WPh3#TCoBTe zqE2Z$*z7`C01Oso0Jtj&miwzSnU&COPrfL%CeG&kJ@)|8nRB)Uub=j!35Bq_y|mS! 
z`#g4j7i~rV>?l8UJ}DM|CKT5)b?zVciW;S zgyen>rCbM?!arp30X2`&-v81_dkNbI2ATR$>sU>b+vc%-e+V>-OxXjSQqf^}u-GdN zsa~!mUbOUAdx@-`Y(mOlj-3CjIb@TTUA{%GW1~h9o(F`ITJ{IT46^-I4$w!SlAe*5 zzp*UjS`Nu-?II9hiv&idkOV93c3gE8^^UtC7~TNs6<1IAHa4tAQI^^|j`^QQ=lJI$ zQ4Ee{NDXgB|FRRROrpK90>q=sTkxMb5dFlq5hs z7{C#&*qDCjb$A0A#__DTP9Nde&KaedaOeWKwMpLAF6~JxN6*g_2JL8MiO4rao?LvXeZ%TvLxIRjc9#Z#^YwBH4q4aOPzj;&?$ne-7nXKx6r zQl0^HO6;#<*S7BaV*|hLnR*M%LG83U|{ZRseauTh`?#9xqMO~%BLvey^VB$lP@=QE|h6ws(>)d zA$Ko(!+37vD;tJx7X8yc8vM)a-6}2QbR}@y zJ>F9*Yc$IUUeOi`_hP2noM7X0Yo&di&HyQnnYLoEvk!d(TI*w7?8k=iQbL6QA?7X_uk%1i7z z3#hy<3J;*Mt=*Hvuj)kI@UQ22(D}iy)a}K`P zadBygB-?a#j;i)x>sO^;V{7U>1`pwi(XhO_L*w~iwo;r0;%PRFPEgcd>U4yf{O~lE ztXwkI9>;f)U)FrMu}1*1ZQ$;(YK-ytjKk<=psK$LIcHrg(yHHMQ|@ zYk0aH<>^H2r}8b<|Y&wN~ew-MM^o8RaMM|dF}sn zk9t-9FtxTnAmMA>+ANl9BfY$tO8kna%f^I`w)*+_fbQjr?3oWQt|#!=TB|f5f`p`Ht1P{(C#Y z|2&>w7lzyAJX4%%9y^;T(2N!TXze$MWU;4BGvs=HMz%Y?$u)5o;^26%e8@cx>d9vZ zaqcP(DV?b}tsppgIn{qi0*!Dg>D3#<1{`PvEvv3+c95@!HzdMmJiTOlkqKh%K16wk4YM{NlH5Q-%Ty`iIWZgdB90oAp7Pmh=tKfz_Sx$C^s55~ zT;%7JA=ehg?MGo&+I&LS7O!Vj!My~doU{L5O}4fFlw14rd-=ik?dwM;z&%_)y#oW@ z@%!gb8^AIAe%e$8gmeA$n<(H%e*gUQmhA{W7gQyq#W|g5cw_Y{ NV?$GeD!t3m{|7I_AKm}} diff --git a/media/images/gemm-hierarchy-with-epilogue-no-labels.png b/media/images/gemm-hierarchy-with-epilogue-no-labels.png index 59bc99fb900b60d49b2f60311cf91b23af36f292..b87e8e2ecb17304fa47fc19a90a56a071d395652 100644 GIT binary patch literal 184294 zcmeFacU+X$x;8w9sK)}th=L#xH42D;NR@67iS#BYWk8T#MtWy3MntO0(5oQ5N$*9a zC{=m~1&3awcjmpH0o0vM_Pf9D+vlI}ob~$!hVdz@-1WMzwemzxMx698^k25;DF+maB;vmgz&ED`wqoGdkCwmPRQfMyPXBe!9sb^DsUZFvDy`x481j$$*QBqZ zP#L~QHtz0+zYpF2{f;FHb%q!DZ=YBIyB2(T%1Z3El_EyZ%J!~>F3Q%{metVA*iz^2 zeO*?Jg?{L?05u9lxPSBdH6=UrWFP7MaO-fM4NaND+1Wq7AT@pPhn&RCP^AQSj~|`x z?C&_SWS*1uac;7vYMoAtQ&(7{h=&Jf2^re!@kMqI$FX+ z!c^i}X=hw|>kY|A-1Zg?tLaR$1|gj1M?O0VaRiR6m&nB5U+mFb2lYqyqj*!eO*(%{cb+oV=}6NU zBSVR}OyVar2@dKuUApk+9!dm7Labsg+ zPEO93@CG9j6R&`P0Cnz#ANG1O?zxl$B0GDwkK~v4z6-hf@_#{}{;oaz-y8pTa{f*@ z{-;ZhNY~1wzA#OPhHYPM+QYkGy!L7yy`cje!$I7p-7{&&Wd+zvz%rD}ZLXIQ9G#{u zBOSY}FCQ)0>qeBwvz}6CN0Y8hysZi}hl41TtBS`9$~`Ye)ie}ZPYga2r+csAE4*b_ zcm##Q2L1iTrg?oQDK)I*rwcdcYFjk47|~9%mAn%vSuVEDg!KX08B}@ay}ua}YB0~a z107h)^`~)fzH^4d;`o3@k?qWM5jjdH@b6yfP6szsBu7Wj>_^qO|J_wE#A{Rz`}Q6p z3Pttj|DYRdlrb2L)8}*aPdq#h2iD>(3e;HxjVsb5Sqpw$n(hmY;QQ-5xn{ii0P7A- zmx+IFuF2Phv-Ha&wwL^1=C5qc3lliDHW~@No!i%ZH_2Y?ZfN44jFPg#o7vAm#>he^ zJ8NxA=fnTWak^%d@4h85FOTOu3dJiRVEI?KRNkr3G@*11_3^Tiv9agc+M0S=V&apf zr6u*aVk=u)@msfUfm`h?Bk(xeI?Q3@P6k{m;5x}#ETpQY7W3l8nZErsw{&!Lva_>4 zTUc6Ja&mFuub{l#e0+Qol9MG_EC#Yt?##D9=d?y4&{!7NAL(vA8^EeTedOe&-$}n4 z8w^F-)YLRZz2HiYyN8Fz=$Lsf&1nxY&y$sdm7bRN?J0Mfx`G9HS~B`BQ*`-Fa|Az0 zTn*zw#p?7I+h=H3dbG}w>Ak6hA>Y4!%Hd4y2T6O5g&$78gD)yye)v$kwW+2+f^vP4 zoLuy3=Bni_qpcs#{=@6f)qXhQB_PUne-sp5F3F95;rY$0{pd!tCCVM|;k;!x#F zl9H2KRCdDbJx7Xi?w|Fst$0W-#GBNyIuaRK3ON$7c}R<4B60grH`LYC)LJ__MlAjs zj-T~=U$M1WHdfT)hkcm&>fZa*jQgJ!o4UKHNl8h^Smn~FX=q|%W6uYMhK9x@CNc%? 
z)8e$UvbwIJp<#6F#EBEL^n}^i(Xm;@8uO7N@aFRup!jqD%0e>e;PfTBbsaLAn=B?a4*}}$q7vhF3+fg_l`}yHYj}y!a zwfsScD2XYE?&zaSqHOs34@d6*0$))0JwSdgx$zXAF)JL4Yeg}BuH%sP8f)xyK*!vGh?zo|ALSImR6ahQLc;S&78qz)E^-|yPKx&kV#$s>~uepW5 z+&kN5TGZk9_byssPxXHjIFZ@y7vZjmvR$4#T2lT3{uW8R0&k}{FUofBlmipS4ed&D z%3)tr5CyeE!E!&%P~VJ@Fc*q7|6_M4L6Fa7V`yd=gZd&4*X(wg(S>LfT^003_uq(k zLCI@{bnez$T8H4K3e zBp_ugSx|6VOxBd1L7|F!J4%;wI!hdF!Sz=b^)IG%X6ef!K`;sxC9p?~=qe;AC^*|Q zu(foeXlbe^aO-`H{!o?oY(QS|c|LyrmT-5B(-2L;Yf6+WNh-R8FT!c=ee2m$!sbkM zY#^GNt#n1Zkf?u`m%a%Q)k*BsSNYdUS*=ki{duU`7h9P&`Fb>liGO%55W5;U+iTev z%WuzbGwFVi&h8hOyyBwiLgrW(!sY;h_6Q}b6fq#tnzS7JuN|2ye|76S0b~@cudmO5 zP``Dp1UlrqIzJMB0QGdaf4P*W0j8*REhjs>!nSO^Z#x>R;@)d9^xD%jXd*W!^%W;< z>$y5ARLa)cf`pvG=%AiL0zpYGxFAUE|{BuEJo~dv+ejy659EA7{+;74hU0 z=N02wW>*`rLKiPy1nqSV4COV~cA5_z@bZ)4W9obFI2^2h*ykNlPQ1AGT<2aaVbgrT zc{w(K$7N$Be=4^pIy3W1-#!%MBw5(_=PY_m^J2GqwWPGLbn_HS zt0E^SN6gvTnYE*}^}3^@Bda1Bv?(Sj>Ee}OsmTFCXeqzoDgaKB0lm~mDADw(si{pL zK0KZN0iTqS!LF*RYEe*7a5W?(#Nrg{hN!r>cxzi*Y=G@d2~SngfU}DcO8x%p9A|=~ z)y7KoK>Ywp%T;Q?f76(9wah_Y#rE!i<#3XEgKD9b;icQJKxTKF$Ll{29oL@eFG<`i zWlk@?GMzu<(^?fV5&__rUKVDc)QVvczVck0=%{{q(+`n{hszbmWh)9sB0^RB6XPSO z8wKX2yl8k>VR>&z{EOK8>bj$TczBm32W=0j=rh4FA-~|Dkj6VCa(u(}p_I(ZDZjJK zXkAdxWboPT+_&F4y@90*v4P&j_qaGYsqFWm+@^BUolU`;#|Pw@^%dsNl&)|4Q!2G@ zqI6UeQ~326Q7el}OI3)OX`S=o9dyrM>NW=Hql%bFD`JgxvF*3lmmprOpU|`>_d?@zIzjZQAR^*X ztmPjL(iv>rnrwuD^Ku4iy4Xu>P8Uw3;fSNC+VkaZeV~~cmLmw8Lz*2ZQ^s0A6|FC?&c|u+g6@XdHEw`n?he6kiW#` zVF_C+)3#-y#K^0>Q)>@Ip(>%jzkg+Gxhzt(f3}*Q9^*WrhOELV;v9@4pfEHl<8OZ*TJ?K*$1jy+*DFlj)UpHGTabO!=c|G#WHhHc;(v0QeK%faR#FnZvD5a zUOg872pF} z=jiM{KCoNw8Hk`p-DiY+TL)j`pD3oJq;y@@H>>ZX+$Gg)#*=L5u-YuWdXI*N$%0^? zDYqBzBCI0cOG(v_r0ip)ogc3EJi)B|GQjroaaJ(lu5WAp8q$$GMqj)*ht)b-UW-V( zyBbF0IW_9hJ#q8}X==>3rH%R#H}s(vWSY`faOt+uF@Jym4Bgs*AOU*|a7C9bN|#G! zDkHbXBI5#<5sV_>((I}8D5^tlXsyqu93F_mlDjJC!t)p<+|ci?EGYED+8=~aP=x&y z@51xV2zgSl(Q~YUhJrGY69yebbi?8m^K@^|t(F;l{dpGpQ4@i`$gg6h6=PxKTkCxzg zMA^z8KriH^4wIHl4wRh-U7+Io>-(mQ=Nnc&9uy8zMK67*DO)cplO#oHeJ)w=oAFtn zE@I0m){k`Nmf@euZbi9%f&rat{BYzn@)%swC$f@z&CI)!{q^@{H-U{WZ^1aM40Xi;WD&#f zqWZ6|(y0Tht5YAsm56Wa=n&V{)rGlwEDrWN?EafWiTKSgkA03w+;Tpa4k-bL05DojO>NintZ|jLSxj}BZkpBx;iF{4N)iqEDH=OSs7qSb}ga(IMsd-AD#83 zshvRi!gpeUJPm2EjB5}tE;_!Y>3<3%WFCUrlCy+;NlD56gGdn&g_3yk)yWY5CjOuj z?e6YYsHweog&n)K0sTdv0xiruAQVshXB_oj@4B%7Zf(d{I6}3*x4;q!C_o5QswU9O zg$XGsQY;{J$lY?|d(IMq=XT~8gGo)tZcc#A+2K!H*0Gzn-_ef^2!zCmtYv#8lRVI6 zwTDorZ*&POrKYDh;vp0*ag2Qo9oyX+js`T0o}XrpR4J7dNUD| zlfy+#P2H>!r4CW(4M|DKZ$-0mgVE7ZcUY0uv(Al~L?weij;u#;sKlPkuo`Q={YvZF zc3g&xfa2ky;}}CfzUz;A&**xreUSdh6*+!lB ztkSw-qWLSMk@hN5tgR^6M}S7-IvqclbP#j@7u9Yf@oG|rxNK$j-c?Km)LUpO`8DX((3s$IWBz4k~5_K7}+V4y! 
zSMQ6!y~=?Sy-uo2_qCM6eo1yjk575k=_VLR)k|bgmRj~fsxx~tc$3nPn<`_$9 z{kpX9aMk{oIu~xtOGo~0I*RUH?&(}irY%r+-dIRTNO*_;)V(<|qBDchbe|QMc2Bxl z9b5FarKRV1kF5=U&it0&ko!5G8vLh~R>FMON?TfABHsGajC+CQFa$oaBSy; z8+;NZjCBE-_)V0pt<6uTtx7kMv@e%dZ773*z8>#6u$Bhqs1EoR~p#UvPE1SZBb!>F>_Z@$__w^R|8r)i2dwU%6 zp2LEH0-FmhJF2=@<{&qCfUE?G4CFO5kbHME1ZGlV^Dt=}alVOMhrf$=s}xrlYx2oz%X9s|Ku^8|}7YV5nKiJN%}6#XH)i z*-givuw(lR#-_ha_T;hA*?rkk7s71O=+t~3SEutSY+?1N56yiU(2-GmZn%NHmqcW& zXu02UD6^JTl4ZtIb3OT5m(uon2sRBJFPt7>PVR7ARrGCEQ5e1xwELDg-*^%Tv0s-HQP~}t*rCP)2Uhjwoid7@h4U@fuP zATVDCF{(#x`3SB)l&5;SuLz7Iv>iNr_xkPMQP+{DwqlkhyFiI30)KmG1rg58ZDX;r zSMdGwErZe65BtVZs7fuUpE4~nF9;ByF9-3?KRz7(5_v*xcXpq# zC44*Djl%7ze>o6@;^rE7hrE!8l@t|~512m`IHVRFk2J=h;#QUWppkjkk|T2W z*CrwNLbS=#W>i-lBTzD(aaYrrJEwZMuW>jnzC#%OrTQpdwQOjlAkuhqDU#9&`@70O z$Gotn+DKgdVM5Tu{C1O#J8Nw}xMrAsoUciDSxg=sAK%7>qVoScQH_&hjsYy7-{l|L z+B7Bw5Y8zEs2YYN z*L#4hRd9VWYulAl0O;|xYr7xhv=7s|xpW)G$8KL=V$ze#Vl&eZ5dbui**%992Q-Ot z5=a|8+UkNi6vvmrZ=5}k+~7Kh;5>hA5>hW7ejtUh#72LxtVO_??X(=dM_L|Laa7>H zN}{9BqBqA?0QZ;c1%yW6+aTEX9K->-KV6z{GE}b24uN50G}~3&J$qBf<}&Mvn>$P! zK!%b_kr7(`%sZYCqfn~`j%CLl0mb95gFxY)hV$E2A*Qz$5T6!+78qTEr7jTrNrb3( zEYe<-DaJDs9qF=(1LY5!Vb=PX;=`UR^dqM2UrfZG6nsBAS9zU+)x)_xXk>F-ntVdb zZv)N_np_wnr(X^yTlsh8=u|h9{-ye<^)f{hN%~+STon{YS7Sd|63ccX?1`gOepgWC z?!ZO;TN+RP=jW1xP(_oNkt0)1xqS@E;O6u7Jk{168iD`1UU9Nu<_^S!fFq)gYH0Kk z3*ri%xBsasL3OpOam0C;Bl0E-rY-3f5slz39w-8g90HXxquy(HIj$m&8}-<%}Orb zXGmE&EX_vnlB}Nd3F6l`GeTXq>2+D3GH<)35*x5Qg~9T(mMk;_cK6^P4UMdX_5!75NLSI$ao-th^Z-KaUkWThU;En%R@&^isYO5*#zA5XC=d zd<wkwb)8Uu>9a1p?ed`SW+vg;Ng5&<^S>j<;E7+_?FI%0zt5oleqNATX9kpg zTF)XG((O_wd!v}forysd;mPrJ@*f)3$`1b6p3R<%#(E zAog<*i!~$ZO7-XWeu}k7QnzCO5~G*3hP6;AHwc4?MDhZN+`4%lNl6~W zWF_+0a&Ok<;ksGM{k5mkhwT7d5uK$JXNh|j+NAT{l|XzcBhrU1D4Ow3~3P(ozGUX zxv=Lnbri&L=yfzewlv=Qem2r~;92*FsotD($^x5N>ZKn^DR1b;fwmaq$9+0Wb<`m1 zrnS`Xh%0)Vg&~yQZeNN~PYaz7;oZ7bJu)YE%ZLAL|7z4dgGOi+Ka}>qAcb8%03Gd_4v~ zhM@6a(~;!^mI(+B6vkez9zk|hY`4>75QWGNZp=5zvdnx{DeuxhMA=#xs+$iIuI9Yx z$yr}ru^&kyraBgpn!*&}#wyhfBeo#O(mover5nbyNt^4-)jG9376rp~SIte@k}d2L zd<3+eHpDi?p)+#ItVNTV`H%}KdP0oqg~XmP-MS+#1JWhhj>1djr3U>d#y{MgWjV?=B<`b!TNM{~T4|=ls zAbz)CfRgxnq<^l@mXLbU7G87fd`>Y^kw!=+tv57BLK=cJ`1x~+$H(7i7!75%x|#L( zq|SEw`3}!jVw_c=FgQGSU$AS-&dRipq9rbZYJMaU` zirEU|d3WEs?Sv_N@gNwFk&p|qS@9yG!aMfTPrEnYJQ2UsbC2M?qRxGe%hTR~Rsd6> zhAAE}Ejfpa%*;A#3uB4+R0u(K!ot14H;HnHfT98Z%_&9-?QmDLP_129PQedmgFbzyh3j&Ix1=|`!wP$WCH(EZAbO!MyN~89Ah`wz@W(YU$uTAa; zfZ4q0g@xjzf+0#0F$`hXxq-lJz1#PY?0FAtH^3?Mzh{U%4P-=c3%ED{VM!t#iBI={ zI_Fq#~8xy|7K(lG2p)%*5_Q4uAhOim57%P9#oqq zBO?^hHlNc#Aa<;reA9$2+jpC}wtaAqW$;{cUmq1WlU0yeH5Ym5+76Ul zGRyxo0cEdl15;CPH#fHhvOV~n-^L7fNO+omU{yxlVJsGkOb{W?@?htA5>NOxaD}rM z+1MD_|L5>-sP6Lx*jDsjwNxZ=%^3TJ@S6gfd_gt>4EfVryS1vuRrGxWI_$I%hN88?4&@S}GrrRx$|o5VEBv zkS&pd_xG**%RO?TMmkrQCqzqnZ^ZZx`G)?i-{7>`!KbvoJmXs(ARJQ5rQhJK zlZ^JS6}7I&KQ|;u>GUf%ru+Hy=Wq2Iot8W>=XQ;b{eG(b*o|sz1<+MqgIh^~qVH{_ zVgp_C$_D3fHCs6GYqapS<77o?X=%7QFrP69$o2Dwc88=NL{nVGTNx*UEBB)3d(#p4tfEExov@y-#G7i4z@2o3jlOa-S$U z$Z-&e8d6aDm)F#ctmWe3YU=NgsKo{D70L!b>({+hv_7qC{+3kJw=KEj#?H1yTUU#= z&Q2;87M6nE-rm!|u`W3Jwe!e_4fBymtjh*$p1jGT4IUr^*xjujlK(evZZ^L|Utpwn2(;UAb1uV9?S~g$) znPQF|L-llz4Im=Vx-;?kA^II{Snvj$k6uiI%(J7d&41hf%nNsq-Tto%Yl=7H%XM=gK0=$6dQN#n{>f8Vl_dhHX@&}_flB`)=c3xY+ z%|&3hRss2@-0`AhdwEfJQ(%m3;|UpYALi==Sk!jFi@h(mwziIKMPS{_?Gz5BObe%p z@_sqwFE$A2i^wr%tMb)_u@-=ZsAvH0LC9%4al>8=GmqOZUIhMj@C6A9m-(Bh&7+8i z$Wu@^{Pr9t2{~^hDzC=!88$Ye9w`Aik~>nN>DNf2o!D#phJ z@ZA4GglM$(WaFI80m{|r*cjj?JKbX;iL~U#^aH^Lnuz(HqO@(072{>3523V7g1Jpq zCF~#F06I3bz2{4z>uvo(YC9AkIDhspS4RK^@&MdxJEOS#ebt2c5cmOto@=Tmu_?p* z=}$!V0)A_}+LMQoB^rG#h-Qg_&FxP|^ 
z5G2vMlA$i2q;P0}sO(=|e$nccJqRF#KD$)UsLp=}RR2%-9=T54K#h}-M`DbQA<5w+ zP_7Z4b@*V*w}ky>b#+<)@-o!Vr>8Y5yERkZ<2za-jt5I2rM#@hRBz2G7aF8SaKoT?1pyu?!PpEFRl_c=7#|IziD&~9;1zHERss)1chK9eV(qYIZOE`kUg#3)2T24N(ZaGa;#N@tA+Q6~u)Uys5#&zFyupqP3eUugjG3z6S# zH`jElsfR%q<$A26z5V3k;$nfiqT=zXsVO*G;r-S%`|TZ|L4xaE`6CgJ*!3H{5D@a; z#l&_egenbdu=RMQHD1`i7Py{1hlYG>bZZ(VVPf4#)hr7>CAk@FDi|XNGKerilu|?l z%J6W&gr?_m8DTT=MErj$Y8TjGkN8W$JQ$eIUd~-vhlB{LjX+F?bQ;;Hw_S{-5=4CJ zF3%Rq09wditlj374dB++`h&)5wSgG)AiD!R&@-NThBpwoN5j^{Au2@Ca+gp7kf{dRnR#yPPh zt7mKWjTqh9{JfN{9>a?uQL9qP^9;fVQw{bJuHX?Fl}WO!x=!w*r{LQtr~ ziZM65n-@}@M#PB!tb48LpVbH_T08Ke7yx7W5{E??<6%We;#(F{5zo4veVCkJS-lam z57nN(kL1AhBAcnq0^pCbn{>W1iPH||GFD#t=tH*BhA~_2z4x zF&<=hZmxX9-T(Xse^CHg{2ZV zs5f7a8g%9AN>oyF8U1vwy41fpDOA8(;ppjJ;ZXAF4_7a$q=nTd8-ICqZMO2|Y*Y5l ztWMp>ea$wDm(=qwua(ZJbI)Yd_BXw{7Gr{@?ZGq&UGRzwQph+adgFTNR5ySvk1&H3 zPnxm~TW}uXj?)FRuuTRa3}~*IF;K(1olH^kG%I^FHX(s=yshVQk`j|dqujB&ki}bd zp*(ChebyaRoCb2;Kb-K7CWe=;9utv-4Sg?au8Xm6I$$SB!mVUw3Dhg2r2#C#pw(az zvjGTC6QjdjCgOrqL<4KBqBBDW#%C5!@(LihDF=fC4x|l*pMR;2y1z@t3uQwf$WI_U@s1KJ)WE}6r zhuf(3I?j|Z`D=QhVKn8-ZJD~YRDupx5(68w1)EFkPAgxwRk+6IYezE0*gQ|xuj&mQ zkU8<*o2Q?SQvJUC!gL>hq0Lmtb}!mxF=d-J$e}x*B%h@G4%M6?HtedTNp6>(Vb&Hx zr4)~`m=^RtBNh=g zKC98F&yW1Mp*TPcDlQVB0PU9&rt}`HnsE(0|0F}WmBJ--%9qo``{9S{3}){0d&4zl z#HfK5bp{B}+yUxz9e`fn=`39Nj(J(Tnuw+{>qB5|c##0v!tqG#mOZG?D?Nm@zQwYC z_qpZN&t~|%Z+KJ30$%e9&p&e~GEIm0uZLBdo?~9Zt~PI)%UUAY#vQDoz_Fu+_t)xO z3_!Lf!!D^9g;ZYQ#=l96!BCa`t`x6Gq=a6 z_>S7^Hk!B;%t$wVa@EKT_88f0(uuwmdOWLOLbm9}48H?%B`PI7t!!&;1vUeOD`bE} zX9hnViK|I(qWAI%Fjb3FN|7Hr!2UUgQc=}fF;3ctK<3OJN^XRGx9p-Z_QBSzqT>Bf z%*`Ra48@-CmTg;(b}5nNp2=;AXwr&t>ncvN=*XMQojSvQccUVkiCg=fyFr#lC=L~+ zji}WSlv!jFZRtlvbSlNcMpex${qUpoue zo@yz(X?~QN_ZPYeo(rZm4Z8ZaPkaYxDKKONPScvI-cyR&Uk6v1nBH&WNqBppl83U@U>laX2i0(U<(*JULl63%|sCl_!bL9A!(i%R|y3S9*s} zmtSqIu7?x6Y9x~?86t601NKknfw^1x?%BB^;E0=ci>xABrVN0CCIR*Xg?kD1QyDRI zu5B@+k79}TUCsch+OM=kZs8O%mItuHxv(z)<`~8!9g3mI6UTg!zh{gWSBLbN)9u>QY}3A=`p=eHJ#%nkAMPK z2F<2g>tzjUQ=9G!s#CAp2*p#GH%jKsPnWJYYHxKM8LTN&r-~1ARC0)WNk3zYF5awq z)tQNpo2^MGi#NTQE;JyVR+??gX;vt&Zn1c^X)TnVK1zT$chg>;v+TygM2D~`>UKml zrK>;U-T!$`8~7$tCaL>$rGu}Wr{*+xs<7&xJ{l_@b6|FQ1oJX`{DN#a#lr)XWw`~4 zj~nC*Ebu014N*5z54H>5zH!nal@0up0WhJT-aMKeWNy@*pDvoDC8go48NQYKt9ErN znH(~k@B9llM#Y6C5<8uj`zR6i;Lc>mC^!)FdDlN$z?JhZb?OW~VTd4dVNE4fFfoNq z4%MlWEcWjWb2bBJUl(Jmi#X8kIx?%dpD#qJT64Rvk#c*vd8;`2%`^&cp6#buyH>pR z#`SjgefjPHT+x|nR&R1&64h|3yp2y7!%(2&4aXD1!W~O^o8Ea9A82?{IZ+`t0pdA{ ziFgSy|I!Y5oR3pu>UhS2bL!Y~;7ozPQ0Xey0hEVncGC^Rwf4CYapD7XU)wGcKQP|b zD6;V;<>|dV-kwSW0g!YLSBy=cs(i9dJ#MaOrps7+6o;)8MwfA!&LwN;KkU4lr`(hx zHe}G6{rQM*V8~Tx8>w&sK_M5cR9y%=zv=w?)SZs1_BQ_OMX5fLp&O+k{A?Gz!cWha zp`UW22*yp(%|{ZnG(rT|L(2{lVV<24p`yO9V9(9>E*|q|RsjG$>Q!61rm*xt2$X~% zsD?%iAdnA-mBADaA;Ng<+>Huwoe{bCMtKHn5B&rECpN1D=%hoqq6iH4aLbZ|fMAQhkn;cMN zi;`XaRW;k-S@T-zwHOEUo|uxsrVcZ2)_A&*+*R5yZ%eZBuC63(HeV=X@{5aSLTpY8 zZr}+mn>$&mtcNGo#=846k45L>+41>Jor;f58FsUS+56j79?j`9PL!z|_ORsjZf~a} zc1q-v+SZL>h?69bqeMQ{+D;>C1C5cKXoSF-qJfmw*j$k*L(mP(pDvY*sg^F22;JzO z%(hnVoR2pjx39kL$-*tu^Sm^IM_8{u$giH1x}%X!a=v^@wlYz8L;7@XPows1bh9Tq zPG;@O)>0tD;3EdK=OG=2c$VFnf!|NFHoWU5Iy7e|R`;O}--I1~kPuNbolrz#SQH9( z5W-cY-DZRAQ3XjwD%#aBw2#?Qt_ zJNq#EJ*MX6Af-e{PlB0JqU!n3`BkmY`sSG$j8%WP83=KUDI`74B;%!GF={*KC7Xs}* zF2%zlQ%2aauo6}m%|(43wU_Fy8Xj??*QxTlc^sFDFzAW zYa(xV8$1_z<0z#(q{4*C4h<)}D|r~a*-Gi_1F?bja=havx0!QI3Jf`WvTt^O zwq4Zd8;n^^ub`n9e9pb&$Hag{B;;x^2;}C0b0m3AYC`nmK=Bi1C|8 zhrX0{(ew(Z(DVvDeeQs-&p207tm#cf2Io^ft)oBnbO>`!7rInF;`A#EI>`~=D08T$ z3XWH32BQX)IbU>%LoCsc5tmV}6RF?CVOX#+fT#TyIJ3c^d1f0@DMSJrXs#9`K+5xg zoL@jjV73Cm7LYJjki-}bauYIFL|7HBE)y4`!WPc+82Sz9sd4#|xa8YTzEVq0=+Jtb 
z&0+}J>p5kypY>LfQp0fb##}|joila&Qe~tpRb{Klwmezh@3bxYfF#DyhCs{lqPU47tLA^>tr2iduYga9ecNBmzFoetaq^!{DJ(HX^WkZq^jZgGxkChcI_3{p> z2+M?SknuSUvDh-Y6qu_X0}V}#9l?cLJ$P#6bBUZIJey<>CWn* zCJNg5O->+jV;Ht_Og`12Et@6rW|o^d!Lp-jahbqMCSVeoU5^bNtjAv-s!;|2bf}z!e0zuc&P1_bN#;St!~qOY`~j6fwIfFOXGEL= zg=#PGlNGwkTJZUKK7evVm6V!NYEc4RR<(K=>*O}?d`?`QDvr@9?_?M(c)eT!$|&3O zJWe+B)UD)O?}ra~Pw4u_o?03HVz)KhG5G%A`t@zwZ}RcoXz_&XP<8`lbr0sKq}uL$ zf&MQJ%<)`qwDm&_MEp*K;p-9Lbgmx2fL_;qB+(F1cJ%_BM8#c4%EKJ}|1$ONBHem0 zXFO)#o4GaL5Z_c>smGM6HWJZ&{s5^H|MxjxKjeI*#GEf|%ufe$KJAv=^_TU9%iXok zgE$?sn@rJQYuTZIiId>5`|^>9-9e|!XG)fDJSL-U=9p-pwe2NV&m}6SUDlF#N542L z{?P7rhr>dX#yuhHZt5g^pI;oCqy>49#8w% zubwV`n+TMbXx=ZcoCj??B0|9(1eN`gq?{b`b9JsTo1SzqJsb>!?iT4f#b;%J9I{3x zxbPwS!Q7eU^((gMq7B4Id zo0>9BYjGZ2My7PGCvQ-G?{2iurN}qMfCvgxO=Y&$)TIbNA%HwyujlS1pIH!&Vc; zi#Fql@l}vp1^$dulJdJC?Xf;nCX6U(sV|t-%3%G7Q{X3VhR#63x()HhjK*q%g}nr( zHAR!61Siu}o2Q&NdVT}}bCtg+#LI+d-^rsW5n0VMXc|mVFHn#;kFU|%a$413w{Bj@ zU)EHqh7i*)VpX#M^Vn2vzC4Jq_y_MWPIPc>9UBontH{O<9y|Fs4=&=GUQ=CT3 zVJG2&i4=N=(HA(aJ{m3~B~H_B&hyHZ!2KTj_>dgQeqI7-h#KgCLvv{1l$?(efh9N> z{YylBz3qw6Gp{h)?`tP#zG(k`?1TM`o!J*bcCE?du>r-OF9nr-3C-(cSTe}d$?m_K z3`Vu4f-UQ}AV^2559Aofnch727)#;EDSR8;-WGjJDjOy9v8)>B81Fxd=s-u-Mcig!L)w*UXAPA z$re(P{8t?{TSlZB);BKF4mb%&3g`A#2QQl{4+ingdoE<{M_syw_|g;Wz+q|m(TN_8 z-FgCmA@6Bj8zl7DG`n|;(f&aW!!lPv@4l2fTV+3JInN^JLEU5sr$Ej=3*AjT%2j6} zX^cA*WpXPS%{A?`0b2t)Fy3ipLJ(5+ByY-?*D*b=xLZy46R_M3)tXyx4xo>|y1RKI zP&s^ksCLLznW0J++sa}(w{EvEGCO{RWU%A~eZ`yo%**vcyMqU4{7YSgU{M9>pNFGI zW1F8-ajZUU6S^R^{%dL2x|oeu_{QD-ygU9n<0mhuO>ir`s_;=1b1D`Qf9;V`1 zY#|OD!EsBir*b+j!T^6&5y{EqSv;WJ&`e0jm7rdgzVj~;fa*6>YS6AbNO*Fbp$ehB z1HsiBkRUcwSp4dUrmJaX1Y7eVlE8Rr7na;k-iETl1z!A!c(rmWw`9-81=+9wL2e|v zWsY2P9N=C2;5qmqywN?;FCL&8;4OKU^)207u%BC`nx9?C)^IF^Q}n^PQ*-D^9i8i67{NeZsZz7Dhcl_c`(Fp%3N^zwto4-QE!)}4{6ogZD80cvfsC4Sp?RdH(Eo)3L5ZSnObA?O*qdELUw zXTQu%esYT~I5V#blG#nQ=N9^K{6$mBoh+OO@Lf}XVNE!A+s|TvxvoSh)wG9YFx+-p zy5|iirhg;hPL?9K?YNc5!q~h4H%WPo-B@f#v`+P4ddBi(7e}G(Oa%D)l-s^&!RQfx z!K_t$DUaA=5}WsW7bBL%eV-_s%G3>~2~Sa%%#4P^l_03^H3G|8WcXv1F4;==;{!D# z6jpiu`@KwnD7hdfvO^komb+8yVGsv{Eji7{_56>292tbSFtOg^NJ3 z&i}bXceamM??EVe7jt{eiu_mjet0#GR3zxJ8|GooOI=-Dgz{ycd+(l(4wPX}!y$f6 zP&v%r?>aXGC%iUpITxmg(>rtez&WEh2_!xumL>P{S#u{!ylDBkePEmRq#M-FCp2}6 zrg9}41E84aI;Xc#6j@O{{T#C)ZDF@$Sr>2X8VdN>U;K(iU`BaF=8&;XJQpZ9Ud}p81*@sICnQ#g|PD zptv@rs^yJ#q-##X$r*-6PQMn2c(97eS0P2(kL5yz^`*d{eGz zt~)45*_-EqVYIfG6OE*$4Aa=@?#}qP9vC?;>*l~NorhJ6nbpllo!3iNdNd>4X&icw zTk17R1DFN2YBJOhsZ=5-O2Avcwv?NiE_XKgKI#&oMilCVWa)g=SWEmi)T-6D16Y+) zt3Xtdp!7a4YG1os2u-UyH$Ue1I(?xEr(yeL(|O0SzJr4U=jgqrc{Z+?&V>rj_gDnF zc@vi3JO%_Qj!C=u&dSZ@0s5KZd718E;ttgAJPf_?dRJrffzn8bNQvhsd_yi6 z)sbASbf+TAi0RcO-|qO~R!CASz;XlZ)Ag(AT+xcJe$``50{;Oo=d(r~ss6c@7o>uo z#tlY$LnTT)$EYpZe5eY*Dl8nVZ@0PF0j29L$j#w|COQ_?Y0 z?DuT8*Q()z_QuLAa_Chy5%?g!+BK64Nm-qZcT>jaMKT8V+b?&`+<`~{?BNiGb2x_p zIrKau>UJGrgXF#hi_QyY%?^#m`1)wrbe#u`3wCEOt!PkYmCjbt+O zWwyd2~`7$>mOt^}85bjmHhniW*VtYOuI;owUnk&)sTn{$l!8wri!!sBE zgyWSRv!a866$shsjxA&pH^O1xhd+X-;M3OGhO#8QFWgD^mP?TN6mr=b*&oY?@Cf}4I9bbSFa zQ2o@{2Oc!*k6_?@@PLHka#uBA-vwgGki$1y;gjEMpeXyfjH?E9bI1PMCYaPt^8|LS1I4na(j^yyy{b)6E339ze@Hx3xY~jnoL{8--;XU8w5>y zCU9{1ER@dy8bHoN_s~0e>Go4@X13*wT%)#&-%AJ1F&wX=>Flq7)E-2{@-|kVZmVz9 zrJdCSBk~K(p0uJ1WRqG=?$`agU-!qV0tFC!2Na#De+X^;OHPNj>rIqly2>&}w*~rdL;U^(K_t}& z{3INk_^1WbE%F!-^!$xy1|0`JKQb*Y;UQp!@gHwEFhtaa7P|l>PA*bRUg{nA+)t2d z5RQ%^IBDiAD|>8(i21Eu!dS40)mHhYIw~DGI~%Y4u9w4Md*%8XX)IY;zYUjY^dZRp zOE?LWDu1R}AE5zvoiFdgy@{1K1}pE_BFDVUR+$p7>KaoJY+nC6@m*mzX{kb_PN!FF z5On@y_Q#iO7M{NeKH;(N>)^mcIjDIjWCv-Jfg|&qDSUDJYk3t)4#a=kX|Db+LWcH5 
zf6hqf@cho^Eo|5l8<&zL-8s)a>AcHLVMG0*Z@HVS+h)cXS2?{gs&Qy<61U9Tbm1OL z31BPbSed3Ouvgb>44l<%C7hJm&-ADBPB~>d)Dv>T`6`f>OJ(NMo>yx;vyeJs|L~`X z11O1>%r{{9z|$?Ol6$T@k~E36uu%&~@DSB`{IvLdiQ!X3PEqEJ$eN+Dw$C9HuZx#8 zf5iEbZicfAqp>kvHNYWUqbgA|ORTdx5L&C+)PW;$Mq#EM?>@VJdfQS$Qmd{vve8mG z9@?>@eVsL0$M_HyBBi4_OM3F^JC~+b_uc<=DpRmRF1u&+)25P#g_kO-zlUkx?c+#` zTaGtTy51tSYhN2qHbdcrEZKlDdu5r{3;&R(+?}z>s!fwO@@cN(`s~bS+0ka=PkwI! z?_(t#9rf(RjO?Y}FQwnEE7bGnu#v5LABPzCX(;b>MQVF~?CAV&XI_r&(E1a=q{q2Uka`e&Et4KJU z*+&lPdm+gR!PAT>({TtDVEAxhaKU4#t|QnoPw~MM>@c$7^*!`obsIePythOADcJLv zy6dQMUE$odgIWr@j$Kl>tV`980QP+=7+WD~Wnh|AV^aaW+L)*_stmkb(0!-AwO5Hy zciOkm$LQ6yRhIB#A)pew`6f~1yMC7fC3UoEgMfWF7_@hhn*3(JDR%VS5_hr@R*lr) zK3Nup+V1gYE0N%UU1EKMD=6SU+Vg|wT0jmcmgUDTpyund&weW}u`BfZpCL=7gxvU4 zyRBaDo(T^v#kc2`uk**|jfl2bD`!630n5M!j6^|g0rlf@YR=Abg}^fZCb}1;s8Jp_ z<96?)5-Ru6bk})xa8aHF$2BRPllyF16^o4go2!_o_;-f&s8c0 z%~T4JV>t+d%fUTs$o~5hP1EZ^yv8lJZHt^!2?)=a$Bq^rxn_@zg?t`YM1_6piLJgY z*7`1dG3A=YhqkXW3ZG;V zNHV?+nMzlPREf)B%S#6XRjP8%vqjam{`yEIe949O3gr@6(a=W2!o$5&BRr}PwnrnE z`?#Z0Js$J>(48lixq(S@dBR*TcbU7#|FzPhGD@=&HUSi`4~;oc^9F+ zC9oLL_eqnVW~^v%bq3z-KvTJ0!&TC69Ma6&g0JI04ya8LUHL(*|6Jknm89oc#E7_w z@7PW{A9Viu1+mIynC&#T52#A7khgHC9y9}2JvhYf$2;>xergIIW6{-rvLpRgm*EvT zDItgU^v=HYkIh3|SNH^>&gRiDui-OOmG7_)`(?(+`oJWU1gCdZk9CBjA3pSbl>N~m zIPz2Itx9A&J!}cyE+Toq7AFxD&DK`&87lb$uZ{ID{g{!nr3fzEDWDI+pqr zP)>Ub571zuI5O+9?Wc~~9Q*3tnAZ;wzzzU!UfI|OddbMmlbOZoACeWheH!U#N8O*r zV*#P$G7N|Rp+8-^<9i9CPBp#fZ?c}603m!D-B-6Frd$3pW6njJ_DaLka1YUkH3ssN zm&2xG9f3{BMqi{ZH#^3uYYk6heg0ph%l}t)@&CDx72dzDUO&g^xC^ z=ZJ8$M|;uN)|3R`MY z?5fG&k=S9|!A=x0?q2sYNW(RXe7Cpk8(+P5l#=!4^2|R!k7EVuG)I{;U*7Q!4ZIpj z^-<~e>VahY`1FG0r3>0n34t>}ArPBWEma)qo7za2+XiOTZBpO*YHW2uHz5*aYg(lF zvX0CCm|sw51}PLY+7ngtrY142Jlk4kk^fk5LT!XmocDOQ;QDveUtALcNi}aw?8(vy z=3%omY^?q<-@VbLPCa6lL)}Y_d?(ArT`WGio(f@Bb)D7+Ew>fm1HEt53a{3*L&tPExg%EVJFI zYGBMujYDvM!wpbnOF-< z4GHtn$-IXJIXw|yw`#|pLf#`Mglo>cEytt=7Kg`vXbHh+;;COc_sh(e=zf4JA#)9f z;ix?Um^{m-ZA5R54tz0_MKP@c?tqhEqM-%zj+x(L0-T6TuxcT*sh_GrR)Uz!Pp19f zP|>STd2gQ5z66hYKRQ^5Yl`>fn1!BMwANyP z90=+#+|Y(tK^>M8sK>G8&|=8J%2-M<6L|^Jj;tAMrV9ei64j}mhf)^WcRlEje-bB$ zFH%eAC9V2QQt->v{i1@J-4F2^}3RWU7(6O|X;aaO~JhQ~`}-R5mBQm?xG6 zx`qu9>@?)0uzZdKgkyt=6gaUaPQ}qHRP=;%p=}{ zWrCIWSvt-t0X!%uKWx-=_nA*5GTv_3G`Ei+fIW%Dr2Y>Hq$~XhU@++eR7cE$%gi}h zA>h`!gEt}g%4IE~@Jc}w*i-}rAc~}o%>EhPp+BauCO#2!>04!zhR@Qgels2|&6D{T zPyD^4XeqekPf%mfVE~pOQVoEna^}PH`_$uS#zyhJ;Cs7*nlB zwKw=Iq;69Rb?Tc6u-$$#e6GHva0hv33@K2#pOAg-ZLubu9_9@G)kl&7GTe)q(!Wb$F@#h--dZCNR0K6Y|O|t}LuxTK)$tHb*^E=ovqKzlk6&F0R zflrxlgqxQbKN_mOLT52S#_O9n`6w7blj@`An-R6&ceemV2)Kh41Q(3Y14 zek@tA%@~~&fY8SmV!{dSg-pBoe^0MC|dFsxKoBE;SjCS z0O0{T-yvpfbO`g8-QEERXFs{62wkJ%pxgH?-V-a1C#0F)vpSlKsrbzMWAb9q-buo= zPY!;~QYHo4XbTjQgF`bKP~Kc-wh3f!;_bY9sLanFh~0ajvRMQZb#~AM#OejG`uYGO z@c|acmE$m;^tfw3GV4+8eARCX7&L;wPRTU6%wNc&=$0r}4nAtOKe*`7o0$8Cj&THr zVjdab+IXk5rXVoCgmK-?U4Hi=!l3~-I`)E`{6NP>85kji<;rN0*f^2fsVjTwCIHQ4 zXvwPx$E~J7qq6Zdm<|y4IPE9T;q;IGiL0`2=RV}Q6UcK5o5X~=HO%R9?riwSTEQk| zCKv(ICGt*TEp(rK%xV;;u4R8Jx!Zs^b0Bcosxu6@GzPKFKxzyBW(4>E7-rP6t>4f} zOf`sPW~8NnHco2Wp}voey@4ePmSy6Tnbe)86AqNm+)drLmpYZTrZG#X(ixMPMl#$= zBDb5C>uaS4uidv2*P&Wv- zY=Q19;;0cq3UkU569#xr7;dp}%ic73C6XU+fONl^j(`}sHFAtkS$ol+5_V1rp0*Nf z3xeg@d^vs>cU;q16_M9fvh+o4pRYqf-|$2B-zDsO##~I3J-5_VMHF<^9{=SnNheY< zc03*!U1KK{?es-{r{Qe(67~hGbWl}gW7*%oUto5^uc(M=VI`vwowIsj2_M1VY-1?9cwMQ?%7r*(=Wa0Od!g(G> zbA_1-pREt#EPKbe$b8olUY9QEZ%OT8jE?8hq@@Ih!*iPSLG7!y!uDKWUac{Hu!Yfva z-F#O(2C34!NAI^g=qBel7DvD7D$I%drf>4(SkJIU+#>0_7UI&3l^J;S14Y5iUs@{s za}l8|34s0-dEEo*zR+opM~yKsHeA0n<0A3GIFAWHiEO-xh07vuZFJ`_>};%on1ez1 
z1!X`C9+WTDW0xcy2YT`zXTRC?+}az=1Wi@=N`%~)Zf;#PsDM1P*j4+?TKsQcXh$pJ za>@Q+G}jQdk-zqTJK;g8DYxz~OS$#@d0+C9;cXnDx*v`l1bd%dAfPhzt>+_^X2FhDZ39gYHE}WoV>6-qgZPTNF&Pbkucj zSN!;eZc8=l(+21sAk+l;V$6!UUKbm&8?Le3Awu~O>BP(qZrswA@973V%`@(lQh^+fll3*#gGq@C`>h*Sxz zxpkOgfot*p=GCIVl*71NAa(ltW+2IM!J9MU;KF6B^_9%}$Q`~yEb-6+n8EAk2QXDe z?hV}ET>i$J;YJqnu>>@xHza(G8ogGPCdboq(6?4tdPIFjIH`;EzQ8&#xPHI6v+X zw8PzlqDCd?>H~{d){V0z_xQG)VPWg3WUM3@Se=^|0`~YYYH{w*(U(V3xf+>8rCE|F zwwg5E=YSz6=00O>k1yK;jQ@7XGJ=VgPi?lNk&}hF;h8fJ>p55dhH1oqhA1TJTastZ zsKMzfIzRXhRdd&-d%pAEPN6HHJ?Esm4wtK?f98M-;CB5u!eRCs{Fx;+2Ke*|fLL-fEvxU{`@)ox?o zLvS{O7@hau5eAR7Wz1YenCh$42F|cKxkp0&vhxLo?!m(SmU>{L_#rnf*^K}P`g`f* z!6<& zF!aE9;XyyXMQYKzvJ~^@F^qg?+N!#alaLSHB;jqf#j3Nvyh~5COIORk)&KQ}&Xd~b zpH%E$d+b8bPJB{~WJ_#YEHO)ErE6u{si|9&lMbQoEc?;+w6*~GeL{EJ>GW8 z5Pv;9?N!g(S&0pN)rm&l7O$o-Ae;vGaPcgI*4siIy`Xk4gm0K5h(p;+8^KAw5GHj* zvI6bsCX^b)eS(VFy59D{%vrMn-4aszR-g%7s@i0W8>BdthV4sab^%`7xneM(LG=Yl z7x2dFysBM7XGSq>%$YEg`Nz?Rk>d`IMFBSPwQ}!hD8#MMhg||Kcv!LP3K@s6?!ji1 z+P8#BSlp1sH4Lg4@OmHGk!sT%qM6EL+k6s_fJfx~0)y7 zi#lc6KHt&x`c6xES*QC3zJH(4ncaZIEeI7O0Ax7_g3pdeoh;L6$50J?`ktuWG(c#@ zDABTlmf)d~lQLQ-P3kHFbKux<<7*-D4l_&%)dxp{|@twOO2GtKnn1Am5Xy>|Kn8#a7Drr}ib#1C< z3vnvz*Z;tTcsIr-zxj3R_O#<*62Apa4v3uXo_i{M)u^LIZ8C}!Va%G z+upb5aYD@|$F}#LuDJ+0lhWcgH1N;3q$X64AfIamRtaJbOsrhRHDvA&B?-pFrF~$E zOS+F@F=&9r8L+#elJ|*U89CZHv|qwtqW)dJN_0s=$x4ozp6D(?i=o>4Ygm<4i(={$ zcpw}5h=IkDR5L;I&JNW3f(L@q1-eYw-OlX{G2w1?G39fTjw7Bz_cY24@>P{(Li2YM z^T}~?)PU^81<7j-R5?ct3j?ondA%8J+K+T?eW&s^D)xk8oTK;6&rC)5u$b@1ANGN7 zvlbU--5+)m5cXBoyRT(p{E^voy8qA6*2Y(R{q_|+V2T($h%SRHW)PJ#!6n#*iz7e- zO+V?vpiKKGa%D69Uc+VwNq%!Bx1~Wz;Dbco6Ovu)FIMmXIW}0O#Va1`;SuhB`0cO0 zp~w5T?s5xiY;4RDXehF(XjM@2eDOzTfaMkLLN!GtKKLfzh`ZD*&QwWp5y~N7#C*U8 z*e(xiiEVkAWK)mf$ulNaY+fTyHg?{>B2PeVyG}vH&zL+IejVK>=kTvIrn$(zS&R0b z5`VT9jZZmuNkI6j{}Qg9sex+EU`Ln)W>- zLE_NytNs1n@%HW7h93?+=`vllzdSG2wP;e|$;q3q-#+; zRgCp8Vdf%q9M8I`K$~SQ;JFK&29w2GE5#<+_FJZVxicwqXODz;TQzyfm}Uob7_dnmvd_BC$k;=G6aKqrz>pDe;%H~b>A1keR-#A+Nq;c=%XaBON`n->A zvc5$4Kg;oa&kUbJhq)kzP$TkA)q5_cxDxOWHo3e>Y>JVg3XO@UDf+cgGNG^&ue zKSI2HP%15w>QVcq66SEoH~mU{*we_Vlk+su{Q5>|pCN;bFY{vJB z(_68gcI-jM@~~geBf!ItH<)cfNw_{<(i>Eh9J1~&d&B*z$lrRnZ9c24tGVAZgVA@g z?dlXjjuSLG2MB`~zh=DNy<0|rhS~lj%Lz>WxC(Q&N^)GGSI+!txHKmPE>}{^Hn8F4 z5RhF?pTZ`g2*!V|qVtkOc& z0g_m@egX&>;2!p{%TaJU^k8J@T#acIQJDdQS3Sc!AHaSpWOo{M&~gIuu#izq4Ms6! 
zv>pRSaQkVzLRdE>?^x_ou;J1$Z5lw!gH$E$YpYdk?6Pp*huEuJ)h``Gg|Lnhm)N}E zIGiLbH6W1yw{a9hEi8_@%9m#PE$Bc;Lo#oBjFzcwy5#1%f0@gyU$;G{^=sQshkl^V!8MUa^I$~wIS zIH44N4ztPcg;@4Pzco@BS<(-S(nxr)GnC+ zfb-3l`oR?QZ4QGIrQWwKabe!ZjMt9_H17%5tiOEc%i|z3nY@nOU01z>&)s33Hs~;z z!7y&Qcnu66UsOZu^kQ85rnyOUA*NmQ8H<{Ul|kws?7Zz{qbPk39j;!roSh?^5E1v) zzwxz&PjDWvRYP-u^VbFuGY!pYgqOC2iLrc)CZ?$Vw9RumLeK@RmB+F+A|KgT#W=&EsA@oQrWB5+e>R`hlT% z>-4%uR4z0pz*jMF^<=wss)J&ukD(Va6CgLm7)58Gtyl!cmEMS2U)WF2+0Q>zq|b@< zFEKgP9e8rmIK40Efa^MDKxx;B)LVr$#bs$lg(dBJ3iqmUyifKO7`Iy#i9H=+&|Tc` z%-dnj!FYa*C~KB^D6ZK=jkQh9Yt`K!v9AGsZxZ`z!54?v-tfEjMsR}ao&Cz5@VWID z6z;#7WPX1@=7Bi9V5)&=u@RFDXjh7IA#D3tc>&29F+V?GGvG@sdDclzVK;>an0aXqi37s3$DynQA-^N%+L-EjAIAGFm_Id6p%ycO>M4obQLI^N0tZ@3G`*Q$RB*esjJ_s8V0 zY&hJ9t4XXADFspMbjDdMAdqVJD;_uYE6!~id|GHs&MIL?R)G6-laelbUeEHuWKNib z7HG*kv0Dl;)f>Ty7CRu~>4AC$35v{BJL*29hifgA#EK?}ibSSkw1)xnEo?V=9t~g* zE}ht`Mg%SFR#?dN z(+r(Hw44JfA^d;#tZlq1YA_A~U`XO>+J-x=dt1Z>{u>fE?aO;hPCe*W$-f2N$vene zw^!#I7u2F49fcP>t%1Pe3~+cVVzfjcGBcf&`seEUKRM*T$AN1-s@tA7AQ4Ue=(@j- zBkS?>rGN8LsCBW#*12FjcC1*v#Z>KEg1{nU@U`BxWhRe+k%J^b!Okj*Z(!%bU5R^ zg+B1H`w}MFq@rT~IQ)g#CIlmP7KOPsTbN@>PN)M;*!(bniPf#2Q$FJXI6Xicdof-e zBY(2lVJk<;pDmvjr1^eq zgDfrDGJr8y{L8APAlYuW`f$>=e{|W78CGBFqtkzNFnB1$a0znh&4rQiA;J!}&~EQY zQ*3mZ=0M=ir8Kv8b@7H7NrHL$eVO(wMk5EA)B$e4QbC)x5!a-QlwZKTpImbSE#VlW zMcql^7)*6NVFdMKo z)vAI~TMmS2ZKigLZr7gt`6#ba#a4i#q!;9u7|9gD6NEU-aT0%sRCJ4t}KtkwY znMN%~@aU|-D8_GaB^SRYA+!h!WOFx)@&&{?TRtFsTd7P179fwtGbFh&3kuNR$=-)q z3$}zH%xV1?DN=*GS`P?ZZHWpm(fKX3z&T63iy8FvxTe+DsxRa9Vp&eLAJ4h(Zp%>q zhECfKo%UW7Ml2_iBN6rS5DBE;!{c0$aVGfDn;Qzr5^0iM!~2yUT(k_btT6#soa2nUio56i zdK2@&Us*T2rK~TL{zS;EWD{xUF*syfP`qG;nRdRPT~PA4aEE-IuSK~2*T2+Ce%$o4 zM2VB9aiH?Q&GIsCD?MJ17}NH@p=(LA@e(7xw@Gd_khiVzTciVegb-?qa_-A`R7?1V z#bMFx3rDdqQ`$NU3e&}zthp8ly#26yM%=C&`vb|d8%0;iJUub;=R{)?_U%9euKKO90*A~kB&B5=PdbblZ7Ft#lxUT{My?u@98&By_<=}da<}4a1 z(x3!GJQO*!v0&YH{3@G8gph}*e$(|CtOG|xl(f!N`mqTwtQ2*mIkJ5aqRn9iKr(jN zVlcVEE)L?B=>Y>uVc&z5Qa)Ez*nY5bN05*HtEokSsmuPA%7a_vMY|VK(VY#ne}22$ zN>zrTl4pny(khVHAj!`LKz=X#8OHa3bdJGB^i1?0PciHraQQ$FKsMo6YFo3{!IT&){@C5s|VPbK%>Wy6scm z=WX^awI7>H%=$cOcaGVY_rJB5ZpFrOey=wUvQrKw5c_?XTxia>d%~OTALydFTikVI zg#AJ3v(8MvPA!=!49g^WYY4n0b44adylh8#d?}l99SF5a?^4W~uQYwa#izALN@hjG zi$R$>`IR(zl{!NvDQ2K$u=n0e(T-D`2Z)0xreR$gq{d+P0MR}SNLFYL!Yr^?)eGlp z&I3GdyexZX^5lZvNP58s{?X?-Ds|bF((>>e^2SO#3!* zaAbbvOTOH2`wDCmYhFqGv`zeQ2emritA&H@6BAQ7;F=be^+$8^Q17r`iKG)I0m8uMUmXy6W_|6 zlhxGJ;#wb^UfQO6@!~~J|EyVZf}gIGD>lCI2W z7r2fukZif@InV_k%iFmA;hxG+M!7=S_QX?XeSqoj-@2=}2{Kl@f3oV}bE~)zuq4~S zqIgFox8sfe#{v@@Fk|vQfdlc-Av$yDli{;O2v1+%Tejoynl$U4)PlSptHj-t6w{a9 zajU3~?-+|Jic^Yo;}j6FKi%E$B^lp7_}Sug)|;jB4ejN8UDmZ5Ck$sWM1)U#i*Fib zAM5Nd>=JQgAY{kg&yHH=|U^E#7n+Qg07z!`v z``(}*g!62OR#lP|+?@ICz30OhcVHit(&lHR5^@}m_DY?pAC&;uriwk3Wi)K0fH_Z{ zf*ZK)c91GHQch{aFhaja;s20uX;8e?W_CG1#Mz=ipg!sz!>r^#(#LElx8fmczY zsg_*5zD_fEI4UUmMtYNKE~k(5yE|#xd~Hf;zH^w6fuk=m$bS&l>U!zI0>;sst~`zZ z{d2eg8<$9`v#Qzo<>})ld>|$ISDyZM-!mNc+D1HkroiPHl;v)D0H?p{gQ~I{R@zG@ zRiF2N@xx5xl?FEW>Y}A`$(sBgYska+k`%S2d7;zDUWHN1H}fTOpk~9Fg8|4}QOEvU zIBVE|dK2*SmeBDZ#U_qi7>FNX1_Ra&P3cMNQYnbV9?GP?#F>oI?;Tf@b3vn3nK@z{ zU+KLWUaTFsO{Z`VSOX`F9}npA`pse-57BLx$7*!EWd`Hr1uM>33>Ie@8t9W5HfF%2 zqR7%1=@5E;|R zf-hZ|r!o%U#(?maI_s>lx{=QdoqSXr>+E|o%b=t59AJu^%IFBWh4lkC8-7$IH(`N| zyTH^)9^MFpWsW&b$VsP(?~lX6gremPwJblXiJ3NL<4|2r`TW<;R!>!xHKLH}ti+Ku zK{ggfaqe;AmJ~Q(&DXhdspRP(v<-NSX#aa?o>jdeKPdDa@3yUT_l4QsU3ZGzw zPcLoSFq`u0(chjud)D;HDAp}=xLzt6-+lD*v&hKinDic3oQLasZAZSSXRXZRvpAw~ z^wd~)g7SFFdf7uaGpa5|MswzFLThr1jz_d(Ei(GNtOmL;O>k?*nK~L2zoZnJf^bhA z-1OUk%%6BFYb@?UkBCE0Cj_u5E+X`I+^%y5H;$QX9W&Y13#+unn5!K5nS=&HtM<3% 
zif^CGc)Bl_vyYB8yIo;@%$wfp%V6lQ5$UXT7%W!(^F1vYx&rpD;D%^`o%?ZCRo1vq zo-V7t&cVgm;IS11JZ8qky5w5(!lP zD08{2qeBMP1Dky~b|Ca))8@2}kdYGWmt1t)Pt9O`G^*6HcAI{t(mwyi+w}a+g2T*U zS~oYQd$$OnI)8hBg%W}XZmc}m0(c-Hq!FDQO3n)P)UmELI{>kw1O@s@$2U{CQq9`R z(}%VY{~$1|6`t#X5O5bH>6R6qGK%IQG8+3E&U|MhHj<{G4d&tO3ZM+J8UGe8t4JZ_ z!w&GpZzB84D=I3U`r~~XerYonQXb(@=)xzxMNbR!SOi=omV@Y`)Y%Ltd_LGHZ7BMA zTVVI4KzN2TnGaB!39zdd&~4^F4MYVmFL_j@+3Qcob%a)>{%684Z~S}jLN_<@Pa7&u zo^yM>{|3JKgCelLTkwe9iWB~x)3YMX|ME&i%S$-N)xB9sfLA;prhNJG#b>9PZmrYQ zgxHyE){9fM`rcYgYm5^VKfjIPF&SdR^Ypt}Di=_(`8}|3<7LQ2^UUj;hUFQdY^2)! zAu2^P3V$G6HF>Hk9h$x#8fAsRDq=dE=yFRD_SwXNa-b_92WC5JvqQ#?RMU)m&n>mV*nVTW49R$jmAM zn{N3|(=!UAUK705;H2QIyN7GJZt&Y(KssPKvVz(as~?mDift}c4m;)uU^vFOGb4}X=O8Xt^9uHb2#FHyS%Xoadz zR|uSqc3IV$<-N|x7xJfwKX04YdJzFbPznRg-mWloR`>u-i%rYD10k5C8qR*Y9g_qX zfRPznf-OoU%|021^PGPuUF^H9{alu)gYJjZWf2yYo+o0RdrR-!j9uVvY2UqE?YlzF z*}&cIJNy$}+rFkf@p^CLRPn(_=JUsn_|YLl{qMX(eb@SnuMI0PLjb&poqc`Xv!Dhp zh<|w~U{-(^sU$M0s;bHb2cVujq19?ti zF{lj&e5Tnmm@`^RbG^0sW+^Q{>fM>AMLy& znWCJry{Yp8Qs73P^wrUuy}fo>eH{v}Ot|ekDYxA)>4d`Z`(M*mE#b;jR$?~Qr}wyB z9nN=R;)pOtG4?5cZR8~I34@d-e1O9dJhx#M%c=ZX3covKaM z%YuZ9?7w~*-j+3I{Pnx`raC6q@V4yc++RTAj!V z`FA~eDJ}O8t32BQF8EmbD}yT-t{(s~jC1(@adq|Zk1o;n!?)-1r*8`NN}jr`bL*_T zH2>UH%lbVImp2Lj_-ehjCw-T0uVZ>%v9;3{xfN-BwS~=<2J;mnh9;w6tN-@V4gQss zVouFw4U_?`UXFaBkQg7VF_+)wj6!hKi}c?RSb{%6gQ7hmPb0`tFRcJmC?c1Nt2>+D z(7iLtMdXM05M4uM@3U=Wx*%B$ENIA3y*M|2u8Br9+NfsBLxh@ z@`5RGJgeJh9KDHVx_T!HDU>GgbJzxn*L@WkNMqJwMNll_FKU`QA~7jzLdQQLS)40Phq2n}MD6q19fn6`E@P-qAzmTs8FUqIaTR=V zW>urcFX#R_T@-$#yvFcMg?b=w_A&-%Up|U zX+@8v#L-wTVx{z=$@XUgX6L^ztjBkD31c$twG-c6PW_hcal|^*DOzA`ZbUJwKl>LT zHdIAc_vZ0q4WH-3j2)OIC<;hm%9GL==HK+hL&?>?Cv|=F%)9F`&2go|9GBa?aMNQr zW-wb-frW~MFhY@2;>z~(qLk4YEl>|Z>c!Upy`77UPEufI#m9<`p-LM6(b+r6=RedP zNc|e)cu>^ArpJJQR8uPWHUK!!gj^yg?tY%^)TpwcXtuCFHaz-0INhHyUFle|&zYs1 znwr-w+#QCnoYi0*QgJkrc|VQ>4=2Y%K4K||>E^h2KJ2l>Wz}`tQSt9$#_30SFsH=K zQokJ{R97?PCKE4w6o1Amt}tC|m9lHCQ`O&FHLB+wm5H~X!dbP>Q{dN`9Plm=UY%Wl zCvy1xi(1!qMf&%Q$XnDqY?y^}Ya;!l<1y8eM7t1> z&P+OMJG4?9ZHFqjoOpS15uPN+LEY8MiLWLUbDN$%zT=8!AnX8r2jg8NO0p_Eu5?=s zTbuXaUcMO6I{e%Gu5)$Z>jy1f4A zGwTK~70BY0d*jR!jC4Mo*L4(NbSo&`quaI3!a!_wWunhHnlfcD*iJs!f|4nXUq zxKa6YyvD98`W%**n1RQihl}TpehoKop8wZgG^&QPSt?(^s^+t*Lf@}GG)PaU5;M30 z-O0~r1V8(n1ybF2RUEHQjWv{Bl`NaXYrJUgsw+oP;D3yD+=1Tr0abWod;?!Q{OWkv2}%9sXtBxupOrt(Xfu!t3q0=7Etrm@g+9nJfV^ZEhbR5*u>-4kcrU#;c4{I8 zZ6OuSoIp$i=b6anV{cqAyTRL_gJ*YDv0w6ZCn&5OH^1=KC1dSnn@%q+3$x8rlQ|{P zwMdaSaRc8a|5(*;S5LHEczNFcbzVXJQhSHcJBd;gW2yS32_0h&u8-9chB}q!F-D&1 zw(rg5oCXh0%w>1Lx<*lP`^Ue>I=O91s#?Qtfg`8|4)xX+Htt@~+9=g9suW<`=F8k; zxXLVNaet)Fgs@=aUf^$LsKRrxkE;oPb@9}u9&lstV3eJM+q?`4Ui82Y^>5t;1nBuh ziO#e4zKW`VACF@L{tn@L80T0Hmo9gdx`VFjR#t{UX~W{ZSsF1>JZg^DK1aF?OjmmjnYl_ zLAmvS+rFNP@|b+VV7UuhO?XcapC2A)>w_a-f%N;+u_`ziQ!XCYyVP;+Y`MGl{Y5jd zW{#s9*o1-T=?y-!8gGe#mGyh|@1*qEjAzbEMTTxZIs*?CN5vp%N%+;_@uR)lFm(w5 zT2k|_5lRz$SOn(Y(A|uqos)L0Cg*CqM1Qt*;Z=XNYOkHe5Y~!olA?Qp(-uKcEKBq# z&XXo!kT}3HZ}FV4fjWnH%&$W^6hv!2*g%$m`=ak8F!++W&7o|;&PG-h@`f%vl$Awt z>rCIwj|2FbLp?E?1hlCRr~EIc83x#Wi9i?whm;qA1rHo>g{9MBEOBabY^wBiiZlos zMG+eN&R|6Y>i1l3ICOVVv9$T$S-`D)s=x0kspMq*rE{2>we%cAYi?E4# zYD%Am;mKYRJu)*8$H=TmEH4K|2=_lV6RTXR6jzkCgZtXchc12vZQnEKOcXBhZGB` zfI;@~G6Yg&_)M)k;6Ia_&ck4!IT&8tqGp9)lW08Qa%LDNy88L~J=Mu_L>Ipcuv1ff z;pW3AN8zjEhO8nk*m&y$nvx4B(Tq{v#GytC|LAVt>ZQ(^HFv5@XKE#WAizjH-bm`_ zait-MSdPx;vcf3JzmVTYHR?0b5mO>X8mC}NT~DY3R`6WBy<7}c)=C)%o6kmUP=vwC zxJX=;2dB->Suj`%3A+7SaTY&OAa?EK@CTeq1qj&4`1by=8E7-%YPcvPdU!}N9pB(a zDbd+&ETQ=CDSzYUvBR5F*S$p7m-qD8J zB*cNat;0LI1<_I1@h&H=-UakMee&!~5L^bs9Cq9YE+*tU^LxcDoSGLiL-D+-=fVOA 
zHOP-boyjE-!I>+_Nxls|Wu92+1+#=-IcO(&gi`gTTSFvYWz@~C69`9MKJ+ju$$m0BQ;D$d}& zsOJ$T^8Je~$Bh9~oQ=GNj(yhvszP7M3!0;Yf?O({0K(9F@>-$iw>5AQeXtG`2wt8H~#F#fOP@At9#BYG3W1{sx74yzamyFddp~0tPtJ2uA<6Z2A~C6LekYTRH7L}e*1u2o zvn(}LR2}}8k~C0YRL>8g&C(nSkx+S+#huIZWTmxW587?GIpk1BOsMSpFM0a8|3cGf z8HI+pzdU$yd-Ti?yOpK;_c=^-=_upGe0wawQ=l*iCq%=zV(}XE30YGUz30$Y;7LWT zu?QYE2bsa~7BO^p0NnRjXYo=Vr&b|zyvahWJ=2#BWy`((iW3UdqMpf7_~?e;n^8<^Z?Mn56J^|;b?oH= z&jc;2YC(2OKhP!WGS7~5;~4~8bVuB$gRx^9Pb`O{hx=&Z21iVN1JY-xQ2-- zc6X5btNXJx1h1RHaF~NQ zf19ESj!a_$Y{s-5x=uH^qS3|UmI%AH12pRU^ZggJbKO9LnKNuLSPkEaY}E|h(-n0W z24nu9yZtI4BTYHpB%^;nm2T6ocka6KqEydQ@?u)?AMbYf&qNXY5CaKElq7jrn&pX! zETi$37f&V?tr)TMYQ*gai-vwZLYZ+wu{d*vmoeD){ca`>YsIVeJlF{5rputCIuB=d z#}~fm%wA>!E73SyHdOD>rRe<^v>5__9o#s8HR>b2zSh-QW)OAaX{*E7$_#TG`~yJW zrnTsnsDA%eU(^`~InU3V$_@+vPJ5wMwnyUz*%6sF-DPw41Gdgne>vbqy-$;U!D8iOan`kFebgWOn(RKEW75DCe+)?k4n8n}WRRm-|=2W6%?{MP$!+fL^HbBVwUQY=&q zmf1~}fnfR_C;8kGZ}ver!CppvG>1TcL>kc|I3k;W#+L>z!AHTlm|qr|V_7b|8U0mq z=+QC=v^=4(n3^(&X5NN_5mwiF>98|De!}gU$Ls9WaX|LRz0&?a)JFN1!Sw`}z*u!S zZA6m#I-veHX$EVq^Y*58R0`@((&6M0l}Xe z2HyuPEiJ$G)Ja7zm4Mu6C#sre=#$(Kh(py_YUL;-e&2JjUSHy%L4(0y6dt(gSz#oG zK4qZFVYrw+=OJbzDDYPrMJ)!t@gGn{TE}0%#pwqJ{%k^zcPD1)@&=#@o%ypIPK|gL z6BAR%?4E_fD$>Q@a>m2j-TyDp{Er6SCzs;$g`C|+pB@PhFjpXN(UP6bw_m7C3MmKi zVV}^&@ivL92$y2}6F1O6zr~bf2?l3}MdU(PY&j#R3HrG6Xn;k@O^i09x6Eo~)>Jv< zHY}47`x4#pA=OABvhww?L0*pfjl8toR=1zog?@cr5|rmxd`f=MO6$FXVTP>H#UsIe z7q3)1jJ=5eG2edV)kxbb`*7|u1f{7e~^Rvy-j8Cbt5IWm2}xa`+%Nclf6Ferc@@>RMZ@_ngwYjDbv9VvUn_>DKcMO7d5-2Vrd{TGH;C}N9Q!_`DtgKg;P?oe;_i$nc+(c33qY@a<6 z;4twp6E)jh)n{IHqA5*RHO5VQE^bjNpG|%N1b1E|J z-_CNcd+@%X#wj^A_xS;S_iJ}F1!5UuUa-et^HjI zb3(bDr^4bhk1yE$?e7$@TTl^tx_4W6IluJUdZ)lYScW-uP!My}r+1CoRw zb&OHs)r#kAjP@= z)7_f~Vwtw#!%sC$Q%$?2lD2uv7FsFUYEl!4tdXTik`RULrp8o=7E;O*S+Yi!5Sk__ zEwU3*gtBJe`Hu5`G@19E_qTn&@1O7aqiMq9e(w9a&g(pn<2a8KDUiw3_jQS9(%I>9 zp7~@RCfoP%5!my=@)_$0Y;`ECtlA$XSz08AphDj+P#Q4g>^(N z1^8&zmm5HNa2Uj z+hs^-5~V&1ysLzEN{@K|;}Kx=pI1sy@eH0$$HuKSyM=JU%Jy0-J|pb&pd3&TQr4!# zm{>sY?3#LZtB5`i|L)5GT(Y(uPpiM?cCl^j=A0M0iG4K&9>S<#h!}?V?$%Yog;bc5 zngrFBg8rZiiFCpok3sjs)!A&EJ*05_ z?R{J3{wm*q*Y$jqt@3wEA*Dl6qO7GMIVTT3eRQ|^S5XvIyJLYf%94sW>oQ$#7$3V( z0_0kl2mcb34sxPC@QFPA6CoM%eM%u`tHlucJ=23X(%JObbS8k}d_!Q4^3lusAfVde zC!$NlV@F^1m=qPx;A=Npl&p^88EyRj8vS173+NrY5w1O+Sk*skOM{uchCCQoC3YED zO=Y0+Wo)B`B_$;cM+`X~QwI5H;Pk468sH*ZhfxmJp^l&^KxzQVjhBFH5SZlibg(&o zSq;K@o#%$vzd#!X3wyUzBrBc(qBxEOTQLPObVg8N5a2*|lsJN5xKzg{Gq4*~EsnX| z8b|~1JOIGwjfC%_P(&3nibtd#hJfdoTp!SYbL@j(2j~u~*_K#5ShJMJI$v;Ez9;Ye zcqN{Fn@6_{z6)u;zdrZ+bR`aHjoy^CnbmSej*_5+2K{9 zI9L^l@S$53TufM$tQ@>9cTr~Nrnr~Uwx9ySOFE+^`P}gh1M!aaLIpVCQ91$>K9A8| zIe0Xx^LSc*4m%1)CMKgTosI4qwkDTh;UsP^#WUTEI$i#GV$d$TGJrjiadG2#5AjTd zN+qy`8Fr`sSR`{^bEi}gn=ptYIoaJ(Xg7=Sao5DT2lBK$$jd#VYo9CeN?dTs0rPu5 z7xZb}FdRvd`b*@NRowbw%-Ou6YfZ0oP(%6gYO|ssF4iRh4V%0WSYk@&4@y7~cpBY9xJMl~+uTA+1ga!hj<@oK8uBUGP@@NW> z#;DTQEX94(Of~!e2+O%;>sRu8x?Nr`Li&RDb~W zc=gcOtG=wh%*|NzhES$6HQMqZu?nloqoq|($-|WO2At@|lDs(aAn-eg7QTnD!A-Oo z_X_O2V+8R(3qbiDeNR9eRTN?Mq{d?9=_BL&P?K2kdyVq!tge9eCzz9Z1Ce3gt|jxb z2TS5rCON^U?XwP?#7;C6P>u2Gj$a8F*jRFl1qzn}WJiyyVBqq& zQmGU4-zMTg7jA_S*CdTp5@bPlcmtI*QNUSkU2D?ZwVW=M~4BE z64k3}9KISL{!zDg)9V`>_^8w{e^C2yVc*~jHaJjEb@beIP&#fDE~DqbY-b;4lCf2z z>y+8&(VZ$u8kl}$?AIA-srhM;@kQT-`I))i^!>aYs8L04-@HgQLTyPC1!Z*q=GyZ= z0h$J~WzN*WSiJnP5R3NKvA~wE zX&A_U=X2fn`z#=qZUZj9qKDhXK_za#%!Rj+-BMwe{b#6I?(6Y8DQFY>h9VJ;wngro znH)k-n69P$0d#^~QlNl9glsV{fw$PRa-=1}ugCqE?}r=%zdm=V!*Lyl)?|T6$}g$F zBRIASMYUV|&9PVDo#&$AXie3}2e8ggn&G%Kk;gpw-i2RE#8lzA(l)9`149awqIgjp{=%udZ_T{6ybZ*%9Y zWfnoqUfOt(CxN6blNqYWH((Mmd;*-vdr;<5!JB|>6X9DM2(12bCC+SdM%|_J5)_gC 
zX|wP9yD=uBhiYj2R;=7DdX61AShFbGJD`?Yjg<BZM#>p>Iv0jMd6uh$5p5}C zLjCokE{mXW`h`K_k&AjlC=jBo_XY({l1de&O#sbp2pIw9ZE;8aefINFhcx_NTsY~2 ziQRzcwBq+IXru$G#Y7u$6Cf|Eq2FRxk|u-|qu~sAIk)bqeJx-Q>9RL|ZWT8i9qt=1 z9id#0-Y85O&&Z8Jfnyv(f3nvJJJX0D{&URvj}g+9z%7mQ2=4c@vnXP70Dkg}hu*Q# z_A$~^?*-ydsN1bmg}VhNebF-F`6CGBTF2RC!^>ki{4$r`2mOL+K8GoXV=^E|%Ph+l z6VyCU$CvFfl&hdk!_-Uxl>qyW{E=3j|FLAVN-;Ki8%BL;t`cF$ewi4*7hX%J!6b*G z4*5)0;E_Yb#*u!Iod^*x72+~Bii4pCD#pD)L3~*Ib(tvQGS)sO2i)8M0*XC2Ac5uP zl02nm;I*v*9-j%PU=jKfDC@_XKq6xRw-9928c!q$!09vvYDF|TRqGWlFdaauguYq> zOWVGY@u}O(SW*(#UYpw&w=W*+t_`{`o z5BlC%O(6j6%AwH*RA!0?MYjzJGy?VxOy*FXe9(Geuq=D2c-4uYiDowmd%zcXdL<*3 zrZ9tUA*E>L8;N5}z2Mt`)r%w1k~T1XfbyaY+jjgLl2Ip&+2Dt6!XzS~7TVn10zpdw zn{IBnZirHW0Ntg|{U3Z-u9Aag&x{RwV-~TWmC*sqpH4DJE+(pKz-D4?n06b62jG?W zE1lPQg%r}**sW}uDe|q`G1*}0$1^gxad^i&wVGNILIeimipC0kn+hYp2jJ7N&LPGT zWy5z$a7SnI8y|gW^t6YE8)X1WgJ0xf60#sj>bbu%rf=vG)4|e~_St(KmiqWIrxYsk zaL#Kd4M`d164`QQA78NgIUsII`g25`FB4K3Y(;`I5>ZCpy}KKwi_1`DO)XI;Slu{z zgncGMsq7^^_=#4XA)a&rupDHV1pHRvVOc(&&i4deZ%kJH$ck^QVJu{b>%e$^WR`RB zrSU6&PPSrsc@3P6tJ(Vxzngy=1v!2ynmmBl{Y29AeiFfl?z|}Pjc<@OqI_nFb@4v8 zROQ*E-eHtN%G$7twqygj%5miF6khud0%sTQKZfJGJaKc^(<8l(jE2VXntROqdI)Hddd99Xs+K!Ia6k3S>JG#00bhR+6p{!eQ{L2pa7*L1Ih_uO8`9WY z;#nOR*=bJ%js~igu=rZ62k})%mEIl=y=X8&uDZnPmvJ!zDzSI8Htj8*ChdQC^botg zt8h0vtatoRkbja4G>?xWL^Yj{FTzF+$W*n>q?=~T{#mkm8{zbdC%FJc6*MFu$I|wr zhw{+tcYWE5)*1ZvN{C+w)w~myqzftlYTS6-oMDGiN6jRV!IdsUE@p|AUVzP1;KSZq zAk6VCo^2g~Rn(!Q{kbwv%`;jV;f2gt$C!uT?=s^;BQa(ZeI-KJ*BNkpgBR{Sr4+$d zhk3Mu5K(&&hfgKXYvT=B#3e2zIKINV+~j_;%BR3f564Dew!inbCLe3LLlpvTg62Nvtrm zeTBT4c4dNgNxfwBo(t7;brs{8TpJ8>@C(bC~eNcdry zpV^ZeW{DcpZ+h;ibV(*hLB(iWaH-$P~<2&M{PnH`?#U+5qJBaWdVt% zL8v?0Io%IQKSw3PoHr(EEqy*Xr1cJE8Gcz%U|QKadywX{q;AqsvXKQ z6WXPR7-H92$;R7;IHIT?q(oxyg~E1l@$ zRW`vH5QLgbgQ>e%okx(iCZP8!T+Q^6>m)j^!_=}kG9kEe%@-7U_#*Dbu(EFYInJi_ za-xp&I5HibVQa+==O;&LypwX7V*_r`4jY{g7nk#;W|d**g4-O16I`AIcjbG8$4Wej zH$VMwB~Q8WYyU;r1Mk|_S@d>47><(JIn+Clxtq81aMewVC(rqoE<;_4p*&jf_h3|t zhr&B=S>*|kXy~z>@#>)=-3u}|g>-urQmDKGVHbagGq|t{bpTGA;n1~0_P=&Ue5$`H zV6Vj#^S=1An3wwcK#UV1SqB#OmuPoA4L^B7`{$#!G?3DuIaa2!VOhO7YMKI2A4f6p zT>E)Y%ceGz%ny~1UNx?$DQiitYFX?n`9d$0WEK$Km*OsOK&R>BS@9M5>5?y;y`X{G z@-|7UNAYqV;$?6l%s8(v$VxDFY~CevQMBis02KEenfKN+$L}R^LaV?bMW{8K>JMp? 
z*i7-RBd3tPNo9(<ey*EncIg^9pRL zU)4Tn-WgGncf~AxEF<390s7?3?kxObGwc?eE**^A>X&3{l4w0IR?e*#%QY@*X!h$7}P8kn&QFOxfy7x-ynH ztDn-5dB7}L9clK|&*=4zR#=6$yGO<-(IPj3oo>inRAlgKwPI-cq0pr5VMF&mt#X*v z)9y0l&;mgE^JQ&!y?%bmC0HN#q3x!QP5n}X753iduuk=uPhn0G*DP~%qq90%Zf&_= zcQI8@=%tic^CI;g=}PN9i;E5$-rbbiP^y<97gK4JnUsF)*tOCpk$yWl8hXmA@11ZN ziP|4(|EmA>I~h&jA2((MI-r{MFuuJY=$!|X}%y0jzD7I<8BX(nO z+`t++(Xi@S^MDo>!543TXvgA$l%Iumxh*kG-Jk3M7KA0Z&*sxi8s#-O0k69W9Y%2GuC=a-qyr+MO1R+jd^J>+pLlL;MLAtUsXXb61D0ryrm!z_|J>P$E5#Phb zx?-qby3~?#nv?Im#PX@40?S#V!Lvkx;5yjJ`Mr!x|?N<)A1IZ*e{cXlhLu|HHN|&`tZ~m6dDe?(BP& z5up~1QE?sd)4d~-I>Yw{&Ck#JqIT$=X<61j)cB7dM(Hga(0RDk$RNq0Y@K~ixkPN7 ze~Wa}KeL`#==q;Mgc{yg6jzwe>a=JnpYwWE`dDMy4jZ!u5PdfhD{o}3H7Ll{_jk+x z*gUP3_v|zoFw6kf8x8x6Iw?uTsi9Yv}Bxyf5t66etkH>gW@~9eDr;H4bd1XFa%3!2Z{C%f;c#YGAK?f+wr^Chg8jFg`og!`b^vqIY%Nm48^3Cg9V z@10#IIHp`g6v?5;*hjV6KJ(0El3R2YB_AZZr7x33Ic@!Mnb!5~mUZ@p<; z+L1Q=7u(^r({qIe6kV^apDTo;mxOcQ4_9=T2b5jmc9lQ%W^>xCz$1IB?;>K~b#YUr@|wUB4?ie*EqNOKNbF*Zzs3x6ZAl zE84%W>B;}(@+^Nw7wpKWZO9T@bME!Zc7%6BxN3Ju8lA#KC=0!mwS!Of>s+x+hyPN2 zExV<8O{G_|^mwVDe`@m;4f(neEmgz8-!wyCG*m|TmROYkIp?YPoOfHp-o8@(uTd{} zBX#`(YWC#TcTD(Vag;w9&n8^C*Xrpa<2#)E0fgj|LX7|;AlTF4yZ%CMC`KlCc4uaF+}biu2eX==^qfu9J`C;c9f{BZF=O5VY2+Oa%fj7!In=*BpY`Q7 zY{a;nmHKZ(Sbu-z3`f_zYa`C}Xnor|J@|o*S!K?oA(u|O0^nSA_u_fEQfn+7v~L%h zm6%2^?0hsgJ04B4%ekSm)PDkLAoN7Qx^JIowcEn;KkJ1QZ^E>&>zXAqe+u%jjo9Pa z+bXd$9(~Ov>Q}3GN&UB4s)}l~P<73J zUN-0L*6-&y1G2-=^w+sLAO2HeVszp!jc$PM>(%p1)*5?|j2wIhn2xb&tdVuhr89oi-BK>p5{dV^YI z5Xw-6NddAFl}j}rc(|0zbRR{%7&Y54+S(zo@3TZw`&=SVn4>7ArRTMw*Ot{`+7(Z) zqWfyt*~{wACsLR@auG>UOVS+qG$^|5j!<$u&=-#WAAXj+vb)E5P=# z0rdD&#qW=lMG|M7;1<^QneU^K^1K)LH$Vq;*H9aTW9M@K+he06hSU_sZiB$Sw3Hu6 z#;?!$W7@J?D$cq{tqZ^_ptBG&b(t1wXo8i!=v8S_&u+qU} z%2s#1oHB39oT1~nUv7UsJBg*t#h$juT->FfA&#+kgfibVIxbx>_MQYeJ6mF10#F3k zYzmiA^p*I-q-+>94?6h~m^9|Oi6NjIdWnv~Fh^op>%bO)7bf$M)99@r%NM*{hY#T6 zG5u>Lgob{sh6bQeEqea|FA&iTMX{R5FTDX}E0<>W+~>XeZjEE4iwmgmzn%o%LUuM< z`pr5TWyau8a4ZEHaH`87q8ZJ@;W`v>00+Hk4?DRp)a`2bMj!}IKXa*dhG~{U7c2W}uEab@u4CbBneSoc?Es!a z<+#AM&&kItIt|6$w%;Y?r^KmGmvHo%K)Iw{04GTwcb;_yJ3h2tJ7B(uTHg+d5Fpfn z?&v3IZYi3PL&HuwVlA9N`Y&F4P__zzkQk=bWCH>6kyJ!FkOAWpkc|2?3IVJJI8N|m z-UUojc+rBvoV;wNh~;}5F%4c(n1&w06Fi0Waa)3EriSD)c45d!v)G@2#n}SEl_wkq z#AscCIYlEE7t=8g$ruuy8yZ>#A{j07Ntg?|0IPQk+t2DQ@Hw|Lp%1tA6ssCSWsXY& zM>2l`orEoipGeIf5oYW(*uOA%J=JH5FE}e%ysYJOC}jz+FU86J_1?0PaJ0HZU|#z; z{5d;x2~|ED`o>PfLqna&_KhSjst3`nT^$XjwI{y}rLiFPB@2qPA zQmdEja%nz)G^6c^WtDYk(J><}G>#0nnN}M#+!3#<*cT5%*H*!sRe${5Ev4?i^0*5{t30-Xx|apik&iO!CCWCEfDMCDVAp_fTM(0v!!aocZuxvzGQp0ebBgD z=A*Hj9YMb6-Jp(wnTsMN|C^r&ANljB#PaRTDRzMenq zj0BAG4PPCJ4YA7*F$vrFE@;!HaHZ3m(vJg&(;E3*YWSdI*L%R!naU|~Rs5$A^>*TP zQgtr}L%^70Z8JGMRg^ltmY}5>LLpO3+H$M;?D$zhUtUOAg8CX>qC+R;?Xl>j$@Pc8 zQZkJ)$EXvI8k^=Tx0$0=X)`oMGt2fD8$m|5>&K;%o5MtR+Bo-Ps5PUgLJoo!X>s&^ zl_TVZg6v|*X#!@cy|5Nt{JY!dQG-M1m#v1Z_cA@$;3u4N&XI11sDtrX9jrp)r36U5 zfc;%j5V^2y(HCDNsFF>t7;L)yGOE#}1$+(ZJ zx+O4t#vH>z`tKF)s`oBwe>q>+Eb|?Az8G9K_Iiq|)j#-N>?xR;7Eb9@u{v@?)PBB5 z4!*$HA=rH1Utl0|2tb0JAF#vD6X-2aB87c?fm$IT@7(83ARcq8eZ=X3 zvqlN{CVN-kKwXn|B71aK0CXdkZE20+Y5=ea-BwTjbN1FJb>5;f#z!G{-xDCQ35|^5 zpWH;xg!tuSU|hce@V9A$hIuhOl;2#wkA%939rG}#I7*3@E&)(k(z=L)NZRA0G| z*@EIOKYsu1t5x9lW>`#e^;M03Qs2k^DW7vR{n#Y!_|H1n)Y=`A5#?!f=}WQ#Xtx>SpgExo8`eC_CJ7HJ``hUlscOw>`F z22cxC)a1e>2X27DaTRc2sDr183ha=AR5UuR0Hp7PIPUB5R0z~YKry+=pg=)=yqHwO-r>7$b=8aN+^I8pLu1x~{{0n|2Hx>6FG~f@2|Ca#>5-Wqfw1O(37yHo z>)~&=38y+B7tZ{%%)OD1jY?0q*YY>)e3_Pz6a^!LzG}P5SC-wWUguMq&AD=Q#U)}j zuEpFKwbgIt+Q65&DBsIoxKua$fc7$tlTO{8YvC2*~9`%x|M zFBm(2OXIE0tw}{QHYf+eSz-YZu6IipguZj%8;}v6WH0=G1{;>>t=W2Ld-a3cAJHW4 
zIXW^F!9gZ-Ie0h-E3Qh${~6R+%Qj);gLkzfN)?^n#fngWeND&C6Bk8t0iTuRDnRl& zRY4g;eO{hw@%BKKSQ}pL9Q|bjX7WPqnHB9&Ze*!lPi?EM~m zY|2}~Fl~}^vNO5>t9= ze^#&azhvu7Y#$(lJTkxnVkrzIXWlMwJd6sgB+vN>FT|PxP5{b2!if)+r>21Y3U7`~ zH4!<{N4p3$llwPTk6DQHhsHS)($_FJy1ww-E((gkDv z(@|^NlTB-=fBKM~x~Qk4$h&~`>)xg|FU+(*9-yJAXtS*6{p~)^1~UgU?$`s#0xzQt zJbG+2x+lx#q~oYC#3NF=TS~rw{ph?TWK#9Nus8zt09CeFi;6u zuLGQ@XvUZk46MBIag<0l_vz~LLLfBmzsQ($g6u%oZ!;R%y!D4`xhGy| zASqApmiBklkv6n16YGa~V)UJ#9%ZC|%CjdFrhi?Pk$KL=*Mu;TDr!0_*qle)T<2T+ zE)Hk*w;CM=9b$5(@Bs7)qzQvwt);yVnz)IJNfNxHqG~K9v>^L$&PR1D2AS?g;ndmz z=9H7&^7Yv%uTH$#f~~W390tHRJ1D~eE_e}5@Au&)({A22X+O*hNw@{jo#hQv2UII%tfBTa0DMVSyB>3@pMf{pgpd1!A-cowxGI%;l`3OD_@BuwpFx zlrcGyeJMT{T7J(!C-{y`d%efEC8;~t5+;3E>kO8I)D9+C75JQI5?^{V6cFDZ}z5wh+r56X(2PFUEF_V%}Q$1_`k? zmN0fmSm^ld#flYMUbz!xFkV+NO{QNjWc>6$;g{sZ+TIl@#J0$5Xsc0H)?Wa6F+H{b~rf>x?29b&BwiZ5b8 zqaga=#V`8pj%oezf|Q6p!QEnL>)l>4)8{M%&7xG9u(2`b5(QNgX657F^w4Tl6yhp_ zYY0C$2llg)wIj^p-(S|U2_b&Wuda{PJA2+QH~U^*Zu5d&IX)E@oL-vmmg~um2%p!_Ih{3RFb1xcsYE1>Y9*!)^+OAQ9K*n2YVKCdF?DQ zb%YJHR8(`B^Fzj=rfq*+4P#JK)9A8nRy0qT%oMn?v96(W+7CCLOYI(lzG&4i18Ei>V-L61G~1Xp?E879P0ju_kxl$x82Sihe2 zQ20$e>q}?@35(mbl+%J077EOTzI*&=%wyb*@!MS27XYn(fOs*JJLKmrpFn;%LUv30 ziH`h#$9m()x|MCW-4w-{Z%CAzqW?y=lX5Ub5#NI0_H91th>_-HfSdkc-#MVD62Y!8QJ{QP0U|sH#1?&UO6z(RLsE@*70tS-*0i1FE z0(g*i9Tzfpm4*TJbs)cpQ`Ub1elAL?BFi$ibi9-LO6bLgAjvj|ZCvuD6()asWk$LY z>Cg;+j}bUei06=RPw7U^lS6$ObOL`t@o>k4;aDhw-Y1`te|r_LfLuzc$SOxnej1D4 z7&x6G{^#{c=G*dUb~n9T*RO|qVB@yW!q^jJMUGnsv*!_~Z|C$cVo>yM6xKW*3h>G* zrFK98XL2YRIWSMohwtlo39IIkqc|4Eqc?RK?v?5(#%F^6j!X`G#Q&Ta8xPPOPa=g~ zaGX}hu+iCBV&?()vd^1|4*~G`Q&5~CdQzevl$Y*B2loTCE+Tau1iP)w@9=E_Er;v@ z{$apDzdm8K>_DDH-Kk;PWJ)%kkokQ@R+*c?{v+7yGaNSjj~RhzM)Vo|L`tGLW_(FC6BH6O z(qfEor;CGv1nCn_+($6@@TU~%I#MLBo!(1f;7m&L@l#IpuUj9Ip3X@BBQ^6CHv63W z$!qgUa)P1!0P@OQ>YVzvo>)0z+eK3jNuU!+J0VykBiUFYHOyKma!QLcKn)KGjGfO z!&M?2wngOZKEB1zpd%!Q=}?P>pP1x0h5-hS`F`0OB%!6&z;*&aGB+%D#12T4jWk^I$7lj+|!?y*S7 zId&E=149#$iSPV8E@3zlTDV;1TJ7#?X08czs{f!at>@Skw9fhI^~W7m<+~VA zBtC0s``?e~pJYz64cmUwcag+=1lg>MUnFS%~IgS!bJ-47o&cy#BDN{eAS&bNb)wZ3B0C#sq6mZHs#N-bwHBPOGyyBZjb zG=7|qe%r)oV*$4`SQLqY3TQh4zp6{C6Pwr!c|rneuL=3nqjUB>O}^pU@mYh%NG^|0 z;p0c0bQB<@*-!rU*Y}=q<7IhEKqhZ2L8G~r=aMb!SAij25BU~TdNG?+cFXvJW+|_x ze1S$ad3&*x`uk5ey{pg&#nWx_IDb|SR;{h$Q`RfJvT!{H(1pgjT zO&6ERNb5QEAj?gHA|v25Sj!3f{{7Fi6pO5|m0YcQpzcAXN`Vbc1YZyLZ_gH(CNorG zA1qh#Y^e>1W+aefIyGKgH}qov40}<}vHo0GZ+J5_{=u&lq^K8$I934^M%wJc=%4Rb zDRaM|lk_t{6E5B7bC~NvTqE`6dou?G|F{tp_Wlz}!vMnqjkN%6*U@m(0!Mofgb7d4 z(+8aWuh+8V+V4Np`b0}001f4(u)A^ls}(5H^xMrt#;^mV#`u(nSs)+Gvj!UI41}`z z2{Mui`6Fu?b#!bLre5y|oEnm3;H3D%3?>WgiX)Qe+Z#h;NlM7^Fa@@&mC#AtKne5` z)xywa;tljhfz?S^2)%2}X=lJ#H$MT7V3$wUXaDK=n_UU@SO}_pff9Sn&t(oBbR*_LJUl+na~$9?5935#DFrLlZwOW%wfk3_B!E=eX!3y zihjttA zL``A&JYJwAc{yC0)-HjZOu%qW|MQ=~gj_S6f^M^lWaM#VDbFT1XX-JNPiE99fUMYe zBy#K52m0+7vSfKv(SizNyW?=?GY+&1zn6ejFOZ2Ohx|Bwj!_jdz966kWdEek1>9<~ zh9bTJ*zx_aa!d$jXX$bG z#uHQ78;F$dR0-ffqn=23p^rv?$u|Pt(OenA!rj2BW z&Hm2f@gir!O@w%B8ZesqJjd^ClTRio7=S!%iGw}%)cOY%4t%R{BgbQT-?#dK3>XPU zA765y)xI`W!>huUq~{YZ@5^o*7*hcPfgxcngl9^3PjQK8FSH$L{Z(E1`4yQlN> z-RIu9M%#a~?a#lB)w~Ok-K`!tQ0L;WxNqCO-{yTcPxI1s z%c>)4Rmwkw@;`ll#_Q9b9}n%wiJZqfvTW6b%JUbRw}oCf`sBQX#PthP&bXhMF2lX! 
z?9AYh#0Q?T`w!3i64ckLAe)iCVn%l_OvRLB@>RvvQ=?`^r&~3Tu)~Rq?_?FXhssyU zE^)hF=L?;q9EvO0MXve^CcBl~=lHy0+Eo&zK59IePj(;s~gj3s3=6`4R$7y~4IU?_O_h zU1Hp`Mab51K}4UllueCUU)3bp)=r?+kVy!HnI&_eV#Z7P=GQlkT%i%Oe;(-~kXWr$ z3bsdCt4cbUkF%r7fX*Aj;V6v_rqT}v3+n62O1V9})3r|g8cJFimR9k~3}ZIkZ}D{> z_4sn*T?U$u;%5R`vaRcpRO`e7&plT^gZW__LDQ-S9r~IxhK{y9-oLN(U$)9aQW%F~ zX%nOxqJB;)WbBPhy~)TGt|`MgcemzLRkS~y8L#Luo7XnakF}}7cxFS+>sD>f9#Yf5 zwOpj>?NRU%w(0U2J;rtgBEZm?m^Dhb2ZxbOc~Tt) zw{c%SWhbJ+uA{Y`oRjP(Z~7DZt57J&(uBm(bxM1`jqWY;I#@RpG&T@4qfs{y@_%5R zJbe!Dgf!zkm+24IUGI)c{(e+LxA)V0aUI*Um$+HSl0jIBQodK^d+rZ-Gw-54Tgges z{09_%x)%FvRt#T6z=>0yi#M&etv~YmmevOct(ax51HKcSte@Y^j9GR@j}>W_wfl`iPu%Z}g6ZpBW&RtZNvb^pA8hN!jmv|yIOWTo!tLOqFJ@A#~vnW%mVnfvksa;Jn=v$ z_8OPYG*sZ!QtzAyRaUst!%=@7A{*fQd9bt3tm z)wGgksLDNIgH_KYWz3JbS`*Op@mMk*k~rR_xrQ2@A-PEtN`<FBnFLeeul($XY#6+=<)PuikrWAyvQl@TKt| z{ydk7@80ui_SmH-R0M5$1V*%4HW9uM-(`YIhIWJCkb;sy0$7hD>V; zB0UIfO7)5XTexG!3)d#)4X}V-P6~kxtNd&+skbSWD{Ohn3gx;CON%cQ9LkT|*X!5k zTOz=ld*oH>g;?#>{Hsj~N8j;veD=Isko97cU1G;Y-VN(tZ|?Y1#lUJ9_i{PI-wb9p z1!&8zk|XsS#`Kiq#ym>Df#R1cBHMbx7Xi~g`*xGn;_1AnL19(C87LR(AbO@XGI+I= z2bKXkZFy8g5P{J+JMwvh=*9ghE5+{x-m-aIunycxmG8|hCy%(b8ztC~>kN(AOB1q% zEbl4mu4`N0HzQo})sio~uaM?WCS--Y>aEPV>$LYJ?>#Y>C4B~-%Z75RdvdaGFn80+ZK6HcQwJ5#=mp!% z>sG=3V@~kktOegFF4XM+8DcDi6;NvS^Eo!= zp=#@%cE9E#x86-IUWqmz(MiV=^SNGn1XLyqHc_J=Y(sR4lM65}k&Kw-s^Qpr9Zib+ zRL3^LN(=0(&Tvrs(9$I1;a&m0krM9Eot_@fH5nn1exmwIVPE7mQ{+MLr(Xw+ska=7 zEq4!?AOnbUZi%43Wl$5vNkwXe&5t*q~6KMT0S>H ztINzFy0o?Hj{O?{zTJ)zhR2fHtz-j_I~wMCdT!V&DPcR0Tx1q)IPx*>2oJ_joOid5RsbPogXzEK`2iYnXhNBc^ERcshAx4(39=>-A$dtUz)xQZR??q_gHrn1B z4O@GUIrOmo9Hqj}=($>m#May_eLh9ZGikO7a1or&n{a&e&)g=8t(Y>ud%NzxK820W z@17j9`_0T^ku!RNe4mUqQ5I2bi^KoxD^j>{z>IKm(v2vQKMBLECc(B<33H)sNGs!r zK?TUtEnvz+b6NR?gkCHN4sSvw?-sne{0&`_%)oOc_gVY13kNd=9?mUaTUg^}BCNuS zoJCDyr1NSMZ1kPQ{LG#KetkLe$_y5GQ=Wie z3S>Bku*9t_*X&nsZvRZa17#=pe%5ls-vcI-nPvvw^I#U90 zr+Ildhw{Jzxtc} zab_xcHCFjLj3$3BNU5HkR{7wXi2t(t;d?!2JJ?)IJT_bQm;diK{3Zo7iPRx&#ho>; zPS2rsR%D&#)GiKp{{xO_cC}M!gLvg4CRFm-$|{yYphK1sR?~P$6hnL5x({&9Of#gC z`<2A1PZia6UCwsHQMK`ZcW0PMqj0Gm8beUZyJ|H@s#}@H$s#p+^i~&D1QEy$aXQ*_ zJ1~Rm0torW>B>b?FzsB2kW%%>Q%ekY+X37ap8)eaJ^p?LNk`r&40)N=#&q3!uc`lV z;B95c`|2^(V(d(m0vSKp-@O&=SMpuvqro10nq6S;4AGO2Q}af-n&Qwhmto6p*e5Cl z4>AVah-gpImT`+udBYFi>gQmw!HRrmE@~ld%OmVDC!{4-+czF-T-g_UN}8GjYaJ|2 z7jHPSX9@hq*ymcL!{5*N5cO@M2vTbXllGgb&nyieDQwW52^3Xc=~8s%`egEjS5ZPr&Mx!B3c%^jD9-1 zuy0K70`8emAR#$Z2AA&ucp*TX3D z&sQmwHmUj7)W>R^wO6+9ypp7({$btcg_^v^>Zt)S>8+>qOS#tHZyHsz&c2$y_? 
z)2MTd{OOfk2OECpF+Yz;{~&T|rtB-ePhnPP^fV}Gl6B(yRzSGEZ6dQcnwXMypp~p= z5MmV7 zT@GBg$#FC5y+Zh&8}rFKlf;>JiNpeFe)p8Aor$HWS& z=MaKt0?7A9^JMtXNWMnyh=bcFKA(hpzCYT1hH)401-yda@q0+nUgC$mbL z(GK8&d)BJ3ZF+J6tU>E&`WPtk`su|o0+0R`5OYs(Vc7a0mr*-_T6ccnt3c`=UDb!0 zP$cd?_6tR2$=IWicP7A?sa9g^sZU?ZE?tYK4MG&8Ts2xgX`qaIx(`fkvp2uyD8}#H z`q4zrWwa}(4tTy`6OF7ycB)+*1`ourey~ypcEY{WBi-ke(`WmuK;M%4wd5tTPTks_ zo^>kIcX!{`>akOfpEnC#HSZ&zN%!0nND{qqbbeIGS~H@Dq)oRa753ojc9n>Zg63OC zQnR8)3fl?fmkT)aqv??8l%4~FsT3oh%8Yf%AU3Lo_083H3hEd(EebWu$kQJazcUeJ zin;RT!NpZyOD)aOcZUr3Z*0HXf9h%R(VBW!$%_7`UW*nP-#UG&Z6!tq^1zDYneHnw zjCrVi@3y}`Nleq4OSTM z^>KRfkDu)-K&YX*33>tJ)Ro^3f93m5zz+y(qtgYsCsKKuL!MJJHd50#ohAE(rWMg( zjkY}64WrH3qtSWRDzpk%O;-{{3uin*ILh=pvqNKQd>Z}qz>0>hY# z19E@y=YL%W=fhzwK#e!gukvvZdBgQeNhlqIX6o==qD@IInAYbp7)|AXBOANBcviocs&}Q-0!b3RanJVkp;{O^> zrhayOKyLm7_=U)j>j||C?VEzFS<1J2+lhD#4JZhL*HljWQuSCiU9DWE)_x2^dfAJl zTr>FaM^bF-#|yw^vVS;ySMcMi3J^VY;1YyW^0x1|!m+%QtPs&ztZMX3p%Y!7>s9oO z)K`c;hu7dDV%9yui<6D(M-vn3r?&K@iR^(sDX89vdvL$1mYx1-#Qz$KCJ5?8mcEGU zRZw}q`^sdcsKjI0hjsgOD-C_P4lV-~mk?_=qVGFdi-9-ccI)93ePwAb$M+{;Y*4@3 z+A!;WAz5)E06>|-@(Ny_P@@|?~ z1wQZk*BFJ`9Yw(I1IH8Os27RGWjv`FsjjIvF`*IflaG+tOHv>fWXw9K6jmx9#^%p2 za#uOMd&Uo-W%kmnKRfNWU}>0IfgJMeRJ7q9SxXZV5Yo3Tf*5cXQgg zj6J+_+OhCj8xNAON>ub7eIo+z=N~%*%fQ&4_9f-}RSufYnDPBzH}v42ztrT9U}UHI zt<(zmhDc6j^(VGfXOK{65fF3F#Kc7W8(xtz3}m1d3ao&8n-Fv-c~=Wn_P@$TFyNXb zg6!d0Q6aUW7##WpuxqoD{8seIdd&6Z*m%_|bp7WP)uW1U;qh@PB5dR%pUeeH1|Ez= z-I4X-i0gu5^Xb?=KDU@VZvr{wRed-8sT?)KyJGGio2c=cP+ovV3tsq%<`)$f)^%b3 za{jSVefNiTP*It~lO{C}2W48m^bjVD9{RI0DG4H^$RAc}A2{&}NL?)`k2NV|Mx~^H z;^IZEvCNPb=}hn(C3c&_Jtjw^B&1$QBj!9(G_IMLa#OtPv(pW5d-wHyB7WihL0Wy-y%b1lA6cK29Wc29(&a*N(fa ze{)=h2oRuK5n+@@cYw#?8q%!C1`wURqwsM#bnIA#5I<=C79$BWc}s{Jur2FeKjo_r9fPl61l zB?W|A`(S?^TwXY;KHxpp?@gAsP8kutrwu)^A?=BZdh*ID%E(0pymW(m!`! 
z9ul{uKw~@zb=qx$NI%Ad!FIwaQD;mGji^f=RgF>XhjldZi2#lqLv3G_^iOE)x|Dr4 z|Mv4yotQK5@P{(HkC%7hI=aiLj|=MNv9OvyG-Gwr67ycbbncgJ_6)NF?9*8$)A+3a zayLjwBsP7^V7}g}dmU_oChHoOGHpS^plP&=q_7#8?UM^3Y(A>7KXQ3(J%MqEktnm) zMb(#M`IW~z$=ixAjD}I>ocT;Sr6F`Yc}4D|BwTsoBiO7pXHG_4IkT_OlDIY8VtXAg ziKa+0dU2n7Gm7lq$d}A|_Kn@8nkQbt*~H`Eb&fr%zDbc*_MXnbmU0m2uSA4}$=;^f z>p0G_M95NI?)1xT(|FZ}y#%}9ak2yA8~4U)^v(rDgkg3-_77>hvm>II(J4@daOZ9T zj)uv1iWPV_ZJXj3F#NGS z7T>_9ulKI}5*rlm*$&ge&~0b)WM-J43_nF1)u;mN6uXZ`%G2&66>Ha>B=P_zyQklB zmtZ9!(m}s&2#8Vcyu3P94lCcUHY0p7P{XIKorbfaEI%+$Ow||M&ch#Sw}1u1yKp2G z0ud+Q%Xzp_>f1Jy%+u*YAmm;lf~hfxI8=0z1-TBQ(K;|h_}o++GFvWNkeakST!ScL zA^E!%!HKTjr-?vHLn-N|i0BZQXe!8t4* zeft0hXa%DkYbNs)A@|<*u+PnZ`AD@Df=I(R66)*i5|fU`8%^W{9yCyH+g2nq{HF}kHUZDoQqnLqetor&&iiQVId`G!R%N@@i z%G#$fKG?- z9U4ZMVes_l^6=(ItTGX4!(O6Z?kKtaizD&;n%V8tV(ai`q?h;eWiTsw+&0BG9 z$$V(-ml+@zc1M&zc8yy#q#xf!h!erFawF;jp5f$v9MGGM1QW?S*1J?2vik{G-r4gB zhC81w;#(S!K}i!XWrLOG{aH|zK)&ghah{-|*lim3*4-()tUea|dsB)*j(e>!#jzme z0F5{ut>fyMw&$8?`paoxIiOX^ei0h|2?zi9jj#sPMt_Z*B8{aC0Bw$YI#`+lM4gb) zsQMRvx$(|D(KLZ^-qIo@)sDO6m~{+RHBE@`lUsZgpb6Z}tO+>Q_*=WC1IebpD;J`* zF$BjcaM_}O>*h)IxK;S3hso1GRHyS`z|C|wYjb94$ zpWry4UGz5}HNj6)OOLz?_Cv*<1G$DuF?0!Ii-?Qc=Vt|aK8K2s-%=mq5uxxS?2O#p znv%xwh*gTij$;}wahY_2GXs(Xohu+&oCmsr1MU2e>@UB|mxg;cbwo&hqov0$rfELi zF;qpnDK&)bZ9POTm-+755PpSK`N51}rC~GW{DsOz!+}V)qcHUu3^7a{qqLQk^9~;nY>7jO931%>*Ls z>&2krV=g9aDag;wWI2BD-d?fFi~Y~9;9pD&mXH0@4>8{yJbwa*^6x)hx_z8#|MxH8|7Cqwj=G$^ XeO*T({=iN8`fjpTB>;-}m)+F!$Wcb-k|ZdM(e_a?icX+G_iDAKwi@ z&_2|qzw{vJ*c1e9{k(GrIFpCrx(`0Kd7VcY>;x@nr`@JpQN!-RLu z+cbUcqARL+sO0wD$TJk|(=&?C+Hq54wsj=t(Tl0b`zW>csk@JbFD8sq6Q#2rf4H4z z`qZ?7-?~Ev-58aj?SVc;YUn&3$tp~557en_8bU&xdjlr-ZYQU5ZzIfr1T3=DqEz$ViJNFyPxg*)wR#FMmem> zZM267IVC(DPSlg%1|6U4LxgFleZRE+c_L?REjDlGyBG+?1+sLa2lvIxpt2`;e3l9FDZn*QK+{yQkrD zxjumi)XbyEAn{wb960lx-YC%(VXcJH^gf<=n753VITC+D-utXW(tL9JsUIY)9gxJy zn=9o_daCL`&CXd#^}cB9R_S`dG9b%Beyb(z=APDM$<;2O=@8`v8CRjCfA3=5p>|{W ze|P)yw+j4E^Zxk)>)24@r}2NesA9O6{>!P&ua^%uC~bcG<;bA%e?$InVgEnEK1&+x zo9vJ7E^#t5%3IO|YH6Z-_Pn;dliTQCza+7cuPchnUobJPj;~8yk)lbPkFbvF4%wD( z_976^(tcy4ve;Hf@(Qz76Q20^Rnup}FT+EtI725To*3fe+B4aZu76zGwP%hHCn43) zU?m2$W~^0+oRE=uA_BL%!d{FXFMXw&^p%3u5k!@`^~L9z`5wu4>TFFyg3c>hnvY5_ zb$WTFyU+$R8g+4bnHH!a_;12ApBw(9S}wuMJ}^5l3n~pgDD9Kn|JFkuxi#&ZXH;Oy zBgCqa{1)iAkN)Vtdkr0Oir&9@YGoiCAuA|{^WF)au{MJ-r+?{q^UnVr{ci>S-@F2C z#a94w3O!9g&G@3IF#xU!+-^X*Y0h9@{erPJR>9yMu(WMX#aMEJozv)VkhPxwX}l^b zS(tkh2>!HSD#1(YK+ov+v;SRkmv8CwIPvkZecL0@0r25;zMt;fJbbQ?0m7Slx)Mvi zB7gHwd%?xK2==br#U=$BAzq=_`?o%?p$D>JJuhr&3~a8}jK?TXYI$VibKcD^emc5+ zY>Ex47_IfPE5Eh0d@W)x{Y1q?5%QS&jPq5mk&M7;=8~_^#K(9<18PdxY^&xLTGRgz zByoHGXASbx5Uj&cH_89On*V{f|3OrP^xB^sydilc-W9c^ez$5DA8JJ&&rH7%P^XHj ziLvtms$(QoM*#QId)vQcsqXzxY@}(a0h43xv_~J7SzmyMQ9Q!jlJ|m7;+0VL(fip# z_cOdB_A;2Rps$}>sPR!C^cyX%Dkf7_EfD6jJtOpYIV}nyhH&Dg_4gc|WJ~2K6IeVC zJE-6Ne)IaL;gvYs{vr2Ge3Z&A%=~1nyx?FQ(;C-iVO7DG!26BfehgbbEwIQyI0%|1 zQUaUgyv`ekL8*!`JcuvywTf`Is@|`&4Rf>yimo;jll?y5vx;$ZYQMRC zkG6T0-`Bbd^my{=zIL{PDx2%Zun+!WMxHrtrY{n?@~wZew`^Lhul?DL8#hLQzxy@I zcOaOazm#c3%&$S+zjtq9`SqbUO6g|u)<$b^>&FaKA{l824KM-hntRc<^C!Z; zys4JL-$upoFXkZFo6gcN!A-DYqO4307NLXaUh>&0m^L^#sDPa~cKmonhPOtCXzlXM zwE6YM!$LdQ*=r3e(oxP``OIpgocUq&cxRGHMY>HMVt~7v=hBRMeassR|KaFWc}Yo6 z=50>PZ|UtKT32dAn7BzFMvk#n@H{zPBX&xtuj3gh$9UjFSC?!Fn8TBH5pMCypIDGJ zlRCejc&d(z{uPmOthz=BU)FTiR~-K||ImD>QI2$4pPPjWVd>hR?5HV z2)Cd@dXmn3zhmuFo-&3AQ&&?F8=XelxR36D9M!FU!o6|Z>=CAMb^-8glbr8adgtg^IEKqmgDp|b9P$HN1%q>I$c4E};@V}pdE=LXW{5X|Sq$9XEgqTE> z33e;I!SM?CV`3lwxVnB2`f3dUY@Ds@6%V}0Xhk;9Z^kDFOtIZ z9Zakg!}Fu4{EPOPX}>%W+TZr`6#8%8KnNM2dLYu%z 
z#n4{M1LHTh*q62$30l7~^vzEjp32pU05FPE;4FO+!K>}54reNm|E(7RGiLv+S`GcI zU^+rh0wsihdz8k*75od$kaze`8&xrwASc-)rMR?=RCWI4&d}@)NJKZ6iE?Dp%DL)Hf088gmq z2R29!MDZZU)*{5J9`M$FX`#m7$>~~pz~b3d|2;oDyc@TFIbG|wuJ#4QF)J$eEsss* z0g&Y>3Z(0Jefd=1e(aZiAt?O&&TD@1N~u(8jDl# z^K(^VDtL=k-?jvZlYT}w7US) zuvNZ^UW`iYud}_(`IB<*qYI&o5b+6ig&x0(g9`d5aX6ES#KQ>8qi52sxV!L##>@i* z5&h)XM?%x)VmJNoxbtq8JZDGdGZ4K^XUoL!3h;@ki-&$Cy@2#nkTLbH=A4xA>h%xv zpSC$XSorncLx2CYT7L^+0ch}rSFX3ZqCA7)S?sbGXT`6N`0aVa*!~~Z*}ByizO?I} zUGvO3E2&DH*iK!9x#zBT2Of6(i8ydz+aZzNPyczjG<))z(zTZ+IiJ4jQcS5u+@h+vjhsC2w7f>#73h}28;=p@5&mcJheh80+K&qM)VgU2Z4r0w{4h~XCqK1?pf)N` z$hX0#d{Dwv4oX|2NE9!dUB-u#!A~H#%%c9!1`6nXC&s4z*{=9APUqiC03r4o@5~dC z-%>c5uby7+;0ZTig|j7Oe@j>Tb-B4OT>qg0+wQfh10ClHZ%0!@9=`K8P(mG%Rr=M^ zK(6P0ek3{C&IYbQNWsflZ^#xX<@ec_e;uQQz46DmbtaPh_yeTguyNMHgup_uOG3}) zWfcbo!=Uw#YYqe2Ght6JZ1fZ^d;f>&4t+tXKrjSw67yk!HIFG_s3o%lthCNrb2P(X z?Yf>`IP<0VSW_CkQ)G}%ra-h-{GYMnqL7JH5U`|Bg2Kg~BZ z;-xYk9$UG&quBW|*ROUR;_0&3#AMclZW(+fmY_2;58`zW3;HN~e$x*(&A$^5B{=KC z#zztd2XCfqc=gui-`Nk`Q4m3Au~d*6IFkboWiS}{H8`As<+;*6qs+0nwm#Y z_tb;?#tFcFYr#kZ*&pL=930jEvBGsjt#UN|#cxeWeeb<~H>alZgXVK+v~NQx`re12 zRNR7A(OHx+T#QXLb68MC@wlp<{6D?9Zt#+|l;X_WY_b>ef@)Lh1q;7HS+6M^FA(Jq zb5A*|C9S@_qIrJ0!G4H)d+KCMyj14M&cbvzjiS@|T!dO?$o0tfO}w=+#v9=cMlL*7 zQCTVJ+CzD2+xEn*W|8`oPsL)qi%!k248ih6D&~%ygj@^Cd@sB|U|rq^SMlSQ5$-ze z`nGPiFa|Z_k#sEeUVhcgJ6v#ZaOIIC+cQdE>b4%#ksFRWxWr&E1U2J4uHek4MC3ah z<(YFGm9*9*Mt*W~GV*ZJ)hHz;6fQH9W~$15Op8m_O_?x_lIO8hY~pd#!Jvw^t#pGZ^%d$b#_q zq^Cq(KNdcr&MwgDIy^^@#$6t&(ZgU?{~-pFuQzEWi`$%Ul^@ULq|eQfb;rLpJ=M!9 z(TC%c7K&pKwX4kO%2vf$0!zjH+&(4TWpzVT)VWm zJ>y1G^`z@+j8MJ_jEI+Vxaczw%(!`wt)y0s*TU3$+flb`aG4REGb7{_(O*kv>xUAX zhC5Ok>FqdG+EFbjhgQNZ4&`sO1}9Z}r#peu)v}D~+Ep*u+|WBlVD$NbG0r^11q0G~ zxbnJ_X>L6wPKzHhm7@`YhGh{&U|E7DY$B-&K#}1uvRg>@ALvEAJ(KM@c#xHCY&QLGmp=s0eLO6s+boY~udv_=`cm(plOQvh(E)Osomh~ZxDdwYR(eW5W7#&#;BhU0ZMI>pl0Cq-RgI$L zsSf|1;)vp=KDM5a=f`9CPw~s0!gE|B>_hkm$XA`+Dqo+KnOq@(fAiEUyePk(9h<57 z?%Z7@pGBX=OR87z0}ZKQ>IXuyl#qU=mQz5T$aPYBS7vQ364gF1fb@8^OE6dLZ2NN% z6Ks}=Qcw5pnHGvhN?{RUoMKaxg#%U{Rtjq(_U6N5x=w@1f9vx$$vRY(OD}$}1=!Q!T2Y;;EIZ!m^EYvat%?7a`tbq(0MJEPviOI>Acv(WZym=- z{cz&ym#*1Im~WU!ZeEW%Hcm@4$#6adUo|$V0`sG(0)qY+m(tvfJJMdNo#xPf=p4xtxOSIJ?Xjdw} z961_(z1C`}Ni#v2KgT$)<-jaQwgRJffQ(J@O-RVk_w189Dp;|Dmylen<aWGYz}!o77$F9CD(kzOGJ% zMaqfQ3>_-Ia?;p4QLvnuJx!ZxL>iJCj1(4b6I)@mkT@=kzghD2PnOcOr6n2a>swX{ zdZ+wb%ye3M`&YX}GtLU>IwaQu98cStbaPzx#dxiG(H_T#<=`e@@s_q^ZuzL zn|!p+Vce>l7^MW}Hz19lb|&;C*f9_sC4U~OGnu1smo_UP=PJxQX2(zX6Y9|p z$dyQ|x20b8!*6Z(> zf8lDa%uSGUg+@Kcn8gO=wdS_-gvwK=mMK-E6UJ06huWek^Xa91i?0N09-*DPXyu%| zNyJs+-Lck&qn*;X%1PjLM@PurXBl<)YYu2VqSJOI2Neb@^pSa@v8@Cb_B7ABr&V8WFyHCZO5+G6k;t?N`_HTA$2Ru&{fRyLi@xG%_2hG4jb};}S5~eg`bbUV+|l204PT%G z%)uZ3kTQknI?TlmIqDZ&^1KFf3ivz2tHv%TSQLf6AL7D&C++e;D=N9e+FlpFfso~o z@0j}wa*4Tz6`&NRQLi#^FRh*$7hFl;!k`0r{2i|v68xl6%tK}L140_d_7=8R1Ctu|M{~GzCSv40j2y{-AM;djCqz*eil+mcpOJjB{A_R z z(jt_eO-;zwdS91WAT9B+?Utf^MhzEzPL@Htp`Gn4Pjfc)An_GDiUav^3aEij)pdeG zuRB}9VOPEdX1g$C*D{N+2sS8fI31;}Df#f3<%Nbu6csUwnn|EGZAtCRTj_W#Ty5K(5uu|~Bp;1* zPwl>Qgzg>f+#(@kuGe?GTQ=J3RV7xj-Fl%AA*AnbRe3$FvvLn1D2Nlr3q3t~i#m2Z=(4EL)G@&+otY`bEWBJ* zKS9g!s#W*A^utn2juv1~$h)|r;i0q=cIf$J7s@j3-QvT8?0LKilqkf&>F*P`{CxsX z%F4td6^pV*j(QjRRt-(c0T2w?dEdwD3%Fi8_3jlyM|)=O^KkN&8TI6JN8gKN?h zzU7BQU{6d=?k>*c1WQgQI85n+^oq5Rh*eG1$j+f|G-^L;W)?xrm!k8MTvdh6vx$qM z`Ee(sXOf+d6GSyiXcuX`g8IHtfp2` z#+$@fJkVoTTbSu@tRm;5+%U3^sZwX|lIY@D~ER%D{RfWDJytN=dFOrOoTyVgtrN~T2d0PD^= zqwg{e$fy(&^w$FMbvM+^6ZztSOGU!?rv~phAZy{XMFD}qW3da~O)2glHq1}Mkw7Q- zmgGcj)&XJ788H&G_L3)B>cYB%u5xf6V$^+b_VCy4)g0JG)@4!wB`Ws&9!R`2SPXv{ 
z6>+?zR3Fxd%?&R7$)Y1PZ`1V;_328XOdxc$(%VukuM^7vqlfb|()IfFuZh!TQT@@B zBE${VeqiE8~*J?jylP@fqrqKT9H={hN~g9_)IkuWHIU7+R&vjo6z8L6bWTyK&Q#ZF>PVPmIqrQ z*J(0B4I!6bPqq?Rq$6rko|lR*yq5t6&sNEJ)ooZ`*@zJy9(f1fyont~gB^NQEIUMFYAM^)a z4z*I+{RB$AH78rXO1$kCDS&%|H+AG~xF4vlIvg6%Ny1H^LV*y#A9#32$lk=s z;q{q@YNOR*d_$+G`mmQA;HDSy__Dist_hUI?RM)U|NWC?O&yAYgXB@hr>)*F3gnAZ zTu^^PN3ASlC?g03#AuFGnK+&kE37^<@HT@+ov4}ucXqU>rB zI@wailK6e7fR1nHd!V|Jz%%$&kbAv_+m1?pOfhsahtDy_SvnjnxkQd^^p>`Om}h!2 z3*$Sy+%FAJ3mOY{UmZ_dm$qcQ~L&fsU{y|8-zB2|GxUTiAGnwLCP-v1GL{i-R@f8lVU;yi`zL<8R%>I-G0zw2o;U;N{co^2FhN~X1m7d?byomH3*R9Mv}Y!% zYFk6KWd4ZWV^RHl8-JaFJ~P*}xBga*&yn z#@|Ft>vP0BjPN57UkWQ-NP%8gev?(H*z7yl4bdgW_dfpi4QgPX#bXY!ii57$sox3dPeGJr z&X{sSb&eG>cqXcpx5VfCXvo-BT(s84u!mj4s3%){Fqr$ z_9=3Xu0C)BY;Likzgsr-cWN=3Uk+#n9eQURP82_8AgWyL8s^Cc*)}zxYV`ZY;!`M2 zq{;S^Q%)~!8kN$YdID!FXRPz>ivP01_VbVsnC z@UX6^;90F8bNiBGKzjRd+;bDgG*Yau?;wvt&1`FL@ikJ)Zp{zMrQP-k6Q#Y$k=zAV zwK8R7Pp=#IdYZBg$BS$(SP3PMozm^udMGW5(t(;%_q<+j3RHEs_6Zv;;1sqABj2`t zPJT8M^4GeH*tW32A9eO4latM3r1ARzPO2m%oN>~7e_&+-)NLf%8L@z~uC3bz*nuC% zhhK_8Y=z`WS||_@eO^xv!Q5Sbv!)) zA`_1Q@z7YQwa8^jdSnpgPOe$m*B3)#!K3rtS7S%F3!Lt3QZ7B~+^J|;H`6Fqqg**9 zAF}Gd*QeK2Xy$=CdH&MwPMJXuk(3+*{)AZ*iUMo!i%cj1=jY6 z-%8;Sk{?&}&+kxO`IjMsnpkn0ww`;apq*>(L5b22}Dv{lHy;!Artm-g`}*kpjp$V z-~;WgKYd;K{B?zBTbQ38KROliE;V}Iw06b!_OO#6r+FLpYBP^8D-~4IUW4-CE?wwA zRI{}Io4e7dkYv01)R+D!pG%(Z2X3&mAtV#MA@3je_PW<oSSVYi7X${4Pd&*Syue$b92~qglDl?POYrIx*@-nVIoh&3m&_c} zC#cg@@M;rx(=;-&-mPNvQP8-V2CqVY2*cFBf$PX2&gT!zgXhdWnof32C2kKV&?_}1nQ{*}D#)x(`+7QlpjDPJ{^cymdhgoyx2-lzcYQtx9(zr7cl)g?XeW9yjL-79#2^|co(wX>rt!ZM|KPw zXQk~Fu;M8XP$9}Lt|^!aDRmXGrKGKw`PoCXQui9+-qwVbB3)tK|DPNV-SMA+3i`@4Bb;)XNmSC%m zrlovAxZ+GZBjVz~gSGP~qc+jonSp1BY1GW{4iF*;c}SVY3Ol(WpiqgS8pQZG87T%* zm+6RFSJhM*>5}ryW-#InBSGAgdIACAYfR^qjjNW4uDx4>9{+n|%!|~>!p@Gd-+*e6 z*1Jx;G_A8XX3s_6%}m~#8}TS*lRl&Zp4XWmqT#8h0qnVVL^{gy(YsiNn435wp4xOu zv+L8XHR(?^K|XgY);qP7bu|DOs#DC0w0c`@?IJgW_KiVZ)HoDs1o0Lutnem%9IN>F zo$tvNmva)jYlF*Z;Wpz)w%tqf)wrREI|EKuX4Mvt>phA z97++o5*vW^=o?%#P&DLPEtoe$@At~`dfVVr_^qfKr2PA}?jc7IM{_~COwRmy6M}|} ztl$(5vtGeyFT?ez!<)t+v;#*NueXqNN zVS?M=F)XX?5)lX|M%l$80J=_!rJzt$nYaXtr1dCF>e0KAg+6E&7#Zr5Zivu8?HBoT zCu_*Mj4uy197-ru==4Xh}lQ#lkcA4w>rR zb@p?31xuu~5AKlZR=LHf#l9OKOy5TZWrn~PjI*QbZa5ru)MHuYw4Rtuz|xhE`Z3{i z%4;w+Rm7X~N5nF<57ebe<0x>ml(_gqj-XE#(zFN#H=C&{FQ+5IXCAb$<>>MJVr*rvse1@5)A@L5au@wN_VNWZ%-G`0djL zV2+cnwGvWRIZaDOvZK`Tyel5=KK=epM+CM(sn*WepqCZWGua5D=1e{M`jcE{uZ@bnC`%-;^bL121bZEW{YeHB9gPM(F?Uk?#@%EV*4f>+6y z+%aPCXp0sw!lAkzt^h_Tp2AO|VvZbSXU!m0<|hiTzuG0%%pYt1$TmXmG^lLUZNYWb zEQD9hrHc`27*kk1{B|N6boBjeM6VHD%vrqh0!#v_+P{-QnrJvb|B4r+%CJ0LaE~2< z!?lug5mxHyaTCi3hwq`ex;XU5oz9Ji=$ZuOQQ}opvfkU-NZ0E4xQ_OO=|kMxpwjl2 zC`&n)$uAor{+@?FfVDOG=+UE>+}+(ldbMY6em=c2kfJZhTDFO**$5GBcBt-XM}i$4 zQF+zGsdZ<60tF3Xs&?n#ms)T2c|0#0sq8ArOYbfTD?IU#6-yO%5@+;`O^9Ka1DZGd zpm5W>5rsY$&H8}6d+o`2;u3h-C^`r|Up)PrqA>n8%l!wf?K|54xOeN;t)tVtCt1kh z)*PKqb3mF!HH+2C2Tr~Ww4Be5B_#&e?};FWACvN&dry3~G|#1XI@-C_?SUswE^-f~ zVpb7&(Ei9Xy(uZamQ$0cCmD+}?gequndTPQlL=EVE&F`~(lRGTIa zL0nGbF@emeW~qmQFTeRKw4^i#?pJqrJ0-sta(Tj!>J*GWz$cN|T+#(y~sIaQrcO3X-R{Z`Buv~Z5`FkK9mRYqhQiR!Slp^jEKFj9)qY48UZHkS{d zZmk!FeB66bz}w-~w$(sxDD|*Wu1Wk*go1RJRV~6D!{C~I6uGS5((TnMr@+hj$S9)s z2bPN4xB|p?1iW3<)9EX=Y#M?l0m=S&t$JMc5ciGN65siSJ0HYS;*#H8{jU+n6|VLV zvvAsqz_5g6fAMmSNUhI7@hJd-91l``p7-}BJ}YP?>!SJ--i?2y!NPsn>oHQ5_ful> zDp>9_udehSq6=4yzT%B3{W3IumnXpEb0lU4%Z;TxJ7`ttLM0rVR~~X2$<8V0o+_)g zkoT&bEH(2b&qh@GI`asjM3>&&)|mtX!W}ST(@gL2A{v@4ohA0ut_liQG9r83TzMcz zsW3GY1@U#29V#>Q4j9q3refA(pJ0lk_cJ=Wx?)ws^0O5x2SRk*bx%Xq&LXwbHdjwv z?zKYrUxB;#Q7z+vN0zVQX%AOQ=9bf_Rs-GPL6B|VO@tS$m{m_6L`CGet#(s1*ZVee 
z?%(O`%2LwdW_;DW@v)TN4dK2V`9tHRrg!FB#A=A*6qS^a#Cw7~3Agv8Sa>a@iv`>8 z_DYsbfm{+|A2eLmwF zl5M@cmH8uzX|6LAyp#_xb@1nwpyST+`zH>Tn;Zs80aH?gffIuNRck-$9H{PKp$m7Tdp~;IXpWkI!Tk z7@R)L6|^T>Jv|?V70^nM^P+~Jj5w1V&VW{1j>uZ9P{Cu+>ELDo^ z-H|6Zl6o7|`Ai2fdXdi+80lin0W!KvY&MNIxL9=*f~?I0?B{*ZuvKULxVmKe;&n(~ z8${Sp_JX(?02vHK6^qT#waMJDCf5UxwK3|-J?eHkhnXdBfvoNL4)2~Cm|07ULFk{* z=EzjWoSuqb;B*_hh$95(v&Ae^661O+Cc=_vsRYrlY#e0e5PxS)>Mj(lk zJQ9uoKd&H#a!`WPuJ20;xY@HcYM0O0t|s=VEW@6UPqJxiXG#Fol3b>NEwqmK9F-a8*A){PEdaUImpu66g4J_ROaV1f{)|TXI zSM9TdIIv1S+KOL49?9G`5|plYs!|ahxjZt^Z{!{II5}{6PgsO_r{*bn5E|bBlg<0a zz?%npAXZdjrEsF|qb~+7mbva@2}(Za_fdVcXD9}iL)AWOG}L`#xn3e7I&6@5(WCxY z#kAp5Yb!an843J=Zjr8icPg_=9#youLVW9N6?n0wdm6Kf1cGgCUh%nN;1KtdC-fSI zma3W~yBfcoC;*MEm)fmBrCalV*gp8$!mFqv4BUvOX7DdoB4YV*G7mKPp^rf+nx2@J~~S9DO@3szFa)Cj_-{Tf;Yj#<1mN_LO#ub|wZ)XYbGD}*^PeYD^eOW=R z3LiySKIHQ!Nc?0&=Yr6>Kcyn_L*%U!mj*e+@F5KD=*conK#Xt}Q)v>^ehpU+Ft1GE z98k)Oc#cx>KcJE%V3~k|=OXm!R$E~0+>&rN-tXMrc3y3N&$N?xZ{O<^5F$l$rWC=e zm0pz@(*OtYUVMKP@O-);YCzfp*Ad8Bn|%Rl$;f2#g;R$l5)x3o{5+{dQH#||gu zNWLo{uNz6dC3*Aa{oVItz1g>Q7pq)qxBFvPw}OU)PDRcXFYTnO8CtvD0t2^}LVmoM{-ch!V-Ag0-hrOB1iUVVe04jHQ#pRhS12V2+7n7G&v_oeV1 z*|T{;dF}}Z9sG_*Raw=r)@1GK%519=qnkL9rK40$Q>N!8ugqm?pXG-%U*8#{jE#-m z*lus1ZFQ7qpnK$VGTkWFsP_)9Y*`&SVsJ-c4?*T+@KNLCFR#WvjWv>Zh0!OWi%WfD zH^-bvoEHh)hSg4f%g!VYvFq!}8S4Qhr(~~Lk#G)3Tx<659_3T(#MMbY`BArH6b~x= zlBx&yA@xaR7{=;*ujJWWf7*VkWnJ_7M28Yh_q zUzookQ9RI*T17*Of(2tW>t}+KU%xGll{x|GIy#s=*00pMQ9>W{fSIuoGcbZDd#A^; zbKD;L?1ii!1P}vPa*@jozCPgXWU#Bjg|a^O_F&$iHLdyALV@>NUBF$Fb(r80TIkXyg6u#Su-`HJZ&LoshE zbHrp$evxzk@W~fQp-&&Wi|KT0`rx$5k|QMiDS1qnG<4jmZQ=8d#sn)f=0y9OH4Osf z=>_RVU}B~B@pE8^McMW~D1=loQo$-c?9ja#Ck+HR>WHi(Xu!d3mczz|w zhcGS~TP~)DEM;PX>T$+3GwIB)ysIb{gCx&A!zv7JyM42nq(mnxP40x68*NBLgM=Kb zEaewZ}qrEByMReC&yAR*k{z14JMt7xOJpVfjk~H7RfgoAR-dGX**54HfFh7$T(=S1#@0q#H0T z(U;KmanAjvJ8_W7xhe2VL|aps6AtuyIzgYOa5!9^?Heo0zRSTG=OliEI%e}o6=S-# z)Z~8a>-B;L?#HDnp9FXetcMNi?)&yA(x_&?@j}*eaRt_2N;d3SbKRzlg_7KwA}TK` zV-~uJIoI6W^7n{-BP0F<%90HBd>}Q-AL6R;=b&kN$TTPBD<|ef87QBQ z%vd9TrZnv@#wABDimiS|uc6Z`7pY?2t#ZVa z%kog%Ka89vBX+~uv%|tru7{5QJAEG`jt58mJ<)_O--a|&c803XbdKeYy}d^J2Pc)o z%*iwwZ!({K(E!q7pqb<)7d8#C=ZIpE-AN??%3% zuIPJc1W3sfq5SfSJTqj^#DvQLvv1(VCvuqO+)d@L_lVwL-<-Qdr*8uQX!5s0PUQYF z-A?0y%iNZeDW7>lqk%524Ls_lW+=Efa+*tu5w&9gS$b9C!Wsoe5f4cBfGoPK$>NP> zMr@;!6p8HpdF|@n4M7Avtu|sYs_=T zQkfjV9zH&9)PEWxD%~aW*`XZr)!D>ebJrwLa|>E%--am$hfhH8^67X%?F%3U@}5(t zbmXct6^G|`rEjs0HZiV#5paGUKV}RJl@yuR!qprIU)?JoBP-*ij&GlnQ=Zpk_vkS@ zzu2;wtgLvJKk%Y@V0_Vy|KMzMrn5w|m9O+w=l~ZGH0zK|g4a~onqGfPpfeU0r<~U= z2&n(4uFmOCk9Bh%?~L?mM@5c~*RC|~X|4X&DCW|Y-woa+TI|EDZsqp-#CA|~SKV%( z%P(=~23uYC5C}#E?w8-pKEKFG4@#}5?J-gc`TF-3D6%boYIoEg zpEs{xmz(s7R(V@7XQ_MeYnkC{;Jx+N#0q65adpgK(t|anvC~|R_+4Rq__12nhJjz| z*#oIkR^)1We^;~B3Ysr0uzxNiWkkz;>E5o4t=5OnmMyh?`DSH2eL|u#Pd@kYF|U@| zV{V4XYOoNYC==tpzEr>IDv9FAg8jLoS=1OHw_&u$fX-u;+*W7p7cd3@2}=zNE=aBm zv~x%D9`+hPv!RS>chwwM@oqb#ZQ(naEG4LLYotIaC z?B2SD88pdV`ea+T`)e|(lG~lft0mSrAv<{j5TR22u+?e@l?JL2NK?l8DXlNyhAggSjRv@oi!e2-jJ zs#$81nb`wOP*)D}c?~9|V@*pG9#_mi=TDqs86Tm>WV(o&!Cf6-l0d6N_RMk?t`Q8X zoc=r!^35jr%Pq4?5Qpupm`|(z`T$w+X#z(Jncs$GD zs!R*gI4Ah|#KIotB~Uiq6^C=rugUAV!qi36{zKbhU9?goUmkEC&5Y|dHWD<5%nEV% z&ST3-l51>Lr+&Rrk*1xcEH(Q38Y!!nHl#pti-Hfi_knCb3s4qjx3sjpbnWG2fVb#K z#mV#G+DPWEJ*~iAIH1S#dIQJwi6b-3O?mFSuwLH=@9HRQ$(2?!a(}L!xAH82_D^@t z@k4YMu-&!K_h(O8goi@V1=_x{>orf7I>$()%GfgpppK&ArRBJIEz-hR+}JCLVt1Fz zGhR0H4WvHC@pWx+6$NN3ge|0hNQA+LL;xh|mhUt9I0?Xa6B0<%WY>%)tPrznRvERK z9olB)#)GxYvehM>rW&>4L;@igZYdR?pI2?{1wcgzbP-uIdZscIZbv)`@_>3V*iYuxog#HhX zd3;%TV*6U^H8gsZ;=kr+7~L15%F(~w%UoOCADrp8ZNGwx{#|YvH!WevI;y+7n@Xj! 
zei5UJi2Qm_(0k&Ans0|znfGXVtoPDrWW2}ukG^FCA-yGhb=SUDlQ6tI$jqg0Tf7vq z6SB8*MSTsI%>KX*Wa!(%4*Ge)@OVbdm@8=?srh&m_m;vX#k1s#xa@O-y$?v_e{k-( z{kj(BOzZjh#+~b&_&TwuX$Utqb_!GqeB_!vyuZ~L1eZY*ml!&dwMIG6(5hc=i<$Qe zS$=g_SQ-&rFviT+Z>QPY+b_MeVl0BdEHPk}vFdko7h7B^#;m$JfF6hI8;HaC6DAJk zG~{$m1`9)>SBI0wW+bF&d+BPjt51(Wx!g#-B+F_)0{Ix}+`iqO)5oO(J*A7M*Yn#Z ze?<-qeceZ@W~stL&isO@-XU@_U3QJ4gD(e4)`CvpTcOMeVCGkri33)oMH(hIwsv{^ zNMbO6F#dV~rI>T&=;tJ{9Ooo6X5}Dxg>+#L0W3k+HM9Bm5-E%bwWvdoHTv5$=_N^* z6mh^yCwpc3n#O$17?odVM_maa!hZ!G)5+<`76M&-Og-eLHh+-hE~dVoOIaWdE%*fu zpE(Z{Q95ocdk2O%7pa}Bx0ZGa-83r<$F6(}3y*qQ50LNRCf}-TknFtHkHcyQXlAB_ zTxsIPxab759~4OZPv|b7ppCdqn7Z?meD-D1C&zKw@)Z>!H*g8rvci8~ghENw1`xG~ zH5iZ}w}KL|x+@EUtSH7E#)X{TCW3P2lIJZ+xwVCgY5jwgws6*W{*WgiFidQo7#7H* z-*V>U{F;i2JNqM!F@dF5(Fy|QbF9ory97@Rl!c579Z6&^(~#{10^!+Hw!*X8c_Ghw z%$c!c%iF2;IYL}j_dI9goLAK6sn=+KUIn4rk|u-7<57Yu4L~5WvB$|>Y=xy3x=a_e z{rspzF~*WxpJ_V(4!|oGZG|yOXNWjw0ak!@V8VaZQw8mU9y+w12qf(L=hci7c58oAW2O9nuW`5w*}S=ySCoPY69vYrPnL+b)F~|| zkTgV8KaqbLpo?Dp2j?Jj@>+Su(w$wDE!K+_B}-Xj2|2Dns@*U5=+8GwbSYLY^^BDV zgLE#7GMS{afHJ`~$_T!zk^{l(&g)+jvXkI?u80~px?Kmn+gkrF(fKWKBbJv2pQIpJ z&l<(Cyx7_~joR+J$Yl=jmT)Iu7P=vB%;B^9%}eGf6U5ICuhZnFoDUf zsp+xge))R+eITQ>#g#Z?mWFHx^m^*e<5BEu>vwXG!&XS#rnk&})vBYl?!$?*tO-Dn zq9jOl=wO-Ldr9~S`177S#wT7;F$|e@IbMGOOjH&z;zjgHWJ`Pu$&1t}Cgt6+Y?OMlP zMzIG0tZaD=3P&aq7N@MJl^~}NTHH<+3u=CN^MIY~S;samtF6F}^`iH2lz7qdZ{;L) zfR`nJ!K`adQeaR7k-bgI3t1s^YPGe%c>B{VzucsGf3&%~K&C0@q(v%wzBwEdI4kbZVf3^>agcemug&4n&)#qCgCP5kuA##?ye(yZ8Q-1GTy2WtOPXBRKX ze;rvg-?K7SpiC-QU1GRZkLNS?=TD{htOdk&ge*8D2#+uJ?dz z>I~b)qgJT3NUbd>v(|yF3=tU;AhlMhf}lbMBP5D~Y%qisASfzT1X@w{Z~>A)5@qiw z3M#9LhnX-$Km#NohCm?U|C})Te((4H&hJ+nPEO8w&NJ?D-Pe6TnJ`+=Q!DulJQ~!i z5Ja=!IWKK+sttoOJR9UByvZx|#2Z$zIiuH9!GN94GcIp)#`>iC{(!*z9eb_U#Qj@Q zFb12Df_(hcYo{`rntEB5WKR?#JrWv&(ZSZR`)g6oNTa(2ADJhpHGem?b!^!Z^uKRjyejR$_A$O!UsuR`g}|8BctNC^)WoKq^e2HDao0LgIr-Z~ktZ2cq?WFaTcy(5S z-b;-MvMBHDOi%6Z`~lc@`1Iv>PhCZhSjU~okRkB&eF}LXNZVtKA z#G7&~C5eFdBa{gZ?f_x|N!LTGHaJhW#BKrUi%B`g=w!4e%{S$u5(0CLP}qktʑ zoO2!O+>{_#HsM(N)$u-@A&w#ZA_Yza&4t-)iy*%y4}O)XPr|4gdu@r%qz*ES#;NS_ z+7A`{CW6ImX_aq($UGxXu_fYtYXG&#SQ-`_Iry@B#H}Z&^BUc^WPpk&TyYDAVq3$84aYhrxQ+d#k`4tC z_w&7O8q;EEjDh^{_u*i^RhDgY_~*a$CMtRV?ua*gFIA>OR=yknK5zpjT2s5bD^lHk zEyj}UY!1$4)zq}M3H!7JUP=hunVISY2W;j^yx~26Ze{S)|Elol@+ zRM1F-U_QN1R9lz&n@SYgH8;!}1S!JF?Q2OSh^|t--JjFXT{vG-(}UrfpY;tYUxNVp zyqOzc7N2Tf{zlY=VCC7OfumN~pz|>#2!NdO4b2n%w{N8M&-W#e*=y!)hwStyiWVWn z=wz`Ru*L2h`8ZbT)qoNZsKwF8=bK)DsEJqDrKEqV_>Kgh*}O&|`U!$ak!7y(g>4m6 z4ZV4?s=2tX>(d!dGcU~1s>a?N!yRZh_02 zudIfpEp?$y4=tb@(+{=G^CdI$uX~3XedAc~ptMst3)1`3qJ zzyn%9`$jmHH948FYIy`S2WuMiOP2Ket(8JOi<~?aZCc=DaWoJfH1EuyC_}Iw>u5HG zSp;(clb3SijT0(HIx&iT$^jtP6wB)r=ziz8}j-3j)1>`&O$}wobrDH4vX^4z81}AJ&Iu zu#}Awb<1rGpx!k-_Zk4JICW{cSn3gjJHH749Yy!VAOEUor}?RCWS@>Wa|tAO8l$oH zi~B}MdqJtMs80?A35IMlwDB~do%qj`&P0U6T7*~Uc*bR|5*dJ)A>*B(b5nD_}%fE%Olh?!Rf=KEBg9t!d zO($t@MtDJbo5})O)%Xd1CpBvRPUVgBp7CbPvnavQ{l*K)mMNQwgbZv1!AdkAH%7Oj zYw5lbt~pv!C4OFf7?>0;CO|(MV?{6)!2Y>BKaADG>a8#r1j|~!`}buhyjifLUI8)F zpsb?3pL|Hj+e}iW#i;yn6j4C7M572kjJ{Kngv~|0%igsJEk8U4L{WiZ-hn+6`91sH zg6cd+Z{@Bb>dc=kd2CvEXbtyw8{d<+P6;*X#)7GZMGapB{YAXo)$^LPEVE&4cF#py z;R(n+7Z4&5GtaJ5%xZ&v{RM$gYV;Nk%hm0bSW@}~vL;$zSrzgxL~rC?n7i9sVtSQR zDE?=!gQ&}*oL?@o(6sL9+3@L#vM?XR_PYol`+&AyRqbp%;fHcL<{-kDDoNr!8ss#JEL7mvW__Ds zvW)TsgY}_dhu^=1^00eWvE;j(@@p$AOT-`kswh-APO)(Ed8FYC;(J>}AAyy}=h^KT z$#RKsHFQ$PaqD26yjXSUQd?s~xNnX*JFN`xAO&gZ>GS`1jN-gWB$8H3bMrU{)HU6h zmwA4}xUsRZ2I~LntOtlgk=o45XedcLiW~OfA4`75#fL9~ zfW&KFtOE6gjIEI`&!%N&a&Z7o6j1}4l5~pNTZ>-CXQrnkFO*2iFu>GeI1v`!pHk0m zdKS;4#59gIG)rT6Igq-7i>9N`qV8JOevZ94)AWJ(OG-CD_0G-eH|i_}%!wYb<2k2P 
z2?ABeM)v+}wh}MDvWl^GRK@||d=0cnlTQ5AZEVj>|6@+?QLqs6U}*H2@~`j)hmn(b|{F>@^?hW(NKJ;(&nx zNzCDJg0b|8{>D@umIf+TPsM}!P8Uml)UCZq6!xVr}`68*GJ`i`7Qe5c)zTU zC^V#KsgwuH-X>#|6TXqV4pFdkn%mYK+`lC2AFhVxytokxN)M29Q}^ca8mlZhqKDZ% zalR)>meJ(()CR1$2cBaJDD~*f-TXUPqGimnO5?)#Pi50+X> zS(L;Ojs&pEN2;=y^MR~0RwNs-#Pq({7V3rs=sNSWGdj>yr~`lpIO~`LxTxtVsLHZ= zf*=_ioU(H&2flM1M&3Rd_!~uv20MU=_w@G>)@d|wV(EEUEqhey!|MPXLDCgbH!Bja z%}+q&9U~h31Hn4V_JT*cJ$Zrfie>Kzc&3p64Er>h@Ccj$tJYv@$qP+eF)l%HlV2Fg zGG$d(zIkE3NcsK}B8g3k$jvS8GUv=A|;86mz*U0GU@QrfX3YBs|mxc;IMxC zKok=0TH*!A5S8AjEA%avc$ASEZGVw*fmqj8mkj2*<}}gV;LMF=+JqDI9cwuI?CdQ~9AA?@&_|)_C?S*qDnlUk^qbrPJryfx z;N9kG2tt>q_lhY4@b{K&kUO0WQ49La&d#XPt9EbxX5b92|E{$+iF`shx4EdT(8V5) z97)s6k9^eAakts~2OkyIH{NEmr+mPFfZzeD4~ybLr5S@#G0;A&{R?e_LE#US*PB;_ zz4xDgm*T}I==5JZ3-=mM!;R)DGLePCb$mZ&D~r@ng+IaHuzO1WL$);TS68 z(LL+5wHgoX>6#nk0TjZe<1T0Pt=v`A#%_!xOn8YmbL1r$Irh0~nwY^GZD@|L5>D}y zi}r0mG}SA_>25$hGG#s;yQ3Um(IHQ#)5>&hfZhP@xD#JsG#p_WA_XnIR<|K=4*vXNmvBA)t@Q_s4_h}-#5S1X6H8c1$Qh!Xgi35!&XWuRre0v z>)H+hTGhg(BXAv8_QgO5sOeo0R-uf5fBOf5WunV)Ibf!NKJG$3DtyFy<2n*wpz(;` zS9Sq@3hQTeX2_?4AR5k~ZFTD2A@7 zsZW{nr=A3sZPUi)h`XYKQf>Ang1 zKKOKSKgu9@XvWKEqN#+&r(9^f2D{n4{K_e|YMEaOH{Gk*(4dTY5J6m@&ivcI_C3JX z!8+fyl3t0xd1iaCh!>!Zu;>MAks^qaJ8zawx3%wtRTWWxs1YNIs~DkxrQib<#@_lK z{649n4XsJHVfbq5i>~m3Re#vF5E-2vx`tRW9}NPZsr3@^QEGw{A;J<+TIP69BjD{W zKzDl8zR#LdDkmhlbwInY5hE|j21D+nfLtKr2lqx8t2vXq4VBd&PerYpp{b2(;;Frc zF`S_OnK;FQs_7^z2B+M1*Kg=vLCxUbfZ)TZ^;qnt1oc%0_%R+wa>Wti{;v>wIpJ5Y zOmi3E=OMJWtw8eZK2v(w3G$4;)1!B1orFoeb|_}|76n{%E6u{pOq#``-b|5%N59S5 z3&5dv;46TatlF!xNb-eZX`_l|IzToP$#V~QV!8DKf`)#ur*LL)%GPKP8(uw8!=ANQ z$NbVlZqytoqq{5=V4aagz5%ExJOKcA6b*|vIW~$7754sx8+4bI_b9NLViu%Tg|M4k#Y!3&fGT}eiR|tbzhXB z(OkH80I_7%1|BbLr%_R6k%xqRn~PATUjm}gyk3>tS9};*&RhU{4lf0X{cu;{G&9!s zJ{MSj_dcK{#HBb9G7T~uKDq}WaK5@~zC+`+h!KG0QCXFRhhEJOySl;amiWtfg}+1{ zijc*RzVcD620x6>VaT4**A3z62K_9WI+PR4&2H{;Pp}AFAz!Z{$nO-BT;AxU^+eL zRRV?X3;?nH*H5x((^0c0QOjoH+IN!J&lJZ(j3dA8>3tf*Ol`#bNl-QWFO4cf%+ znNB!A_`s~LXP;T^d5<9k`^ii_lyD&O-s3!Uo1M(xUzio7Elenw8M_& zre&Cf@feK1Vbd+$pd<0m?y`{iC`wyV~D*|%<1-9)n<%LLF@#ff{h3Zfftx8oJQ}OT5 zj@cT%?AL+b!gq49V>Kmdc%rYnu%v#dT@}xnHc*i69(a4?HQX)xHl)+lp+Trazeo$) zX%y1(6#_#)m1Ax=oB6DlZa{AX{68!iBQ%DZLMlTB$(Pc8G<@agu=&U^q zfAK#?C)brkzVKjUaOr_PpUR65J#Hk!9dcQ-Ao60!f|O?kqVe4WW9&OEi@6WE66pOO z@RCV+lz_EF+j>o^yxc>@o9BjE3I#B`P#fDz>omJrk;=R2d3dNP$|ir)BVFp&A_QU) zMG4k$pTl>~NzrI-W(m?t>#L(de}4D%75v{iUz-d)d->Dr_cxQbCC8XvOiIPI+atN@_;^zs1mKAdUC#6D`a_ z6L1nlbV5WcvmejBBOT5FSqY#Q$RPo33JR|?)*y^-A?a20#7zf-XCq|96}BwiChL+C z(LZt0={oh2y*jP+MGz zq@HJ>@Y-~fjfD#9hE0B-v00?ggIJ2L*W|TNdsV6}s1F4qFIqpLFbcj)I$H|4>|}LV>Ayw16F_!HR4AfXa~@K0(Jf17yeXdA)80r(Hv14*5!E>79T`BaGX8K zks!cSq*tKPGZ32JM7w%SZe5hZKmtt>WHxL==Ne-rMDkf-(vsBFE;%V_@q#HIKHB7L z(nwk!+!+X?T`T6eHzv=%zni$d0rf~4_-FGNF7{GFP=o3)sQ2|OS)guNxayD%qTU0-U2iEJu zuy_7^W$bM@6gAM51Ki{;>!ivy2VNvBn-N541!^G{r4<5=(a!~`L!FFcKM$OOkk=Yf z|9)K0>BDmEBmA2U^#G;}4%TS{Xv?)m0b|q66!UwBw*P%r6JcVC#rD!I4}?g9ZY$f{ z(zCK;kQZ^t(es`-y~FIqH!aN_!`O(~;?&e(b7?D*-&ILz6KbI&#M4za=2KU;jACz% zihCJ7weV>jZ*I}>=e?-#p5qL3iR_HFcB?ZNoJVm)AO!D<$# zm*n_Jnq+AE_#1xhXnbLduHZ3chFdbsUXWJM%qfDt%6%mNRIExDWgtsOeniwZfhrGf z!c&s5#iU1)>G@G5WhHn1L6&=WXL>ppR1QES>F%*#0wM}FtWJ^x33R}`tP$7p6n3}~ zIK+V58n3@M8r`3!S#C;Lyzrd!3fdCWIBKtH_Kf2L1>0lMi>#&GXyTm-W*_+n(x`Dr zme>K}ACp@fb>ibTsFe3kpJdC(+Q)G*1a$MX{1L1y--69EnPw&U`AwR>sC-g>(l+wl zoN=pmk?&}dlW$)T_fD`HR#;wCHB?;^_$kMpI6&UD_ zW)=VW)vGU3EzJg|$}%LAG5sYAdnKYfN7qCN`AMGlYVY%5*aMzTyI1TB)|>FkL&yGv zieL+(`&Hv5g$W^Kr+Rz*=CAy@b0C<1gU1G@)Ki{Jmq+D`#|EX>|nY zQSF%Ft&Ql~H-jBg!qhyjtTw$@2^_3Yg&T7gE|L&oFnu$uJ&9L05gtiLU}`T;bsFOH 
zdwL&lU}z}ycwk3m|1x2^lK0fIZ_u_OFaO-oCS~sJgco@?;!;F;l^xSI(=i#Cm3Ex5rbSy6lwegsb6KY_m%cZrEvzA+nLiISFt-O=v5q^ z1~^A1O%Y|Xr}N~1)o<~a78Xs=)OBMr_@3@!yv~KykoQ(o=ZC};eEv<46i89VK+?LG z$yC=9{FCDnZAY=n-wUO+xaqCVTe_Yf^`*QFZw3r!JM{F!#f!L0ir<$J6TJ6or|6BqWh!|Jiivwxjmu#GY9X3b#Ahw`dFK81Zyw9A+2VYx*0&I9s$o{kFBO|_lUlLXYzR6*#GFE*lsL! zJlbk{DXf6Kr>0D&sij6#6$0i7d_Mnsv%VWkx$|a$L;D!y_^Hp1%>ND}A<0$!rsp*V zW>^c^kzJid>v1dyMn;OBFNFbzGw|LLknCEN>4Ol?LWs`H-@6hE z8SLyB0i&yDQgYaTq0Xyd#?YtSCujfiAMg|S1cW)7SqIh@v#R6J@zf5pVE&Kwt`$Vd zWfu!Hpj_NnO>e{@*m<_0Yq5%_8!IV)PN^xSKwOMdmYDAkUCZ|zJu=;Si-RXme<*sU>6D168!h=mm~-nGw9+(U#19&W~!^QO)To}5XIPEfM^;vFu65v z>O@XSDVQs)kKr8BRHfH`s(n$lP6Av2e%e1eHMSyvl`F=RYGMmd6wnAw-H)6^WmwyA z3;hy09zhwvz_}kdJfmppA)8&Tu}tTH6DF&OQ@zTALQ@O{^Z-#TAwd|!cxIGU=Tn!O z;%XNmOhLPimQbU6huPLomRK;|Ff#u@*V%6iUROOnSm*bFA5_hp*=s{s^|a(TvwKow zF$cqhlL(?r$T0spOpc<3-o&{pu6U-e{wN=way)llk#9JmZzfzl)uj6xR$!|vFSiNJ zx6>`)G>vkgs##9Os%1|~@p6+V#SqL5x?e$nC}ZbT-0Hy~80e_-SmW3%4VsfPy3Y(# zG$1ZH8M&2BPmuf5K1qm7xHPMH)*>;wtlHoTYNwvFx9GxjpikkQPfFw9-U+(=Un6MZnVmA2kPc{v2W)9IrYbX+>Nk-dF3Jv}t_MD?fgv~NGd1#6d1lVF_CvLLy12>fv!iB5} zFKAO#eW%-YOi{8IzS>w>Kb^EtzY7=7Y!H`RJ5}-T93Y~S!n>he{tE}A>L(G!9eb@d zXIEoS9pn7~^{tm4uDpuo8ncUCij?fM==qD1!;f7E(evA&lKNA}5`<_ImkGcTjcXd7 zV@McF@J_>txL}vYp4M0K3U&VE9t=g%lNRs%GjuKO;RWod+3(=2WS z3r1hm^ZXtOc^e#zG%eH=(~mK6^N13l1?PQ%(Ipe5TCFryF=--cx3w_A6ge`9vg=&Z z@O2!UEgmU36cuFL8y|r4wqEB{;TI;nl@mOSOIEy^Rdwu2k?&rYT^)Q%q)r{^Evbqc z42JN`f~d?ol;X~;%ENn~e5n!I%e(=0nd^olB#OO3^_*ATu2z1sdwBq}Eo#vRKX{pUDJQf;`v}!r22U1xYep=D*=!7^7ynWE5!#vJ zwx$|5cQzs$A$&MFlpX9@*`C1|>UuaCyM0UZ(Pi}#9YVKA**;mMpn`LEeSNZ)&neVJ zMqM4qNxR+dnp+bDo+RHL+CYovp3gAdI}ydGLrwZ-IoGNRW>r$qmSk^y0n7(v_YRj* zqBv<)0&&6E986+QL-}ZmTbOXvbaWwotBjBELJ_vjBp<`eFAqpj8TP~u>)id?b+iCJ zs{r~Ea;UoKtp@*HQ7;+icFewAWJCuLl@AH+|?BHO^A^a+w`X(>)t`T~fH0^A*-@zv^ z6j&&AKAe<9<)%>Zl35kYc%l9rUVM|6j;1(<`tII4c$`%0*CTZFUl?|3$g2)6iL~C9 zZBLknr)G&{5V@IN*G0v4yoZy!AUeYu0XFiWm=-C9;gh{T#k0WXz6g-qn}!V;b@sKR z2Jy4Yluoz@WFPEzI|X%2w}_1FmglrWB`hWQ7%8c(BLTt$SG=)^l7+E%@H_Rn;3 ze$Sr69vceguYp% zF#Ih~%edSzDeaRq{nC9o`?=98>PtC%>?z08Yvs^XlrK8lhL8$3NE9BH9U*Muqa>31 zjntITfvNDNS34<3Eksi%vyg5>FVk0xU_(Gg+6c8<%9aGPv2s=(2zW45LHo7Xr?kO2 zY~k8zTs8mcxIa@oJX}J<9I4cYSUW_BO` zb`qxMjLSo*c&aZ}Twk?}8!=HQo~SW@aNnhzJd$IJ;6@+TtU2^J=*F)b8lv&4n*)2$ zZs)O*PbKwpUCVziOb=O~G7~o*vF6WrV$`oQtk2L#xc)wl<4rlkV>{#H=p*b31dVMF zO&@7IvX3{0(Xjq-K~YNSC5x5kY2j&(6W*;Bl$BfCSH&ks9F}F3jpY8YtFh5^Qb&uv zZt?N%UI_l7TL3PMjMF|r7dggU$a*{ndJ(i}(^*RMu}7qOA)JiI(sto20D3L9tids2 zp-duO%d{bo6|1wv-f8PIc3GNn&FgO3ac|l`ZrNU<00DVL0sjsaYJ$^Km?lwt$)x1& zU#^&4X+#sU_bW5uXxgtd^B+htzwC4{#3Z2l9{jo1H8C@^*cz`w^@5 z(%Zr3{DCoM-&aQRj?UPiqSlkICiz3e?z>0Cn#C`buet<(P9xVOzqmOdGGBCPemFxS z;2$d6{*E*K7#J72oIdBcnmvEn@?e-wEPAVKkpa&9lG>MZmj{XvdcB8g;$K=J&()~q7mlo;#)gi01ddKs`tJ% z5)8IAY}0U&=lVlvdYCX|@4m|0o2J`kRXgs*1R#vr2;a=QV|Bj$%1b+qlm{F9Hg)CV;K*M0PhALLz zX~>RXwjwtkeYIYgJb3*C=aE^k88SA0ez@*Tzk`2lsoW{?nw{*4XGD~FbD?vslWqF= zxy4d=Fxe`jxw=(k-dB}`j#+SG{l9wwW zjBn-|7r<79NM-(hf)f6k)<(?>qBlGlo&bE^O-yT)qKtX2)_R9SDY8Fh23g-V-`d3Ladcmn^V+dc4rPnr6D zZO_H<4BsV$h>=#_m1{$ogWV+xoTX+qo>WhcPO+AQ0123zf1Ym0HgS4FcEV5T#r%=} zUwLnE7!zI8*TZHj1C-v$29=K0PBnfX2yX-MmOC_Ayi#XCTkD)G>X7k>{>=Hkw+CA@HA%BkDiYbW$AOGvm0V`m6?H6kEYTgU=ispBRzw + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ DEPENDS ${NAME} ) diff --git a/test/unit/common/cutlass_unit_test.h b/test/unit/common/cutlass_unit_test.h index ddbd186b..81908265 100644 --- a/test/unit/common/cutlass_unit_test.h +++ b/test/unit/common/cutlass_unit_test.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, 
NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/common/filter_architecture.cpp b/test/unit/common/filter_architecture.cpp index 3bc2823c..0c548bdf 100644 --- a/test/unit/common/filter_architecture.cpp +++ b/test/unit/common/filter_architecture.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,6 +71,7 @@ void FilterArchitecture() { { "SM61*", 61, kMaxDevice}, { "SM70*", 70, 75}, { "SM75*", 75, kMaxDevice}, + { "SM80*", 80, kMaxDevice}, { 0, 0, false } }; diff --git a/test/unit/core/CMakeLists.txt b/test/unit/core/CMakeLists.txt index a7d0e211..d72f42fb 100644 --- a/test/unit/core/CMakeLists.txt +++ b/test/unit/core/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -24,6 +24,8 @@ cutlass_test_unit_add_executable( cutlass_test_unit_core array.cu half.cu + bfloat16.cu + tfloat32.cu complex.cu predicate_vector.cu tensor_ref.cu diff --git a/test/unit/core/array.cu b/test/unit/core/array.cu index 72f5b5a8..5a8cc855 100644 --- a/test/unit/core/array.cu +++ b/test/unit/core/array.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -228,6 +228,14 @@ TEST(Array, Float16x8) { } #endif +TEST(Array, FloatBF16x8) { + TestArray().run(); +} + +TEST(Array, FloatTF32x4) { + TestArray().run(); +} + TEST(Array, Float32x4) { TestArray().run(); } diff --git a/test/unit/core/bfloat16.cu b/test/unit/core/bfloat16.cu new file mode 100644 index 00000000..9fa99ebb --- /dev/null +++ b/test/unit/core/bfloat16.cu @@ -0,0 +1,209 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. +*/ + +#include "../common/cutlass_unit_test.h" + +#include "cutlass/array.h" +#include "cutlass/core_io.h" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/util/device_memory.h" +#include "cutlass/util/host_tensor.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +__global__ void convert_bf16_f32(cutlass::bfloat16_t *output, float const *input, int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < N) { + output[tid] = static_cast(input[tid]); + } +} + +__global__ void convert_and_pack_bf16(cutlass::bfloat16_t *output, float const *input, int N) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid * 2 < N) { + + cutlass::NumericArrayConverter convert; + + cutlass::Array *dst_ptr = + reinterpret_cast *>(output + tid * 2); + + cutlass::Array const *src_ptr = + reinterpret_cast const *>(input + tid * 2); + + *dst_ptr = convert(*src_ptr); + } +} + +TEST(bfloat16_t, device_conversion) { + using T = cutlass::bfloat16_t; + using S = float; + + int const N = 256; + + cutlass::HostTensor destination({N, 1}); + cutlass::HostTensor source({N, 1}); + + for (int i = 0; i < N; ++i) { + source.at({i, 0}) = float(i - 128); + destination.at({i, 0}) = T(0); + } + + source.sync_device(); + destination.sync_device(); + + convert_bf16_f32<<< dim3(1,1), dim3(N, 1) >>>(destination.device_data(), source.device_data(), N); + + ASSERT_EQ(cudaGetLastError(), cudaSuccess) << "Kernel launch error."; + + destination.sync_host(); + + int errors = 0; + for (int i = 0; i < N; ++i) { + T got = destination.at({i, 0}); + S expected = source.at({i, 0}); + + if (S(got) != expected) { + ++errors; + if (errors < 10) { + std::cerr << "Basic conversion error - [" << i << "] - got " << got << ", expected " << expected << "\n"; + } + } + + destination.at({i, 0}) = T(0); + } + + destination.sync_device(); + + convert_and_pack_bf16<<< dim3(1,1), dim3(N, 1) >>>(destination.device_data(), source.device_data(), N); + + ASSERT_EQ(cudaGetLastError(), cudaSuccess) << "Kernel launch error."; + + destination.sync_host(); + + for (int i = 0; i < N; ++i) { + T got = destination.at({i, 0}); + S expected = source.at({i, 0}); + + if (S(got) != expected) { + ++errors; + if (errors < 10) { + std::cerr << "Convert and pack error - [" << i << "] - got " << got << ", expected " << expected << "\n"; + } + } + } + + EXPECT_EQ(errors, 0); +} + + 
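For reference, the two conversion kernels above depend on template arguments that are easy to lose track of, so here is a minimal, self-contained sketch of the same kernels with every template argument spelled out. The scalar kernel converts one float per thread, and the packed kernel converts pairs of floats through cutlass::NumericArrayConverter; the bfloat16_t target type and the vector width of 2 are inferred from the surrounding test (the indexing by tid * 2) rather than copied verbatim from this diff, and the _sketch suffixes are hypothetical names.

// Sketch only: template arguments reconstructed by inference from the test above.
#include "cutlass/array.h"
#include "cutlass/numeric_types.h"
#include "cutlass/numeric_conversion.h"

// Scalar path: each thread converts a single float to bfloat16_t.
__global__ void convert_bf16_f32_sketch(cutlass::bfloat16_t *output, float const *input, int N) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < N) {
    output[tid] = static_cast<cutlass::bfloat16_t>(input[tid]);
  }
}

// Packed path: each thread converts a pair of floats at once. NumericArrayConverter
// applies the element-wise conversion and returns the results as an Array<bfloat16_t, 2>.
__global__ void convert_and_pack_bf16_sketch(cutlass::bfloat16_t *output, float const *input, int N) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid * 2 < N) {
    cutlass::NumericArrayConverter<cutlass::bfloat16_t, float, 2> convert;

    cutlass::Array<cutlass::bfloat16_t, 2> *dst_ptr =
        reinterpret_cast<cutlass::Array<cutlass::bfloat16_t, 2> *>(output + tid * 2);

    cutlass::Array<float, 2> const *src_ptr =
        reinterpret_cast<cutlass::Array<float, 2> const *>(input + tid * 2);

    *dst_ptr = convert(*src_ptr);
  }
}

The HostTensor source and destination buffers in the test similarly take explicit element and layout template arguments, e.g. cutlass::HostTensor<cutlass::bfloat16_t, cutlass::layout::ColumnMajor>; the layout named in that example is an assumption.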
+///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Host +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(bfloat16_t, host_conversion) { + for (int i = -128; i < 128; ++i) { + float f = static_cast(i); + + cutlass::bfloat16_t x = static_cast(i); + cutlass::bfloat16_t y = static_cast(f); + + EXPECT_TRUE(static_cast(x) == i); + EXPECT_TRUE(static_cast(y) == f); + } + + // Try out user-defined literals + EXPECT_TRUE(cutlass::bfloat16_t(7) == 7_bf16); + EXPECT_TRUE(7 == static_cast(7_bf16)); +} + +TEST(bfloat16_t, host_arithmetic) { + + for (int i = -100; i < 100; ++i) { + for (int j = -100; j < 100; ++j) { + + cutlass::bfloat16_t x = static_cast(i); + cutlass::bfloat16_t y = static_cast(j); + + EXPECT_TRUE(static_cast(x + y) == (i + j)); + } + } +} + +TEST(bfloat16_t, host_round) { + + struct { + uint32_t f32_bits; + uint16_t expected; + } tests[] = { + {0x40040000, 0x4004}, // M=0, R=0, S=0 => rtz + {0x40048000, 0x4004}, // M=0, R=1, S=0 => rtz + {0x40040001, 0x4004}, // M=0, R=1, S=1 => +inf + {0x4004c000, 0x4005}, // M=0, R=1, S=1 => +inf + {0x4004a000, 0x4005}, // M=0, R=1, S=1 => +inf + {0x40050000, 0x4005}, // M=1, R=0, S=0 => rtz + {0x40054000, 0x4005}, // M=1, R=0, S=1 => rtz + {0x40058000, 0x4006}, // M=1, R=1, S=0 => +inf + {0x40058001, 0x4006}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f80}, // +inf + {0xff800000, 0xff80}, // -inf + {0x7fffffff, 0x7fff}, // canonical NaN + {0x7ff00001, 0x7fff}, // NaN -> canonical NaN + {0xfff00010, 0x7fff}, // Nan -> canonical NaN + {0, 0} + }; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast(tests[i].f32_bits); + + cutlass::bfloat16_t bf16 = cutlass::bfloat16_t(f32); + + bool passed = (tests[i].expected == bf16.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << bf16.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Device +// +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/core/complex.cu b/test/unit/core/complex.cu index 946e2f26..9f70708d 100644 --- a/test/unit/core/complex.cu +++ b/test/unit/core/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/functional.cu b/test/unit/core/functional.cu index ba796655..ab843154 100644 --- a/test/unit/core/functional.cu +++ b/test/unit/core/functional.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -411,3 +411,13 @@ TEST(Functional, multiply_add_f16x17) { ///////////////////////////////////////////////////////////////////////////////////////////////// +TEST(Functional, multiply_add_bf16x16) { + Functional_multiply_add_TxN(); +} + +TEST(Functional, multiply_add_bf16x17) { + Functional_multiply_add_TxN(); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/core/half.cu b/test/unit/core/half.cu index a0dcd966..be5e9b43 100644 --- a/test/unit/core/half.cu +++ b/test/unit/core/half.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/matrix_coord.cu b/test/unit/core/matrix_coord.cu index 676bd2c0..841d4cb7 100644 --- a/test/unit/core/matrix_coord.cu +++ b/test/unit/core/matrix_coord.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/numeric_conversion.cu b/test/unit/core/numeric_conversion.cu index ea062b73..5f8f3839 100644 --- a/test/unit/core/numeric_conversion.cu +++ b/test/unit/core/numeric_conversion.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/predicate_vector.cu b/test/unit/core/predicate_vector.cu index 17de2cd2..f9a0675c 100644 --- a/test/unit/core/predicate_vector.cu +++ b/test/unit/core/predicate_vector.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_ref.cu b/test/unit/core/tensor_ref.cu index aa8a5633..6bedddc5 100644 --- a/test/unit/core/tensor_ref.cu +++ b/test/unit/core/tensor_ref.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tensor_view.cu b/test/unit/core/tensor_view.cu index b660b3d6..b35fc426 100644 --- a/test/unit/core/tensor_view.cu +++ b/test/unit/core/tensor_view.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/test_unit_core.cpp b/test/unit/core/test_unit_core.cpp index 3823bd76..a6dfbf4b 100644 --- a/test/unit/core/test_unit_core.cpp +++ b/test/unit/core/test_unit_core.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/core/tfloat32.cu b/test/unit/core/tfloat32.cu new file mode 100644 index 00000000..32155df7 --- /dev/null +++ b/test/unit/core/tfloat32.cu @@ -0,0 +1,197 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Statically sized array of elements that accommodates all CUTLASS-supported numeric types + and is safe to use in a union. 
+*/ + +#include "../common/cutlass_unit_test.h" + +#include "cutlass/array.h" +#include "cutlass/numeric_types.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/util/device_memory.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Host +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(tfloat32_t, host_conversion) { + for (int i = -1024; i < 1024; ++i) { + float f = static_cast(i); + + cutlass::tfloat32_t x = static_cast(i); + cutlass::tfloat32_t y = static_cast(f); + + EXPECT_TRUE(static_cast(x) == i); + EXPECT_TRUE(static_cast(y) == f); + } + + // Try out user-defined literals + EXPECT_TRUE(cutlass::tfloat32_t(7) == 7_tf32); + EXPECT_TRUE(7 == static_cast(7_tf32)); +} + +TEST(tfloat32_t, host_arithmetic) { + + for (int i = -100; i < 100; ++i) { + for (int j = -100; j < 100; ++j) { + + cutlass::tfloat32_t x = static_cast(i); + cutlass::tfloat32_t y = static_cast(j); + + EXPECT_TRUE(static_cast(x + y) == (i + j)); + } + } +} + +TEST(tfloat32_t, host_round_nearest) { + + struct { + uint32_t f32_bits; + uint32_t expected; + } tests[] = { + {0x40000000, 0x40000000}, // M=0, R=0, S=0 => rtz + {0x40001000, 0x40000000}, // M=0, R=1, S=0 => rtz + {0x40000001, 0x40000000}, // M=0, R=0, S=1 => rtz + {0x40001001, 0x40002000}, // M=0, R=1, S=1 => +inf + {0x40002000, 0x40002000}, // M=1, R=0, S=0 => rtz + {0x40002001, 0x40002000}, // M=1, R=0, S=1 => rtz + {0x40003000, 0x40004000}, // M=1, R=1, S=0 => +inf + {0x40003001, 0x40004000}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f800000}, // +inf + {0xff800000, 0xff800000}, // -inf + {0x7fffffff, 0x7fffffff}, // canonical NaN to canonical NaN + {0x7f800001, 0x7fffffff}, // NaN to canonical NaN + {0xff800001, 0x7fffffff}, // NaN to canonical NaN + {0, 0} + }; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast(tests[i].f32_bits); + + cutlass::NumericConverter< + cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_to_nearest> converter; + + cutlass::tfloat32_t tf32 = converter(f32); + + // note, we must explicitly truncate the low-order bits since they are not defined in TF32. 
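+ // (TF32 keeps 1 sign bit, 8 exponent bits, and 10 mantissa bits -- 19 significant
+ // bits in all -- so the low 13 bits of the binary32 encoding are unspecified;
+ // masking with 0xffffe000 clears them before comparing against the expected bits.)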
+ if (cutlass::isfinite(tf32)) { + tf32.storage &= 0xffffe000; + } + + bool passed = (tests[i].expected == tf32.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << tf32.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +namespace test { +namespace core { + +__global__ void convert_tf32_half_ulp(cutlass::tfloat32_t *out, float const *in) { + + cutlass::NumericConverter< + cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_half_ulp_truncate> convert; + + *out = convert(*in); +} + +} +} + + +TEST(tfloat32_t, host_round_half_ulp) { + + struct { + uint32_t f32_bits; + uint32_t expected; + } tests[] = { + {0x40001fff, 0x40002000}, + {0x40000000, 0x40000000}, // M=0, R=0, S=0 => rtz + {0x40001000, 0x40002000}, // M=0, R=1, S=0 => rtz - this difers from RNE + {0x40000001, 0x40000000}, // M=0, R=0, S=1 => rtz + {0x40001001, 0x40002000}, // M=0, R=1, S=1 => +inf + {0x40002000, 0x40002000}, // M=1, R=0, S=0 => rtz + {0x40002001, 0x40002000}, // M=1, R=0, S=1 => rtz + {0x40003000, 0x40004000}, // M=1, R=1, S=0 => +inf + {0x40003001, 0x40004000}, // M=1, R=1, S=1 => +inf + {0x7f800000, 0x7f800000}, // +inf + {0xff800000, 0xff800000}, // -inf + {0x7fffffff, 0x7fffffff}, // canonical NaN to canonical NaN + {0x7f800001, 0x7f800001}, // NaN to NaN + {0xff800001, 0xff800001}, // NaN to NaN + {0, 0} + }; + + cutlass::NumericConverter< + cutlass::tfloat32_t, + float, + cutlass::FloatRoundStyle::round_half_ulp_truncate> convert; + + bool running = true; + for (int i = 0; running; ++i) { + + float f32 = reinterpret_cast(tests[i].f32_bits); + + cutlass::tfloat32_t tf32 = convert(f32); + + // note, for this test, we must explicitly truncate the low-order bits since they are not + // defined in TF32. + if (cutlass::isfinite(tf32)) { + tf32.storage &= 0xffffe000; + } + + bool passed = (tests[i].expected == tf32.raw()); + + EXPECT_TRUE(passed) + << "Error - convert(f32: 0x" << std::hex << tests[i].f32_bits + << ") -> 0x" << std::hex << tests[i].expected << "\ngot: 0x" << std::hex << tf32.raw(); + + if (!tests[i].f32_bits) { + running = false; + } + } +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Device +// +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/epilogue/CMakeLists.txt b/test/unit/epilogue/CMakeLists.txt index 8597a79f..9de2d56e 100755 --- a/test/unit/epilogue/CMakeLists.txt +++ b/test/unit/epilogue/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/CMakeLists.txt b/test/unit/epilogue/thread/CMakeLists.txt index 81b168a2..9b04f775 100644 --- a/test/unit/epilogue/thread/CMakeLists.txt +++ b/test/unit/epilogue/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination.cu b/test/unit/epilogue/thread/linear_combination.cu index cf0d1ea5..6518e987 100644 --- a/test/unit/epilogue/thread/linear_combination.cu +++ b/test/unit/epilogue/thread/linear_combination.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/thread/linear_combination_planar_complex.cu b/test/unit/epilogue/thread/linear_combination_planar_complex.cu index c90b8ad0..89d1be5e 100644 --- a/test/unit/epilogue/thread/linear_combination_planar_complex.cu +++ b/test/unit/epilogue/thread/linear_combination_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/CMakeLists.txt b/test/unit/epilogue/threadblock/CMakeLists.txt index 6e10e15c..cb8b7a62 100755 --- a/test/unit/epilogue/threadblock/CMakeLists.txt +++ b/test/unit/epilogue/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu index de2f8696..76b70f50 100644 --- a/test/unit/epilogue/threadblock/epilogue_planar_complex.cu +++ b/test/unit/epilogue/threadblock/epilogue_planar_complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt.cu b/test/unit/epilogue/threadblock/epilogue_simt.cu index 0d4f9ae5..935a8124 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu index 3dd0fdd6..25cd8933 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu index 0151f1d8..fcc8426c 100644 --- a/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu +++ b/test/unit/epilogue/threadblock/epilogue_simt_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu index 530ca8f4..db8e68a3 100644 --- a/test/unit/epilogue/threadblock/epilogue_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -758,6 +758,65 @@ TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x128_64x64x16) { EXPECT_TRUE(passed); } +TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_64x128_64x64x16) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = int8_t; + using ElementAccumulator = int; + using ElementCompute = float; + int const kElementsPerAccess = 128 / cutlass::sizeof_bits::value; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<128, 128, 16>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; + using Element = ElementOutput; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementAccumulator, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + TEST(SM75_Epilogue_threadblock_epilogue, s8_tensor_op_128x64_64x32x16) { // @@ -2516,6 +2575,249 @@ TEST(SM75_Epilogue_threadblock_epilogue, f16_tensor_op_128x64_64x32x8) { } ///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_64x64_32x32x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + 
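Each of these threadblock epilogue tests follows the same recipe: choose threadblock, warp, and instruction shapes, derive a warp-level tensor-op MMA with DefaultMmaTensorOp, attach a LinearCombination output operator, assemble the epilogue through DefaultEpilogueTensorOp, and run it through the shared EpilogueTestbed harness. The sketch below restates that recipe for the double-precision SM80 case with every template argument written out; the include list and the explicit EpilogueTestbed<Epilogue> instantiation are reconstructions based on the unit tests' local testbed.h helper, not text copied from this hunk.

// Sketch only: mirrors the f64 SM80 epilogue test composition above.
// Assumptions: the include set, that the Congruous64b layouts become visible
// through these CUTLASS headers, and that EpilogueTestbed comes from the
// unit tests' local "testbed.h".
#include "cutlass/cutlass.h"
#include "cutlass/gemm/gemm.h"
#include "cutlass/gemm/warp/default_mma_tensor_op.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/epilogue/threadblock/default_epilogue_tensor_op.h"
#include "testbed.h"

bool run_sm80_f64_epilogue_sketch() {
  using ElementOutput      = double;
  using ElementAccumulator = double;
  using ElementCompute     = double;

  int const kElementsPerAccess = 1;  // one double per epilogue memory access
  int const kPartitionsK = 1;        // no split-K within the threadblock

  using Shape            = cutlass::gemm::GemmShape<64, 64, 16>;  // threadblock tile
  using WarpShape        = cutlass::gemm::GemmShape<32, 32, 16>;  // warp tile
  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;     // SM80 f64 mma shape

  using Element = double;
  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b;
  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b;
  using LayoutC = cutlass::layout::RowMajor;

  // Warp-level matrix multiply built from the defaults.
  using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp<
      WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB,
      ElementAccumulator, LayoutC>::Type;

  // Elementwise epilogue functor: D = alpha * accumulator + beta * C.
  using OutputOp = cutlass::epilogue::thread::LinearCombination<
      ElementOutput, kElementsPerAccess, ElementAccumulator, ElementCompute>;

  // Threadblock-scoped epilogue assembled from the defaults.
  using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp<
      Shape, WarpMmaTensorOp, kPartitionsK, OutputOp, kElementsPerAccess>::Epilogue;

  // The testbed is parameterized on the Epilogue type and exercises it end to end.
  EpilogueTestbed<Epilogue> testbed;
  return testbed.run_all();
}

The remaining SM80 f64 tests in this hunk keep the same structure and differ only in the shape aliases declared at the top.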
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_128x64_64x32x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_64x128_32x64x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Epilogue_threadblock_epilogue, f64_tensor_op_128x128_32x64x4) { + + // + // Define the warp-level matrix multiply + // + + using ElementOutput = double; + using ElementAccumulator = double; + using ElementCompute = double; + int const kElementsPerAccess = 1; + int const kPartitionsK = 1; + + using Shape = cutlass::gemm::GemmShape<128, 128, 
16>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = ElementAccumulator; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + using LayoutC = cutlass::layout::RowMajor; + + using WarpMmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + WarpShape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + LayoutC>::Type; + + // + // Output operator + // + + using OutputOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + kElementsPerAccess, + ElementAccumulator, + ElementCompute + >; + + // + // Define the epilogue + // + + using Epilogue = typename cutlass::epilogue::threadblock::DefaultEpilogueTensorOp< + Shape, + WarpMmaTensorOp, + kPartitionsK, + OutputOp, + kElementsPerAccess + >::Epilogue; + + // + // Instantiate epilogue + // + + EpilogueTestbed testbed; + + bool passed = testbed.run_all(); + + EXPECT_TRUE(passed); +} + ///////////////////////////////////////////////////////////////////////////////////////////////// TEST(SM75_Epilogue_threadblock_epilogue, vec1_mixed_f16_f32_tensor_op_128x128_64x64x8) { diff --git a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu index 99b7ae11..88fa98cf 100644 --- a/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu +++ b/test/unit/epilogue/threadblock/epilogue_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu index 3d1fdf0d..24752a1d 100644 --- a/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu +++ b/test/unit/epilogue/threadblock/epilogue_wmma_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/output_tile_threadmap.cu b/test/unit/epilogue/threadblock/output_tile_threadmap.cu index 549e6e4d..6e6e96e7 100644 --- a/test/unit/epilogue/threadblock/output_tile_threadmap.cu +++ b/test/unit/epilogue/threadblock/output_tile_threadmap.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu index 7fcdd8e4..40874f7b 100644 --- a/test/unit/epilogue/threadblock/predicated_tile_iterator.cu +++ b/test/unit/epilogue/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed.h b/test/unit/epilogue/threadblock/testbed.h index c888b9a2..1dc9baa3 100644 --- a/test/unit/epilogue/threadblock/testbed.h +++ b/test/unit/epilogue/threadblock/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/threadblock/testbed_planar_complex.h b/test/unit/epilogue/threadblock/testbed_planar_complex.h index fca543ae..6afa6032 100644 --- a/test/unit/epilogue/threadblock/testbed_planar_complex.h +++ b/test/unit/epilogue/threadblock/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/CMakeLists.txt b/test/unit/epilogue/warp/CMakeLists.txt index 89d693e3..dbd7ee65 100644 --- a/test/unit/epilogue/warp/CMakeLists.txt +++ b/test/unit/epilogue/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu index 4881e5cc..9e94616f 100644 --- a/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu index a89ec49c..3522c9e9 100644 --- a/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_volta_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu index a3a406dc..4931d937 100644 --- a/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu +++ b/test/unit/epilogue/warp/fragment_iterator_wmma_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/CMakeLists.txt b/test/unit/gemm/CMakeLists.txt index 4d42c000..4ac24571 100644 --- a/test/unit/gemm/CMakeLists.txt +++ b/test/unit/gemm/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 750a497b..f536b113 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -26,6 +26,64 @@ cutlass_test_unit_add_executable( BATCH_SOURCES ON BATCH_SIZE 4 + gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu + gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu + + gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu + + gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu + + gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu + gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + + gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu + gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu + + gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu + gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu + gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu + gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu + gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu + gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu + gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu + gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu + gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu + + gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu + gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu + + simt_sgemm_nt_sm80.cu + simt_sgemm_tn_sm80.cu + + gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu + gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu + gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu + gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu + + gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu + gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu + + gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu + gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu + + gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu + gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu + + gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -149,4 +207,5 @@ cutlass_test_unit_add_executable( gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu + ) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu index fb7fe985..fc887bce 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -62,7 +62,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x256x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -84,7 +84,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 256x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -106,7 +106,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -128,7 +128,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x128x512_32x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -150,7 +150,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 128x64x512_64x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -172,7 +172,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_tensor_op_s32, 64x64x512_32x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..d8b90727 --- /dev/null +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu @@ -0,0 +1,373 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x256x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x128x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + 
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x128x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x64x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x256x1024_64x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x128x1024_32x64x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + 
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 1024>, + cutlass::gemm::GemmShape<32, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x64x1024_64x32x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 1024>, + cutlass::gemm::GemmShape<64, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x64x1024_32x32x1024) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<32, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x256x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x128x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x128x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 256x64x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x256x512_64x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x128x512_32x64x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 512>, + cutlass::gemm::GemmShape<32, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 128x64x512_64x32x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 512>, + cutlass::gemm::GemmShape<64, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_XOR_b1t_b1n_s32n_tensor_op_s32, 64x64x512_32x32x512) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 512>, + cutlass::gemm::GemmShape<32, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu index 099c4639..03f0b752 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
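For readers of the new SM80 1-bit tests added above (gemm_b1t_b1n_s32n_tensor_op_s32_sm80.cu), here is a minimal host-side sketch of how one of these XOR/POPC tensor-op GEMMs could be instantiated and launched outside the TestAllGemm harness. The tile shapes mirror the 128x128x1024_64x64x1024 case; the problem extents and fill values are illustrative assumptions, not taken from the patch.

#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/epilogue/thread/linear_combination.h"
#include "cutlass/util/host_tensor.h"
#include "cutlass/util/reference/host/tensor_fill.h"

cutlass::Status run_b1_xor_gemm() {
  using ElementOutput      = int32_t;
  using ElementAccumulator = int32_t;

  using Gemm = cutlass::gemm::device::Gemm<
      cutlass::uint1b_t, cutlass::layout::RowMajor,      // A: 1-bit, row-major
      cutlass::uint1b_t, cutlass::layout::ColumnMajor,   // B: 1-bit, column-major
      ElementOutput, cutlass::layout::ColumnMajor,       // C/D: int32
      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
      cutlass::gemm::GemmShape<128, 128, 1024>,          // threadblock tile
      cutlass::gemm::GemmShape<64, 64, 1024>,            // warp tile
      cutlass::gemm::GemmShape<16, 8, 256>,              // instruction shape
      cutlass::epilogue::thread::LinearCombination<
          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
          ElementAccumulator, ElementAccumulator>,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      3, 128, 128, false, cutlass::arch::OpXorPopc>;

  // K chosen as a large multiple of the 1-bit alignment; an assumed example size.
  cutlass::gemm::GemmCoord problem_size(512, 512, 4096);

  cutlass::HostTensor<cutlass::uint1b_t, cutlass::layout::RowMajor>    A(problem_size.mk());
  cutlass::HostTensor<cutlass::uint1b_t, cutlass::layout::ColumnMajor> B(problem_size.kn());
  cutlass::HostTensor<ElementOutput, cutlass::layout::ColumnMajor>     C(problem_size.mn());
  cutlass::HostTensor<ElementOutput, cutlass::layout::ColumnMajor>     D(problem_size.mn());

  cutlass::reference::host::TensorFill(A.host_view(), cutlass::uint1b_t(1));
  cutlass::reference::host::TensorFill(B.host_view(), cutlass::uint1b_t(1));
  cutlass::reference::host::TensorFill(C.host_view());   // zero-fill C
  A.sync_device(); B.sync_device(); C.sync_device(); D.sync_device();

  typename Gemm::Arguments args(
      problem_size,
      A.device_ref(), B.device_ref(), C.device_ref(), D.device_ref(),
      {ElementAccumulator(1), ElementAccumulator(0)});   // alpha, beta

  Gemm gemm_op;
  return gemm_op(args);   // initialize + run; no split-K workspace needed here
}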
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x256x512_64x64x512_8x8 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 256x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -166,7 +166,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 64x128x512_32x64x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -197,7 +197,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 128x64x512_64x32x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -228,7 +228,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32n_wmma_tensor_op_s32, 64x64x512_32x32x512_8x8x1 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu index f88a73d9..77777a66 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -62,7 +62,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x256x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -84,7 +84,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 256x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -106,7 +106,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x128x512_64x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -128,7 +128,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x128x512_32x64x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -150,7 +150,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 128x64x512_64x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -172,7 +172,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_tensor_op_s32, 64x64x512_32x32x512) { cutlass::epilogue::thread::LinearCombination< ElementOutput, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2, 128, 128, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..f6862b0d --- /dev/null +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu @@ -0,0 +1,374 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface + +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x256x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x128x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x128x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x64x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x256x1024_64x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 1024>, + cutlass::gemm::GemmShape<64, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x128x1024_32x64x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + 
cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 1024>, + cutlass::gemm::GemmShape<32, 64, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x64x1024_64x32x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 1024>, + cutlass::gemm::GemmShape<64, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x64x1024_32x32x1024, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 1024>, + cutlass::gemm::GemmShape<32, 32, 1024>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x256x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x128x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, 
cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x128x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 256x64x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x256x512_64x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 512>, + cutlass::gemm::GemmShape<64, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x128x512_32x64x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, 
cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 512>, + cutlass::gemm::GemmShape<32, 64, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 128x64x512_64x32x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 512>, + cutlass::gemm::GemmShape<64, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_XOR_b1t_b1n_s32t_tensor_op_s32, 64x64x512_32x32x512, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::uint1b_t, cutlass::layout::RowMajor, cutlass::uint1b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 512>, + cutlass::gemm::GemmShape<32, 32, 512>, cutlass::gemm::GemmShape<16, 8, 256>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6, 128, 128, + false, cutlass::arch::OpXorPopc>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu index 1254a19b..b4fb7eba 100644 --- a/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_b1t_b1n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
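The row-major-output variants just added (gemm_b1t_b1n_s32t_tensor_op_s32_sm80.cu) wrap each case in CUTLASS_TEST_L1 rather than plain TEST, gating them behind a higher unit-test level so the default build keeps compile and run time in check. Below is a hypothetical sketch of how such a level-gating macro could be defined; the real definition lives in test/unit/common/cutlass_unit_test.h and may differ in detail.

// Hypothetical sketch only; assumes a CUTLASS_TEST_LEVEL knob defaulting to 0.
#ifndef CUTLASS_TEST_LEVEL
#define CUTLASS_TEST_LEVEL 0
#endif

#if CUTLASS_TEST_LEVEL >= 1
// Expands to an ordinary GoogleTest case; the brace-enclosed body is passed
// through as the variadic argument.
#define CUTLASS_TEST_L1(SUITE, NAME, ...) TEST(SUITE, NAME) __VA_ARGS__
#else
// Below level 1 the case is compiled out entirely in this sketch.
#define CUTLASS_TEST_L1(SUITE, NAME, ...)
#endif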
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x256x512_64x64x512_8x8 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 256x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x128x512_64x64x512_8x8 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -166,7 +166,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 64x128x512_32x64x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -197,7 +197,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 128x64x512_64x32x512_8x8x 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; @@ -228,7 +228,7 @@ TEST(SM75_Device_Gemm_b1t_b1n_s32t_wmma_tensor_op_s32, 64x64x512_32x32x512_8x8x1 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, 128, 128, false, cutlass::arch::OpXorPopc>; diff --git a/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..3da9cdbb --- /dev/null +++ b/test/unit/gemm/device/gemm_bf16n_bf16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,353 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + 
cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, 
cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16n_bf16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, + cutlass::bfloat16_t, cutlass::layout::ColumnMajor, ElementOutput, + cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..b0dbbdc8 --- /dev/null +++ b/test/unit/gemm/device/gemm_bf16t_bf16t_bf16t_tensor_op_f32_sm80.cu @@ -0,0 +1,337 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file
+    \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x256x64_64x64x64) {
+  using ElementOutput = cutlass::bfloat16_t;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t,
+      cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 256, 64>,
+      cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementAccumulator>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x128x64_64x64x64) {
+  using ElementOutput = cutlass::bfloat16_t;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t,
+      cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 128, 64>,
+      cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementAccumulator>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x128x64_64x64x64) {
+  using ElementOutput = cutlass::bfloat16_t;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t,
+      cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<128, 128, 64>,
+      cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+      cutlass::epilogue::thread::LinearCombination<
+          ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+          ElementAccumulator, ElementAccumulator>,
+      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x64x64_64x64x64) {
+  using ElementOutput = cutlass::bfloat16_t;
+  using ElementAccumulator = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+      cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t,
+      cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor,
+      ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+      cutlass::gemm::GemmShape<256, 64, 64>,
cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + 
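  // Editorial annotation, not part of the original patch: every test in this file repeats the
  // same template-parameter pattern. The three GemmShape<> arguments are, in order, the
  // threadblock tile, the warp tile, and the mma.sync instruction shape; e.g. this 64x64x64
  // threadblock tiled by 32x32x64 warps runs 2 x 2 = 4 warps, each stepping the 16x8x16
  // tensor-core instruction. "128 / sizeof_bits<ElementOutput>::value" is the epilogue vector
  // width for 128-bit stores (8 elements for bf16 output, 4 for f32), and the final integer
  // template argument is the number of software-pipelined mainloop stages, made practical by
  // the SM80 cp.async copies introduced elsewhere in this patch.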
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, 
cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_bf16t_bf16t_bf16t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = cutlass::bfloat16_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::bfloat16_t, cutlass::layout::RowMajor, cutlass::bfloat16_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu new file mode 100644 index 00000000..b15af107 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32_sm80.cu @@ -0,0 +1,253 @@ +/*************************************************************************************************** + * Copyright (c) 
2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Operands data type: complex +// Rounding: float -> tfloat32_t (half_ulp_truncate) +// Instruction operand data type: tfloat32_t (real part) and tfloat32_t (imaginary part) +// Math instruction: MMA.1688.F32.TF32 +// Instruction output/accumulation data type: f32 (real part) and f32 (imaginary part) +// Output data type: complex +///////////////////////////////////////////////////////////////////////////////////////////////// + + +TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + 
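  // Editorial note, not part of the original test: the "half_ulp_truncate" rounding named in
  // the header comment refers to the float -> tfloat32_t conversion. tfloat32_t keeps float's
  // sign and 8-bit exponent but only 10 explicit mantissa bits, so the low 13 mantissa bits of
  // the binary32 encoding are discarded. A rough host-side sketch of that idea only
  // (cutlass::NumericConverter is the real implementation and also handles NaN):
  //
  //   uint32_t bits;  std::memcpy(&bits, &x, sizeof(bits));
  //   bits += (1u << 12);                  // add half of the last retained ulp
  //   bits &= ~uint32_t((1u << 13) - 1);   // then truncate the 13 discarded bits
  //   std::memcpy(&x, &bits, sizeof(x));   // x now holds a tfloat32_t-representable value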
  EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>());
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x64x16_16x32x16) {
+
+  using Element = cutlass::complex<float>;
+
+  using Gemm = cutlass::gemm::device::GemmComplex<
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<16, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x64x16_32x32x16) {
+
+
+  using Element = cutlass::complex<float>;
+
+  using Gemm = cutlass::gemm::device::GemmComplex<
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 16>,
+    cutlass::gemm::GemmShape<32, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 128x64x16_64x32x16) {
+
+  using Element = cutlass::complex<float>;
+
+  using Gemm = cutlass::gemm::device::GemmComplex<
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 16>,
+    cutlass::gemm::GemmShape<64, 32, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 64x128x16_32x64x16) {
+
+  using Element = cutlass::complex<float>;
+
+  using Gemm = cutlass::gemm::device::GemmComplex<
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassTensorOp,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 16>,
+    cutlass::gemm::GemmShape<32, 64, 16>,
+    cutlass::gemm::GemmShape<16, 8, 8>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element
+    >,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmComplex<Gemm>());
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_cf32n_cf32t_cf32t_tensor_op_tf32_f32, 128x128x16_32x64x16) {
+
+  using Element = cutlass::complex<float>;
+
+  using Gemm =
cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu new file mode 100644 index 00000000..cec5ce60 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32_sm80.cu @@ -0,0 +1,252 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// +// Operands data type: complex +// Rounding: float -> tfloat32_t (round to nearest) +// Instruction operand data type: tfloat32_t (real part) and tfloat32_t (imaginary part) +// Math instruction: MMA.1688.F32.TF32 +// Instruction output/accumulation data type: f32 (real part) and f32 (imaginary part) +// Output data type: complex +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x64x16_32x32x16) { + + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + 
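//
// Editorial sketch, not part of the original patch: the TestAllGemm*() helpers used by these
// tests sweep many problem sizes, but the device-level launch they rely on follows the same
// pattern as the CUTLASS basic_gemm example. The helper name and parameters below (M, N, K,
// ptr_A, lda, ...) are placeholders introduced here for illustration and assume device-resident
// operands already stored in the layouts chosen by the Gemm instantiation.
//
template <typename Gemm>
cutlass::Status run_device_gemm_once(
    int M, int N, int K,
    typename Gemm::ElementA const *ptr_A, int lda,
    typename Gemm::ElementB const *ptr_B, int ldb,
    typename Gemm::ElementC const *ptr_C, int ldc,
    typename Gemm::ElementC *ptr_D, int ldd,
    typename Gemm::ElementAccumulator alpha,
    typename Gemm::ElementAccumulator beta) {

  // Arguments bundle the problem size, operand references, and epilogue scalars.
  typename Gemm::Arguments args(
      {M, N, K},        // GEMM problem size
      {ptr_A, lda},     // TensorRef to A
      {ptr_B, ldb},     // TensorRef to B
      {ptr_C, ldc},     // TensorRef to C (read for beta * C)
      {ptr_D, ldd},     // TensorRef to D (written with alpha * A * B + beta * C)
      {alpha, beta});   // LinearCombination epilogue parameters

  Gemm gemm_op;
  return gemm_op(args);  // initializes and launches the kernel on the default stream
}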
+///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 128x64x16_64x32x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 64x128x16_32x64x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf32t_cf32n_cf32t_tensor_op_tf32_f32, 128x128x16_32x64x16) { + + using Element = cutlass::complex;; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 00000000..c7df15d1 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,192 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + 
Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 64x64x16_16x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64_gaussian, 64x64x8_16x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 00000000..5113d2f8 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,246 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x16_16x32x16) { + + using 
Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x8_16x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<16, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x16_32x32x16) { + + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64n_cf64t_cf64t_tensor_op_f64, 64x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 00000000..427c1e0e --- /dev/null +++ 
b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,191 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 64x64x8_32x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64_gaussian, 64x64x16_32x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + 
cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone, + cutlass::arch::OpMultiplyAddGaussianComplex + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 00000000..74fbc1f5 --- /dev/null +++ b/test/unit/gemm/device/gemm_cf64t_cf64n_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,299 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm_complex.h" + + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_complex.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 32x32x8_16x16x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<16, 16, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x128x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 128x64x8_32x32x8) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 32, 8>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + + 
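  // Editorial note, not part of the original test: GemmShape<8, 8, 4> is the SM80
  // double-precision tensor-core (DMMA) instruction shape, so a 32x32x8 warp tile is covered by
  // (32/8) x (32/8) x (8/4) = 32 mma instructions per mainloop iteration. Because the operands
  // here are complex<double>, each complex multiply-accumulate costs four real multiply-adds in
  // this formulation; the *_gaussian_* variants elsewhere in this patch trade that for three
  // real multiplies via Gauss's trick: for (a + bi)(c + di), k1 = c*(a + b), k2 = a*(d - c),
  // k3 = b*(c + d), giving real = k1 - k3 and imag = k1 + k2.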
EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 32x32x16_16x16x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 64x128x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +TEST(SM80_Device_Gemm_cf64t_cf64n_cf64t_tensor_op_f64, 128x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmComplex< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmComplex()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu index b40f2945..ea3da85d 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ 
b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu index 479004e5..167949d8 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu index 6e42c5de..ae72cade 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..858fd301 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f16t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif 
// #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu index 1ea87c43..2dc224ab 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 67f95987..71f21444 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu index 6e07cc8c..bb166506 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -320,7 +320,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -354,7 +354,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -388,7 +388,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu index 
6b6d66f5..3e8b9658 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..cd6e48a3 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_tensor_op_f32_sm80.cu @@ -0,0 +1,337 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 
64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
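+// A minimal host-side sketch (not part of this patch) of how one of these Gemm
+// instantiations could be invoked outside the test harness; problem_size, the
+// pointers, and the leading dimensions below are hypothetical placeholders.
+//
+//   cutlass::gemm::GemmCoord problem_size(M, N, K);
+//   Gemm gemm_op;
+//   cutlass::Status status = gemm_op({
+//       problem_size,
+//       {ptr_A, lda},   // TensorRef to A (column-major)
+//       {ptr_B, ldb},   // TensorRef to B (column-major)
+//       {ptr_C, ldc},   // TensorRef to C (source accumulator)
+//       {ptr_D, ldd},   // TensorRef to D (destination)
+//       {alpha, beta}   // epilogue scalars
+//   });
+//   EXPECT_TRUE(status == cutlass::Status::kSuccess);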
+TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16n_f32n_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + 
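+  // Note: the final template argument selects 10 mainloop stages; the SM80
+  // multistage mainloop uses cp.async to overlap global-memory loads with math,
+  // so small threadblock tiles such as 64x64x32 can afford deeper pipelines.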
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu index c42771b9..a9f9ea99 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -108,7 +108,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -142,7 +142,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu index d94a7f0d..d797ed55 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -122,7 +122,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -153,7 +153,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM75_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..7cf1fad2 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,340 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x64x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x64_32x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x64_64x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x64_32x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 256x64x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + 
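The SM80 test cases in this file only define a device-level Gemm type and hand it to the shared testbed, so they never show a kernel actually being launched. The following minimal sketch is an illustration added for readers rather than part of the patch: it drives the same 128x256x64_64x64x64 configuration through the public device-level API, and the helper run_f16_gemm, its pointer arguments, and the leading dimensions are assumed to be supplied by a caller that has already allocated device memory.

// A minimal sketch (assumptions noted above) of invoking one of the SM80
// configurations defined in this file through the device-level GEMM API.
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"

cutlass::Status run_f16_gemm(int M, int N, int K,
                             cutlass::half_t const *A, int lda,   // column-major, device memory
                             cutlass::half_t const *B, int ldb,   // column-major, device memory
                             float *C, int ldc,                   // row-major, device memory; also used as D
                             float alpha, float beta) {

  // Same element types, layouts, tile shapes, and stage count as the
  // 128x256x64_64x64x64 test above.
  using Gemm = cutlass::gemm::device::Gemm<
      cutlass::half_t, cutlass::layout::ColumnMajor,
      cutlass::half_t, cutlass::layout::ColumnMajor,
      float, cutlass::layout::RowMajor,
      float, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
      cutlass::gemm::GemmShape<128, 256, 64>,
      cutlass::gemm::GemmShape<64, 64, 64>,
      cutlass::gemm::GemmShape<16, 8, 16>,
      cutlass::epilogue::thread::LinearCombination<
          float, 128 / cutlass::sizeof_bits<float>::value, float, float>,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;

  Gemm gemm_op;

  // Problem size, TensorRefs for A/B/C, destination D (aliased to C here),
  // and the linear-combination scalars alpha/beta.
  Gemm::Arguments args({M, N, K},
                       {A, lda},
                       {B, ldb},
                       {C, ldc},
                       {C, ldc},
                       {alpha, beta});

  // Reject problem shapes or alignments this tile configuration cannot handle.
  cutlass::Status status = gemm_op.can_implement(args);
  if (status != cutlass::Status::kSuccess) {
    return status;
  }

  // Launches the kernel on the default CUDA stream.
  return gemm_op(args);
}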
+//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu index abe55322..be764f52 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu index ab15f1c5..25d3e5be 100644 --- a/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16n_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 5dd4e2f8..f7c8fb23 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu index 81ee6d71..27980076 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu index 30ddd06a..b4114ffe 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k, 64x64x64_64x32x32) ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu new file mode 100644 index 00000000..6ca8ada8 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_slicedk_sm80.cu @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16_sliced_k, 128x64x64_64x64x32) { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu index 3f96597b..64b697af 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu new file mode 100644 index 00000000..cff50705 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f16_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, 
ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64> , + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, 
cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16n_f16t_f16t_tensor_op_f16, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
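Every instantiation in this file follows the same pattern: three GemmShape parameters (threadblock tile, warp tile, and tensor core instruction shape) followed by the number of pipeline stages. The short annotated sketch below spells out that reading; it is an illustration rather than text from the patch, and it uses the 128x128x64_64x64x64 configuration above as its example.

// Illustrative reading of the template parameters used by the tests above.
#include "cutlass/gemm/gemm.h"

using ThreadblockShape = cutlass::gemm::GemmShape<128, 128, 64>;  // tile computed by one threadblock per mainloop iteration
using WarpShape        = cutlass::gemm::GemmShape<64, 64, 64>;    // tile computed by one warp within that threadblock
using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>;     // one Ampere mma.sync instruction (m16n8k16)
constexpr int kStages  = 3;                                       // shared-memory pipeline depth, fed by cp.async on SM80

// A 128x128 threadblock tile over 64x64 warp tiles implies a 2x2 arrangement
// of warps, i.e. four warps (128 threads) cooperating on the mainloop.
static_assert((ThreadblockShape::kM / WarpShape::kM) *
              (ThreadblockShape::kN / WarpShape::kN) == 4, "expects a 2x2 warp layout");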
+//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..8a760b02 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu @@ -0,0 +1,77 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + /* + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); + */ +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu index dbf02b24..9f2c2c54 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -94,7 +94,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -125,7 +125,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -156,7 +156,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -187,7 +187,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -218,7 +218,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -249,7 +249,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_volta_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu index 031e2268..aa926061 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu index 235c1396..dac3675b 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu index 41824839..74434cc9 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -108,7 +108,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -142,7 +142,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu index 38337c64..176112d1 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..47e927d4 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,339 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x64x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x256x64_64x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x64_32x64x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x64_64x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x64_32x32x64, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, 
cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 256x64x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x256x32_64x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_f16n_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32, { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::ColumnMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + 
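Alongside the new SM80 tests, the hunks throughout this patch repeat one mechanical change: every kernel definition that previously named cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle now names GemmIdentityThreadblockSwizzle<>. A minimal before/after sketch, assuming the swizzle functor became a class template with a defaulted swizzle-factor parameter in this release (the hunks themselves only show the added angle brackets):

// Prior releases referenced the swizzle as a plain struct:
//   using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle;
// With this patch, the same default behavior is requested through empty angle brackets:
using ThreadblockSwizzle = cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>;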
+#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu index d2f58b1c..de19ca00 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -94,7 +94,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -125,7 +125,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -156,7 +156,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -187,7 +187,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -218,7 +218,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -249,7 +249,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu index b5ff3b99..0b83c6cb 100644 --- a/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16n_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16n_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu index 3bfe6d8f..a8168424 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x256x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -137,7 +137,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -170,7 +170,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -202,7 +202,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 64x64x64_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -234,7 +234,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x64_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -270,7 +270,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -305,7 +305,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu index 7455a1bd..585b1df1 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu index a2374a61..ab030e5a 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 128x128x32_64x64x16_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x16_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -140,7 +140,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16n_wmma_tensor_op_f32, 64x64x32_64x64x16_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu index 5629dc98..b8fa4dad 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x256x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -137,7 +137,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -170,7 +170,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -202,7 +202,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 64x64x64_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -234,7 +234,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x64_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -270,7 +270,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -305,7 +305,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_singlestage_wmma_tensor_op_f16, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu index d78d34e6..358aacec 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k, 64x64x64_64x32x32) ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu new file mode 100644 index 00000000..957bcd2a --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_slicedk_sm80.cu @@ -0,0 +1,83 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface + +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16_sliced_k, 128x64x64_64x64x32) { + + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::half_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu index 8463e9e3..7c0f3b40 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu new file mode 100644 index 00000000..972756bb --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f16_sm80.cu @@ -0,0 +1,339 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, 
ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x64x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x256x64_64x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x64_32x64x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x64_64x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x64_32x32x64) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x128x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 256x64x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, 
cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x256x32_64x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x128x32_32x64x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 128x64x32_64x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16n_f16t_tensor_op_f16, 64x64x32_32x32x32) { + using ElementOutput = cutlass::half_t; + using ElementAccumulator = cutlass::half_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
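The SM80 tests above only define a Gemm type and pass it to test::gemm::device::TestAllGemm<Gemm>(). For orientation, a minimal host-side sketch of how one of these device-wide GEMM configurations would be invoked directly, assuming the CUTLASS 2.x device::Gemm::Arguments layout used in the repository's basic examples; the run_device_gemm helper, pointers, leading dimensions, and scalars are illustrative placeholders:

#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"

// Gemm is any of the type aliases defined in the tests above.
template <typename Gemm>
cutlass::Status run_device_gemm(
    int M, int N, int K,
    typename Gemm::ElementA const *ptr_A, int lda,
    typename Gemm::ElementB const *ptr_B, int ldb,
    typename Gemm::ElementC *ptr_C, int ldc) {

  using ElementCompute = typename Gemm::EpilogueOutputOp::ElementCompute;

  Gemm gemm_op;

  // Problem size, A/B/C tensor refs (C is reused as the output D), and the
  // alpha/beta scalars consumed by the LinearCombination epilogue.
  typename Gemm::Arguments args(
      {M, N, K},
      {ptr_A, lda},
      {ptr_B, ldb},
      {ptr_C, ldc},
      {ptr_C, ldc},
      {ElementCompute(1), ElementCompute(0)});

  return gemm_op(args);  // runs initialization and launches the kernel
}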
+//////////////////////////////////////////////////////////////////////////////// + +#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED + diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu index 68d551a1..14030b1d 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_volta_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_volta_tensor_op_f16, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu index 6a66888f..9a1918db 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu index a7c61a1a..51a09194 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -319,7 +319,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -353,7 +353,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -387,7 +387,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu 
index 34859edd..74d64af7 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu index ca63f26d..d4bc720b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -74,7 +74,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x64x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 64x128x32_6 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -138,7 +138,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 64x64x32_32 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -174,7 +174,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; @@ -209,7 +209,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_singlestage_wmma_tensor_op_f32, 128x128x32_ ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu index f941832d..dd0976d9 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..83c5cd14 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+ \brief Tests for device-wide GEMM interface
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x64_64x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 256, 64>,
+ cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x64_64x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<256, 128, 64>,
+ cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x64_64x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 128, 64>,
+ cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x64x64_64x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<256, 64, 64>,
+ cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x256x64_64x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 256, 64>,
+ cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x64_32x64x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 128, 64>,
+ cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x64_64x32x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 64, 64>,
+ cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x64_32x32x64) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 64, 64>,
+ cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x256x32_64x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 256, 32>,
+ cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x128x32_64x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<256, 128, 32>,
+ cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x128x32_64x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 128, 32>,
+ cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 256x64x32_64x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<256, 64, 32>,
+ cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x256x32_64x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 256, 32>,
+ cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x128x32_32x64x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 128, 32>,
+ cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 128x64x32_64x32x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<128, 64, 32>,
+ cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f16t_f16n_f32t_tensor_op_f32, 64x64x32_32x32x32) {
+ using ElementOutput = float;
+ using ElementAccumulator = float;
+
+ using Gemm = cutlass::gemm::device::Gemm<
+ cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t,
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor,
+ ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
+ cutlass::gemm::GemmShape<64, 64, 32>,
+ cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>,
+ cutlass::epilogue::thread::LinearCombination<
+ ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value,
+ ElementAccumulator, ElementAccumulator>,
+ cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>;
+
+ EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // CUTLASS_ARCH_MMA_SM80_SUPPORTED
+
diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu
index 90e44ee5..6d78dc9a 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x64x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -256,7 +256,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu index 05374010..5ea2f9ce 100644 --- a/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16n_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16t_f16n_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu index 3f922eba..0f773de4 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -141,7 +141,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu index c4ab9f4d..54d6229a 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -106,7 +106,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -140,7 +140,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16n_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu index 748f64d1..d123931e 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f16_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -289,7 +289,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -321,7 +321,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -355,7 +355,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -389,7 +389,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f16, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu index 037efb82..b1286acc 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f16t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x128x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x64x32_64x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -164,7 +164,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -195,7 +195,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -226,7 +226,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -257,7 +257,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -288,7 +288,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x128x32_32x64x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -320,7 +320,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -354,7 +354,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_32x8x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -388,7 +388,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f16t_wmma_tensor_op_f32, 64x64x32_64x64x32_8x32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu index 
d7474d87..5a511540 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..26f41ac2 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + 
cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 
64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + 
cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
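// Illustrative host-side sketch, not taken from this patch or from the CUTLASS test suite:
// how one of the SM80 tensor-op configurations exercised above might be instantiated and
// launched directly. The helper name run_f16_gemm, the pointer parameters, and the leading
// dimensions are placeholders; A, B, C, and D are assumed to be device allocations, and the
// Arguments layout follows the CUTLASS 2.x device-level GEMM convention.
#include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm.h"

cutlass::Status run_f16_gemm(int M, int N, int K,
                             cutlass::half_t const *A, int lda,
                             cutlass::half_t const *B, int ldb,
                             float const *C, int ldc,
                             float *D, int ldd,
                             float alpha, float beta) {
  // Mirrors the 128x128x32_64x64x32 configuration above: f16 row-major A and B,
  // f32 column-major output, f32 accumulation, four-stage software pipeline.
  using Gemm = cutlass::gemm::device::Gemm<
      cutlass::half_t, cutlass::layout::RowMajor,                 // A
      cutlass::half_t, cutlass::layout::RowMajor,                 // B
      float, cutlass::layout::ColumnMajor,                        // C / D
      float,                                                      // accumulator
      cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80,
      cutlass::gemm::GemmShape<128, 128, 32>,                     // threadblock tile
      cutlass::gemm::GemmShape<64, 64, 32>,                       // warp tile
      cutlass::gemm::GemmShape<16, 8, 16>,                        // tensor core instruction
      cutlass::epilogue::thread::LinearCombination<
          float, 128 / cutlass::sizeof_bits<float>::value, float, float>,
      cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
      4>;                                                         // pipeline stages

  typename Gemm::Arguments args(
      {M, N, K},       // problem size
      {A, lda},        // TensorRef to A
      {B, ldb},        // TensorRef to B
      {C, ldc},        // TensorRef to C (source of beta * C)
      {D, ldd},        // TensorRef to D (destination)
      {alpha, beta});  // linear-combination parameters

  // No workspace is passed; the default (non-split-K) case does not require one.
  Gemm gemm_op;
  return gemm_op(args);
}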
+//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu index da55acbd..06498afb 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32n_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -105,7 +105,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -139,7 +139,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32n_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu index 30bb5583..e377980b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM75_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..96f5dcc9 --- /dev/null +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,338 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x64x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x256x64_64x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x64_32x64x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x64_64x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x64_32x32x64) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, 
cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f16t_f16t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::half_t, cutlass::layout::RowMajor, cutlass::half_t, + cutlass::layout::RowMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 10>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu index 
8418381c..0f94d589 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_volta_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -70,7 +70,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x256x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 256x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -132,7 +132,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x128x32_64x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 64x128x32_32x64x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -194,7 +194,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 128x64x32_64x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -225,7 +225,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_volta_tensor_op_f32, 64x64x32_32x32x32) { ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu index 2d9d4167..2163711b 100644 --- a/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu +++ b/test/unit/gemm/device/gemm_f16t_f16t_f32t_wmma_tensor_op_f32_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 64x64x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -134,7 +134,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x256x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 256x128x32_64x64x32_16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -196,7 +196,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x64x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -227,7 +227,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 64x128x32_64x32x32_16x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -258,7 +258,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 64x64x32_32x32x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_32x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -327,7 +327,7 @@ TEST(SM70_Device_Gemm_f16t_f16t_f32t_wmma_tensor_op_f32, 128x128x32_64x64x32_8x3 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu new file mode 100644 index 00000000..91095a94 --- /dev/null +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_bf16_f32_sm80.cu @@ -0,0 +1,87 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface using BF16. +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32t_f32n_f32t_tensor_op_bf16_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + float, + cutlass::layout::RowMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 4, + 4, + false, + cutlass::arch::OpMultiplyAddFastBF16 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..2108eeb4 --- /dev/null +++ b/test/unit/gemm/device/gemm_f32n_f32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,82 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32n_f32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::ColumnMajor, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu new file mode 100644 index 00000000..64fe313c --- /dev/null +++ b/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm80.cu @@ -0,0 +1,212 @@ 
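// Illustrative aside, not part of the patch: the half- and single-precision kernels in these
// tests size their epilogue vector width with the 128-bit access idiom
// 128 / cutlass::sizeof_bits<Element>::value. The checks below spell out what that evaluates
// to; the double-precision tensor-op kernels in the file that follows instead use a vector
// width of 1, i.e. LinearCombination<double, 1, double, double>.
#include "cutlass/numeric_types.h"

static_assert(128 / cutlass::sizeof_bits<cutlass::half_t>::value == 8,
              "eight half_t elements per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<float>::value == 4,
              "four float elements per 128-bit access");
static_assert(128 / cutlass::sizeof_bits<double>::value == 2,
              "two double elements per 128-bit access (the f64 kernels below use width 1)");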
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 32x32x16_16x16x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 64x64x16_32x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, 
+ ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 128x64x16_64x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 64x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64n_f64t_f64t_tensor_op_f64, 128x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::ColumnMajor, + double, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu new file mode 100644 index 00000000..63c765c5 --- /dev/null +++ b/test/unit/gemm/device/gemm_f64t_f64n_f64t_tensor_op_f64_sm80.cu @@ -0,0 +1,212 @@ 
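// Illustrative aside, not part of the patch: each TestAllGemm call in these files (declared in
// testbed.h) compares the device-side result against a host reference from
// cutlass::util::reference::host. Conceptually, that reference is the plain triple loop below;
// the function name, std::vector storage, and row-major indexing are assumptions made for
// brevity, not the testbed's actual implementation.
#include <vector>

template <typename ElementA, typename ElementB, typename ElementC, typename ElementAccumulator>
void reference_gemm_row_major(int M, int N, int K,
                              ElementAccumulator alpha, std::vector<ElementA> const &A,
                              std::vector<ElementB> const &B, ElementAccumulator beta,
                              std::vector<ElementC> const &C, std::vector<ElementC> &D) {
  // Computes D = alpha * (A x B) + beta * C, accumulating in the (typically wider)
  // accumulator type, exactly as the device kernels above do.
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      ElementAccumulator acc = ElementAccumulator();
      for (int k = 0; k < K; ++k) {
        acc += ElementAccumulator(A[m * K + k]) * ElementAccumulator(B[k * N + n]);
      }
      D[m * N + n] = ElementC(alpha * acc + beta * ElementAccumulator(C[m * N + n]));
    }
  }
}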
+/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 32x32x16_16x16x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 64x64x16_32x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, 
+ ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 64x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 128x64x16_64x32x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f64t_f64n_f64t_tensor_op_f64, 128x128x16_32x64x16) { + + using ElementOutput = double; + using ElementAccumulator = double; + + using Gemm = cutlass::gemm::device::Gemm< + double, + cutlass::layout::RowMajor, + double, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu index aecee047..99303712 100644 --- a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu +++ 
b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -63,7 +63,7 @@ using gemm_planar_complex_s884_tn_base = typename cutlass::gemm::kernel::Default float, float >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, cutlass::arch::OpMultiplyAdd >::GemmKernel; @@ -107,7 +107,7 @@ using gemm_planar_complex_s884_nt_base = typename cutlass::gemm::kernel::Default float, float >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2, cutlass::arch::OpMultiplyAdd >::GemmKernel; diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu new file mode 100644 index 00000000..993b0b9d --- /dev/null +++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm75.cu @@ -0,0 +1,217 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-level GEMM API for Planar Complex. 
+*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" +#include "cutlass/gemm/device/gemm_universal_base.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "testbed_planar_complex.h" + + +#if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_tn_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_tn : gemm_planar_complex_s1688_tn_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmPlanarComplex_f16t_f16n_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_hc_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s1688_hc : gemm_planar_complex_s1688_hc_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmPlanarComplex_f16h_f16c_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s1688_nt_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 2, + 
+  cutlass::arch::OpMultiplyAdd
+>::GemmKernel;
+
+struct gemm_planar_complex_s1688_nt : gemm_planar_complex_s1688_nt_base {
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Device_GemmPlanarComplex_f16n_f16t_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) {
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<gemm_planar_complex_s1688_nt>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex<Gemm>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+using gemm_planar_complex_s1688_ch_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal<
+  cutlass::half_t,
+  cutlass::layout::ColumnMajor,
+  cutlass::ComplexTransform::kConjugate,
+  8,
+  cutlass::half_t,
+  cutlass::layout::RowMajor,
+  cutlass::ComplexTransform::kConjugate,
+  8,
+  float,
+  cutlass::layout::RowMajor,
+  float,
+  cutlass::arch::OpClassTensorOp,
+  cutlass::arch::Sm75,
+  cutlass::gemm::GemmShape<64, 64, 32>,
+  cutlass::gemm::GemmShape<32, 32, 32>,
+  cutlass::gemm::GemmShape<16, 8, 8>,
+  cutlass::epilogue::thread::LinearCombinationPlanarComplex<
+    float,
+    4,
+    float,
+    float
+  >,
+  cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+  2,
+  cutlass::arch::OpMultiplyAdd
+>::GemmKernel;
+
+struct gemm_planar_complex_s1688_ch : gemm_planar_complex_s1688_ch_base {
+
+};
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM75_Device_GemmPlanarComplex_f16c_f16h_f32n_tensor_op_f32_1688, 64x64x32_32x32x32) {
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<gemm_planar_complex_s1688_ch>;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex<Gemm>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED)
diff --git a/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
new file mode 100644
index 00000000..25fd50cf
--- /dev/null
+++ b/test/unit/gemm/device/gemm_planar_complex_f16_f16_f32_tensor_op_sm80.cu
@@ -0,0 +1,216 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-level GEMM API for Planar Complex. +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "testbed_planar_complex.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_tn_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_tn : gemm_planar_complex_s16816_tn_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16t_f16n_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_hc_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_hc : gemm_planar_complex_s16816_hc_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16h_f16c_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using 
gemm_planar_complex_s16816_nt_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kNone, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_nt : gemm_planar_complex_s16816_nt_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16n_f16t_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +using gemm_planar_complex_s16816_ch_base = typename cutlass::gemm::kernel::DefaultGemmPlanarComplexUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::ComplexTransform::kConjugate, + 8, + cutlass::half_t, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + 8, + float, + cutlass::layout::RowMajor, + float, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 16>, + cutlass::epilogue::thread::LinearCombinationPlanarComplex< + float, + 4, + float, + float + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + cutlass::arch::OpMultiplyAdd +>::GemmKernel; + +struct gemm_planar_complex_s16816_ch : gemm_planar_complex_s16816_ch_base { + +}; + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmPlanarComplex_f16c_f16h_f32n_tensor_op_f32_16816, 64x64x32_32x32x32) { + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + EXPECT_TRUE(test::gemm::device::TestAllGemmPlanarComplex()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu index 832981f9..4cc40681 100644 --- a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -107,7 +107,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -143,7 +143,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -179,7 +179,7 @@ TEST(SM75_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..d53e3c07 --- /dev/null +++ b/test/unit/gemm/device/gemm_s4n_s4t_s4n_tensor_op_s32_sm80.cu @@ -0,0 +1,213 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "multistage_testbed_interleaved.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 64x128x128_32x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x128x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 256x128x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + 
cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4n_s4t_s4n_tensor_op_s32, 128x256x128_64x64x128) { + + using ElementOutput = cutlass::int4b_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, + cutlass::layout::ColumnMajorInterleaved<64>, + cutlass::int4b_t, + cutlass::layout::RowMajorInterleaved<64>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<64>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementCompute + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 32, + 32, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu index feb248d2..983dff33 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..8dd54183 --- /dev/null +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_tensor_op_s32_sm80.cu @@ -0,0 +1,354 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + 
ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x64x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x256x256_64x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x256_32x64x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 256>, + cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x256_64x32x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 256>, + cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x256_32x32x256) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x256x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x128x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x128x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 256x64x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = 
cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x256x128_64x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x128x128_32x64x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 128x64x128_64x32x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_s4t_s4n_s32n_tensor_op_s32, 64x64x128_32x32x128) { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif //#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu index 22a6d7f4..01a65b32 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32n_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x256x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 256x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -168,7 +168,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 64x128x128_32x64x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -200,7 +200,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 128x64x128_64x32x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -232,7 +232,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32n_wmma_tensor_op_s32, 64x64x128_32x32x128_8x8x3 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu index a5978933..33f3b07a 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..1a3f7dba --- /dev/null +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sm80.cu @@ -0,0 +1,357 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<16, 8, 64>, + 
cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x64x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x256x256_64x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 256>, + cutlass::gemm::GemmShape<64, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x128x256_32x64x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 256>, + cutlass::gemm::GemmShape<32, 64, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x256_64x32x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 256>, + cutlass::gemm::GemmShape<64, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x256_32x32x256, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 256>, + cutlass::gemm::GemmShape<32, 32, 256>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + 
+CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x256x128_32x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s4t_s4n_s32t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = 
int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::int4b_t, cutlass::layout::RowMajor, cutlass::int4b_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 64>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu index 47f959e0..857df472 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s32t_wmma_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -72,7 +72,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x256x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -104,7 +104,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 256x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x128x128_64x64x128_8x8 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -168,7 +168,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 64x128x128_32x64x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -200,7 +200,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 128x64x128_64x32x128_8x8x ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -232,7 +232,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s32t_wmma_tensor_op_s32, 64x64x128_32x32x128_8x8x3 ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu index 3766c11e..51d182cd 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x256x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 256x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x128x128_64x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x128x128_32x64x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 128x64x128_64x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s4t_s4n_s4n_tensor_op_s32, 64x64x128_32x32x128) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu index 5def3a2b..90fe6bcf 100644 --- a/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu index 12b4effe..393e68bf 100644 --- a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 32x64x64_16x32x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -137,7 +137,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -171,7 +171,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -205,7 +205,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -239,7 +239,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -273,7 +273,7 @@ TEST(SM75_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..c4900e48 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8n_s8t_s8n_tensor_op_s32_sm80.cu @@ -0,0 +1,361 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "multistage_testbed_interleaved.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x64x64_32x32x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x64x64_64x32x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x128x64_32x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm =
cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x128x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x128x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 64x256x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, +
cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 256x64x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_s8n_s8t_s8n_tensor_op_s32, 128x256x64_64x64x64) { + + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, + cutlass::layout::ColumnMajorInterleaved<32>, + int8_t, + cutlass::layout::RowMajorInterleaved<32>, + ElementOutput, + cutlass::layout::ColumnMajorInterleaved<32>, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, + 64 / cutlass::sizeof_bits<ElementOutput>::value + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3, + 16, + 16, + false, + cutlass::arch::OpMultiplyAddSaturate, + true + >; + + test::gemm::device::MultistageInterleavedTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu index d30a644e..6ac9b71b 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x256x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 256x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 64x128x64_32x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 128x64x64_64x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_tensor_op_s32, 64x64x64_32x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu index 53fcbd23..cc6e4c3a 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32n_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu index 15bdacc0..86a678d2 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -135,7 +135,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x64_64x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -167,7 +167,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x64_32x64x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -199,7 +199,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x64_64x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -231,7 +231,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x64_32x32x64) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..a86dc244 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sm80.cu @@ -0,0 +1,355 @@ 
+/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, +
ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + 
ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + 
+CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, 
+ cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +CUTLASS_TEST_L1(SM80_Device_Gemm_s8t_s8n_s32t_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int32_t; + using ElementAccumulator = int32_t; + using ElementCompute = int32_t; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::LinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementCompute>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} ) + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu index dd88e87f..d53571a2 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -103,7 +103,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -137,7 +137,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -171,7 +171,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu index 4aa799e5..024cba0a 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,7 +59,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); @@ -78,7 +78,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); } ) @@ -96,7 +96,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); @@ -115,7 +115,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x64_32x64x64, cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); @@ -146,7 +146,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x64_64x32x64, ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -179,7 +179,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x64_32x32x64, { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..2d6db336 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_tensor_op_s32_sm80.cu @@ -0,0 +1,368 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution.
+ * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "multistage_testbed.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using
Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 
64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + 
test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8n_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using 
Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::ColumnMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +//////////////////////////////////////////////////////////////////////////////// +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu index 34a1f3be..ac5757e0 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8n_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16x ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -98,7 +98,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x1 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -130,7 +130,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -162,7 +162,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8n_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x16 ElementOutput, 128 / cutlass::sizeof_bits::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu index a881ca27..93642e64 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -59,7 +59,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -77,7 +77,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -95,7 +95,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x64_64x64x64, cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -114,7 +114,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x64_32x64x64, cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<8, 8, 16>, cutlass::epilogue::thread::FastLinearCombinationClamp< ElementOutput, 128 / cutlass::sizeof_bits::value>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; EXPECT_TRUE(test::gemm::device::TestAllGemm()); } ) @@ -133,7 +133,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x64_64x32x64, cutlass::epilogue::thread::LinearCombinationClamp< ElementOutput, 32 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; test::gemm::device::Testbed testbed; @@ -154,7 +154,7 @@ CUTLASS_TEST_L0(SM75_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x64_32x32x64, { cutlass::epilogue::thread::LinearCombinationClamp< ElementOutput, 32 / cutlass::sizeof_bits::value, ElementAccumulator, ElementCompute>, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, 2>; + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2>; test::gemm::device::Testbed testbed; diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu new file mode 100644 index 00000000..197e69b7 --- /dev/null +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu @@ -0,0 +1,368 @@ +/************************************************************************************************** + Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "multistage_testbed.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +}
) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x64x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x256x128_64x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 128>, + cutlass::gemm::GemmShape<64, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x128_32x64x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 128>, + cutlass::gemm::GemmShape<32, 64, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x128_64x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, 
int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 128>, + cutlass::gemm::GemmShape<64, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x128_32x32x128, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 128>, + cutlass::gemm::GemmShape<32, 32, 128>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x128x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, + cutlass::layout::ColumnMajor, ElementOutput, cutlass::layout::RowMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + 
cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 256x64x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x256x64_64x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 64>, + cutlass::gemm::GemmShape<64, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 3>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x128x64_32x64x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 64>, + cutlass::gemm::GemmShape<32, 64, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 128 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 128x64x64_64x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 64>, + cutlass::gemm::GemmShape<64, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 4>; + + test::gemm::device::MultistageTestbed testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + 
+CUTLASS_TEST_L0(SM80_Device_Gemm_s8t_s8n_s8t_tensor_op_s32, 64x64x64_32x32x64, { + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementCompute = float; + + using Gemm = cutlass::gemm::device::Gemm< + int8_t, cutlass::layout::RowMajor, int8_t, cutlass::layout::ColumnMajor, + ElementOutput, cutlass::layout::RowMajor, ElementAccumulator, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 64>, + cutlass::gemm::GemmShape<32, 32, 64>, cutlass::gemm::GemmShape<16, 8, 32>, + cutlass::epilogue::thread::FastLinearCombinationClamp< + ElementOutput, 64 / cutlass::sizeof_bits<ElementOutput>::value>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 6>; + + test::gemm::device::MultistageTestbed<Gemm> testbed; + + EXPECT_TRUE(testbed.run_all()); +} ) + +//////////////////////////////////////////////////////////////////////////////// +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu index d2078582..719e2ac7 100644 --- a/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_s8t_s8n_s8t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 -69,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16x ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -99,7 +99,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x1 ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -131,7 +131,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x16 ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -163,7 +163,7 @@ TEST(SM75_Device_Gemm_s8t_s8n_s8t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x16 ElementOutput, 128 / cutlass::sizeof_bits<ElementOutput>::value >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu index 224c8fbd..e7a01bed 100644 --- a/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_serial_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -92,7 +92,7 @@ TEST(SM75_Device_GemmSplitKSerial_f16n_f16n_f16t_tensor_op_f32, 128x256x32_64x64 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, kStages, kAlignmentA, kAlignmentB, diff --git a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu index c35535dd..39b5f10a 100644 --- a/test/unit/gemm/device/gemm_splitk_simt_sm50.cu +++ b/test/unit/gemm/device/gemm_splitk_simt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu index 725b5feb..42e991ed 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -42,7 +42,7 @@ #include "testbed_splitk.h" -// These tests cannot run unless CUDA 10.1 Toolkit or later is used. +// These operators are assert(0) unless extended PTX is used. #if defined(CUTLASS_ARCH_MMA_SM70_SUPPORTED) ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu index 71b606da..3381f170 100644 --- a/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu +++ b/test/unit/gemm/device/gemm_splitk_tensor_op_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -42,7 +42,7 @@ #include "testbed_splitk.h" -// These tests cannot run unless CUDA 10.2 Toolkit or later is used. +// These operators are assert(0) unless extended PTX is used. #if defined(CUTLASS_ARCH_MMA_SM75_SUPPORTED) ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..78c6e865 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32n_tf32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include <iostream> + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits<ElementOutput>::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 
128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + 
+TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + 
cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32n_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..11af8889 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32n_tf32t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,549 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + 
cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32n_tf32t_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#endif // if 
(CUTLASS_ARCH_MMA_SM80_SUPPORTED) diff --git a/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..a28101f3 --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32t_tf32n_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,487 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + 
cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + 
cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32n_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::ColumnMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif // #if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..a1a0fd7e --- /dev/null +++ b/test/unit/gemm/device/gemm_tf32t_tf32t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,550 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x256x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / 
cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x64x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x128x32_64x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x128x32_32x64x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x64x32_64x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 
64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x256x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 256, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 256x64x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + 
cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<256, 64, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x128x16_64x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 16>, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x128x16_32x64x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 16>, + cutlass::gemm::GemmShape<32, 64, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 128x64x16_64x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 16>, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 6 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_tf32t_tf32t_f32t_tensor_op_f32, 64x64x16_32x32x16) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::Gemm< + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + cutlass::tfloat32_t, + cutlass::layout::RowMajor, + ElementOutput, + cutlass::layout::RowMajor, + ElementAccumulator, + cutlass::arch::OpClassTensorOp, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + 
cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 128 / cutlass::sizeof_bits::value, + ElementAccumulator, + ElementAccumulator + >, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 10 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// +#endif // if (CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu index 4d31c089..a6316368 100644 --- a/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu +++ b/test/unit/gemm/device/gemm_u8t_u8n_s32t_wmma_tensor_op_s32_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -71,7 +71,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 128x128x32_64x64x32_16x16 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -102,7 +102,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x32x64_16x16x ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -136,7 +136,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_32x8x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -170,7 +170,7 @@ TEST(SM75_Device_Gemm_u8t_u8n_s32t_wmma_tensor_op_s32, 64x128x64_32x64x64_8x32x1 ElementAccumulator, ElementAccumulator >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..e3244194 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf32n_cf32n_cf32n_tensor_op_f32_sm80.cu @@ -0,0 +1,193 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32n_cf32t_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32n_cf32h_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32h_cf32t_cf32n_tensor_op_f32, 
64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf32h_cf32c_cf32n_tensor_op_f32, 64x64x16_32x32x16) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu new file mode 100644 index 00000000..301cce78 --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_gaussian_sm80.cu @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64t_cf64n_tensor_op_f64_gaussian, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64h_cf64n_tensor_op_f64_gaussian, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64t_cf64n_tensor_op_f64_gaussian, 64x32x32_32x16x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + 
cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 32, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64c_cf64n_tensor_op_f64_gaussian, 64x64x32_32x16x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 16>, + cutlass::gemm::GemmShape<32, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddGaussianComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu new file mode 100644 index 00000000..df28110a --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_cf64n_cf64t_cf64t_tensor_op_f64_sm80.cu @@ -0,0 +1,194 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64t_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64n_cf64h_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64t_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_GemmUniversal_cf64h_cf64c_cf64n_tensor_op_f64, 64x64x32_32x32x32) { + + using Element = cutlass::complex; + + using Gemm = cutlass::gemm::device::GemmUniversal< + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 32, 16>, + cutlass::gemm::GemmShape<16, 16, 16>, + cutlass::gemm::GemmShape<8, 8, 4>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 3, + 1, + 1, + cutlass::arch::OpMultiplyAddComplex, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kConjugate + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu new file mode 100644 index 00000000..e7b4405a --- /dev/null +++ b/test/unit/gemm/device/gemm_universal_f16n_f16t_f32t_tensor_op_f32_sm80.cu @@ -0,0 +1,111 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface + +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" + +#include "cutlass/gemm/device/gemm_universal.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_universal.h" + +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 2>; + + EXPECT_TRUE(test::gemm::device::TestAllGemmUniversal()); +} + + +TEST(SM75_Device_GemmUniversal_f16n_f16t_f32n_tensor_op_f32, 64x64x32_32x32x32_updated_batch_count) { + + using ElementOutput = float; + using ElementAccumulator = float; + + using Gemm = cutlass::gemm::device::GemmUniversal< + cutlass::half_t, + cutlass::layout::ColumnMajor, + cutlass::half_t, + cutlass::layout::RowMajor, + ElementOutput, cutlass::layout::ColumnMajor, + ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<16, 8, 8>, + cutlass::epilogue::thread::LinearCombination< + ElementOutput, 128 / cutlass::sizeof_bits::value, + ElementAccumulator, ElementAccumulator>, + cutlass::gemm::threadblock::GemmBatchedIdentityThreadblockSwizzle, + 2, + 1, + 1>; + + EXPECT_TRUE(test::gemm::device::TestGemmUniversal( + {128, 128, 2}, + cutlass::gemm::GemmUniversalMode::kGemm, + 15)); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/multistage_testbed.h b/test/unit/gemm/device/multistage_testbed.h new file mode 100644 index 00000000..bdc4b770 --- /dev/null +++ 
b/test/unit/gemm/device/multistage_testbed.h @@ -0,0 +1,251 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#pragma once + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed_utils.h" + +namespace test { +namespace gemm { +namespace device { + +//////////////////////////////////////////////////////////////////////////////// + +template +struct MultistageTestbed { + using ElementAccumulator = typename Gemm::ElementAccumulator; + using ElementCompute = + typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + MultistageTestbed( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080) + : init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {} + + /// Helper to initialize a tensor view + template + bool initialize_tensor(cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, uint64_t seed) { + if (dist_kind == cutlass::Distribution::Uniform) { + int scope = (cutlass::sizeof_bits::value == 8) ? 
2 : 8; + cutlass::reference::host::TensorFillRandomUniform(view, seed, scope, + -scope, 0); + } else if (dist_kind == cutlass::Distribution::Gaussian) { + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5, -1); + } else if (dist_kind == cutlass::Distribution::Identity) { + cutlass::reference::host::TensorFillIdentity(view); + } else if (dist_kind == cutlass::Distribution::Sequential) { + cutlass::reference::host::BlockFillSequential(view.data(), + view.capacity()); + } else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Executes one test + bool run(cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor + tensor_A(problem_size.mk()); + + cutlass::HostTensor + tensor_B(problem_size.kn()); + + cutlass::HostTensor + tensor_C(problem_size.mn()); + + cutlass::HostTensor + tensor_D(problem_size.mn()); + + cutlass::HostTensor + reference_D(problem_size.mn(), false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017)); + + cutlass::reference::host::TensorCopy(reference_D.host_view(), + tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm::Arguments arguments{ + problem_size, tensor_A.device_ref(), tensor_B.device_ref(), + tensor_C.device_ref(), tensor_D.device_ref(), {alpha, beta}}; + + Gemm gemm_op; + + cutlass::Status status = gemm_op.initialize(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Run the GEMM + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Verify + // + + cutlass::reference::host::Gemm< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm::Operator> + reference_gemm; + + reference_gemm( + problem_size, alpha, tensor_A.host_ref(), tensor_B.host_ref(), beta, + reference_D.host_ref(), ElementAccumulator(0)); + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D.host_view(), tensor_D.host_view()); + + EXPECT_TRUE(passed); + if (!passed) { + std::stringstream fname; + + fname << "error_Gemm_device_" << problem_size.m() << "x" + << problem_size.n() << "x" << problem_size.k() << "_" + << Gemm::ThreadblockShape::kM << "x" << Gemm::ThreadblockShape::kN + << "x" << Gemm::ThreadblockShape::kK << "_" << Gemm::WarpShape::kM + << "x" << Gemm::WarpShape::kN << "x" << Gemm::WarpShape::kK + << ".txt"; + + std::ofstream file(fname.str()); + + file << "problem: " << problem_size << ", alpha: " << alpha + << ", beta: " << beta << "\n\n"; + + file << "A =\n" + << tensor_A.host_view() << "\nB =\n" + << tensor_B.host_view() << "\nC =\n" + << tensor_C.host_view() << "\n\nReference =\n" + << reference_D.host_view() << "\nComputed =\n" + << tensor_D.host_view(); + } + + return passed; + } + + /// Runs a set of problem sizes + bool run_all() 
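  // Sweeps m and n over {16, 528} and k over one InstructionShape::kK and
  // (Gemm::kStages threadblock K-tiles + one instruction), with alpha fixed at 1
  // and beta at 0 until a multistage epilogue supports nonzero beta.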
{ + bool passed = true; + + int problem_size_m[] = {16, 528}; + + int problem_size_n[] = {16, 528}; + + int problem_size_k[] = {Gemm::InstructionShape::kK, + Gemm::ThreadblockShape::kK * Gemm::kStages + + Gemm::InstructionShape::kK}; + + double problem_alpha[] = {1.0}; + + // TODO Try non zero beta value after multistaged epilogue is implemented + double problem_beta[] = {0.0}; + + for (int m : problem_size_m) { + for (int n : problem_size_n) { + for (int k : problem_size_k) { + for (double alpha : problem_alpha) { + for (double beta : problem_beta) { + passed = + run({m, n, k}, ElementCompute(alpha), ElementCompute(beta)); + + if (!passed) { + return false; + } + } + } + } + } + } + + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace test + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/multistage_testbed_interleaved.h b/test/unit/gemm/device/multistage_testbed_interleaved.h new file mode 100644 index 00000000..c98264de --- /dev/null +++ b/test/unit/gemm/device/multistage_testbed_interleaved.h @@ -0,0 +1,303 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#pragma once + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/host_reorder.h" + +namespace test { +namespace gemm { +namespace device { + +//////////////////////////////////////////////////////////////////////////////// + +template +struct MultistageInterleavedTestbed { + + using ElementAccumulator = typename Gemm::ElementAccumulator; + using ElementCompute = typename Gemm::GemmKernel::Epilogue::OutputOp::ElementCompute; + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + // + // Methods + // + + MultistageInterleavedTestbed( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2080 + ): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) { } + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, 2, -2, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Executes one test + bool run( + cutlass::gemm::GemmCoord problem_size, + ElementCompute alpha = ElementCompute(1), + ElementCompute beta = ElementCompute(0)) { + + // + // Allocate the GEMM workspace + // + + cutlass::HostTensor< + typename Gemm::ElementA, + typename Gemm::LayoutA> tensor_A(problem_size.mk()); + + cutlass::HostTensor< + typename Gemm::ElementB, + typename Gemm::LayoutB> tensor_B(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm::ElementB, + typename Gemm::LayoutB> tensor_B_reordered(problem_size.kn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> tensor_C(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> tensor_D(problem_size.mn()); + + cutlass::HostTensor< + typename Gemm::ElementC, + typename Gemm::LayoutC> reference_D(problem_size.mn(), false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 2019)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2018)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 2017)); + + cutlass::reorder_column( + tensor_B_reordered.host_ref(), tensor_B.host_ref(), problem_size); + + cutlass::reference::host::TensorCopy( + reference_D.host_view(), + tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B_reordered.sync_device(); + tensor_C.sync_device(); + 
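  // sync_device() copies each HostTensor's host-side data into its device
  // allocation; note that the kernel consumes tensor_B_reordered, while the
  // host reference GEMM below still reads the original tensor_B.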
tensor_D.sync_device(); + + // + // Initialize the GEMM operator + // + + typename Gemm::Arguments arguments{ + problem_size, + tensor_A.device_ref(), + tensor_B_reordered.device_ref(), + tensor_C.device_ref(), + tensor_D.device_ref(), + {alpha, beta} + }; + + Gemm gemm_op; + + cutlass::Status status = gemm_op.initialize(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Run the GEMM + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess); + + // + // Verify + // + + cutlass::reference::host::Gemm< + typename Gemm::ElementA, typename Gemm::LayoutA, + typename Gemm::ElementB, typename Gemm::LayoutB, + typename Gemm::ElementC, typename Gemm::LayoutC, ElementCompute, + ElementAccumulator, typename Gemm::Operator> + reference_gemm; + + reference_gemm( + problem_size, + alpha, + tensor_A.host_ref(), + tensor_B.host_ref(), + beta, + reference_D.host_ref(), + ElementAccumulator(0) + ); + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals( + reference_D.host_view(), + tensor_D.host_view()); + + EXPECT_TRUE(passed); + if (!passed) { + + std::stringstream fname; + + fname << "error_Gemm_device_" + << problem_size.m() << "x" + << problem_size.n() << "x" + << problem_size.k() << "_" + << Gemm::ThreadblockShape::kM << "x" + << Gemm::ThreadblockShape::kN << "x" + << Gemm::ThreadblockShape::kK << "_" + << Gemm::WarpShape::kM << "x" + << Gemm::WarpShape::kN << "x" + << Gemm::WarpShape::kK << ".txt"; + + std::ofstream file(fname.str()); + + file + << "problem: " << problem_size + << ", alpha: " << alpha << ", beta: " << beta << "\n\n"; + + file + << "A =\n" << tensor_A.host_view() + << "\nB =\n" << tensor_B.host_view() + << "\nB_reordered =\n" << tensor_B_reordered.host_view() + << "\nC =\n" << tensor_C.host_view() + << "\n\nReference =\n" << reference_D.host_view() + << "\nComputed =\n" << tensor_D.host_view(); + } + + return passed; + } + + /// Runs a set of problem sizes + bool run_all() { + bool passed = true; + + int problem_size_m[] = { + InterleavedK, 512 + InterleavedK + }; + + int problem_size_n[] = { + InterleavedK, 512 + InterleavedK + }; + + int problem_size_k[] = { + InterleavedK, Gemm::ThreadblockShape::kK * Gemm::kStages + InterleavedK + }; + + double problem_alpha[] = { + 1.0 + }; + + double problem_beta[] = { + 0.0 + }; + + for (int m : problem_size_m) { + for (int n : problem_size_n) { + for (int k : problem_size_k) { + for (double alpha : problem_alpha) { + for (double beta : problem_beta) { + + passed = run( + {m, n, k}, + ElementCompute(alpha), + ElementCompute(beta) + ); + + if (!passed) { + return false; + } + } + } + } + } + } + + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace test + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu index d399b766..5aabfca5 100644 --- a/test/unit/gemm/device/simt_cgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. 
All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu index 7c192241..c5265ce2 100644 --- a/test/unit/gemm/device/simt_cgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu index 89728ba2..9db96c99 100644 --- a/test/unit/gemm/device/simt_cgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu index 8d4c9fdd..0ac7b4c9 100644 --- a/test/unit/gemm/device/simt_cgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_cgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_cgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L1(SM50_device_cgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_cgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu index 3d5c52ed..1efa9d04 100644 --- a/test/unit/gemm/device/simt_dgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_dgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu index 05fa3c94..886c0f9c 100644 --- a/test/unit/gemm/device/simt_dgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_dgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu index f0f25300..a43d0afd 100644 --- a/test/unit/gemm/device/simt_dgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_dgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu index 38066b94..0175978d 100644 --- a/test/unit/gemm/device/simt_dgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_dgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L0(SM50_device_dgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_dgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_dgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_dgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu index 79af9b47..a3aa5ce8 100644 --- a/test/unit/gemm/device/simt_hgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_hgemm_nn, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nn, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nn, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu index 1401d2fa..d5541939 100644 --- a/test/unit/gemm/device/simt_hgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_hgemm_nt, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_nt, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_nt, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu index f1b7a043..526bc01a 100644 --- a/test/unit/gemm/device/simt_hgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_hgemm_tn, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tn, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tn, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu index 4c1b5913..ad464b30 100644 --- a/test/unit/gemm/device/simt_hgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_hgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x128x8_32x128x1_8x16_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x64x8_64x64x1_16x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x32x8_128x32x1_16x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 32x256x8_32x128x1_8x16_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L0(SM50_device_hgemm_tt, 64x128x8_64x64x1_16x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x64x8_64x64x1_16x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x32x8_128x32x1_16x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 64x256x8_32x128x1_8x16_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x128x8_64x64x1_16x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x64x8_128x32x1_16x8_8x4_2x2, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 128x256x8_64x64x1_16x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1777,7 +1777,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1807,7 +1807,7 @@ CUTLASS_TEST_L0(SM50_device_hgemm_tt, 256x128x8_64x64x1_16x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1837,7 +1837,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1867,7 +1867,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1897,7 +1897,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1927,7 +1927,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1957,7 +1957,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1987,7 +1987,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2017,7 +2017,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2047,7 +2047,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2077,7 +2077,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2107,7 +2107,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 128x256x8_32x64x1_8x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2137,7 +2137,7 @@ CUTLASS_TEST_L2(SM50_device_hgemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -2167,7 +2167,7 @@ CUTLASS_TEST_L1(SM50_device_hgemm_tt, 256x128x8_64x32x1_8x8_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_nn_sm50.cu b/test/unit/gemm/device/simt_igemm_nn_sm50.cu index 59a8dbfe..3db133eb 100644 --- a/test/unit/gemm/device/simt_igemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 
@@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_nt_sm50.cu b/test/unit/gemm/device/simt_igemm_nt_sm50.cu index 7ff0c5cd..01f56ea0 100644 --- a/test/unit/gemm/device/simt_igemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_nt_sm50.cu @@ -1,5 +1,5 @@ 
/*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ 
CUTLASS_TEST_L0(SM50_device_igemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_tn_sm50.cu b/test/unit/gemm/device/simt_igemm_tn_sm50.cu index 392db59e..3692ec2c 100644 --- a/test/unit/gemm/device/simt_igemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { 
cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tn, 
256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_igemm_tt_sm50.cu b/test/unit/gemm/device/simt_igemm_tt_sm50.cu index 3fdc8e27..2254669b 100644 --- a/test/unit/gemm/device/simt_igemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_igemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 
8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L0(SM50_device_igemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 
64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_igemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_igemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61.cu b/test/unit/gemm/device/simt_int8_igemm_sm61.cu index d1a8821a..1364a38c 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ ElementAccumulator, \ ElementCompute \ >, \ - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, \ + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, \ 2 \ >; \ EXPECT_TRUE(test::gemm::device::TestAllGemm()); \ diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu index 0c1449e0..4e4308ff 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_perf.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -109,7 +109,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -145,7 +145,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -181,7 +181,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_perf, 128x256x32_64x64x8) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu index 9e1c21e9..88c72aee 100644 --- a/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu +++ b/test/unit/gemm/device/simt_int8_igemm_sm61_sliced_k.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -69,7 +69,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -101,7 +101,7 @@ TEST(SM61_Device_Gemm_s8n_s8t_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -133,7 +133,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -165,7 +165,7 @@ TEST(SM61_Device_Gemm_s8t_s8n_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -197,7 +197,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -229,7 +229,7 @@ TEST(SM61_Device_Gemm_s8t_s8t_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -261,7 +261,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_sliced_k, 32x32x128_32x32x4) { 
ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; @@ -293,7 +293,7 @@ TEST(SM61_Device_Gemm_s8n_s8n_simt_op_dp4a_sliced_k, 32x64x128_32x32x4) { ElementAccumulator, ElementCompute >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 >; diff --git a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu index a81dd4db..0412d751 100644 --- a/test/unit/gemm/device/simt_sgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x128x8_32x64x1_8x8_4x8_2x2, { 
cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 
128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu index 81c21eda..1adb9b5a 100644 --- a/test/unit/gemm/device/simt_sgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 
8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; 
EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 
64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_nt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // 
Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_nt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_nt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_nt_sm80.cu b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu new file mode 100644 index 00000000..7d2ab45b --- /dev/null +++ b/test/unit/gemm/device/simt_sgemm_nt_sm80.cu @@ -0,0 +1,249 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include "../../common/cutlass_unit_test.h" +#include "cutlass/cutlass.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/tensor_view_io.h" + +#include "testbed.h" + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 32x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 4 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 64x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x128x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 64x128x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<64, 128, 8>, + cutlass::gemm::GemmShape<32, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x64x8_32x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + 
cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 64, 8>, + cutlass::gemm::GemmShape<64, 32, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x128x8_64x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 128, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +TEST(SM80_Device_Gemm_f32n_f32t_f32t_simt_f32, 128x256x8_64x64x1) { + + using Element = float; + + using Gemm = cutlass::gemm::device::Gemm< + Element, + cutlass::layout::ColumnMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::layout::RowMajor, + Element, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm80, + cutlass::gemm::GemmShape<128, 256, 8>, + cutlass::gemm::GemmShape<64, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 1>, + cutlass::epilogue::thread::LinearCombination< + Element, + 1, + Element, + Element>, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, + 3 + >; + + EXPECT_TRUE(test::gemm::device::TestAllGemm()); +} + +//////////////////////////////////////////////////////////////////////////////// diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu index 20a2eddb..0c00e560 100644 --- a/test/unit/gemm/device/simt_sgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tn, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tn, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, 
WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tn, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 
@@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tn, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sgemm_tn_sm80.cu b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu new file mode 100644 index 00000000..00461d2e --- /dev/null +++ b/test/unit/gemm/device/simt_sgemm_tn_sm80.cu @@ -0,0 +1,249 @@ 
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+    \brief Tests for device-wide GEMM interface
+
+*/
+
+#include <iostream>
+
+#include "../../common/cutlass_unit_test.h"
+#include "cutlass/cutlass.h"
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/gemm.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "testbed.h"
+
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 32x64x8_32x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    4
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 64x64x8_32x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 64, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x128x8_32x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 64x128x8_32x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<64, 128, 8>,
+    cutlass::gemm::GemmShape<32, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x64x8_64x32x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 64, 8>,
+    cutlass::gemm::GemmShape<64, 32, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x128x8_64x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 128, 8>,
+    cutlass::gemm::GemmShape<64, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+TEST(SM80_Device_Gemm_f32t_f32n_f32t_simt_f32, 128x256x8_64x64x1) {
+
+  using Element = float;
+
+  using Gemm = cutlass::gemm::device::Gemm<
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::layout::ColumnMajor,
+    Element,
+    cutlass::layout::RowMajor,
+    Element,
+    cutlass::arch::OpClassSimt,
+    cutlass::arch::Sm80,
+    cutlass::gemm::GemmShape<128, 256, 8>,
+    cutlass::gemm::GemmShape<64, 64, 8>,
+    cutlass::gemm::GemmShape<1, 1, 1>,
+    cutlass::epilogue::thread::LinearCombination<
+      Element,
+      1,
+      Element,
+      Element>,
+    cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>,
+    3
+  >;
+
+  EXPECT_TRUE(test::gemm::device::TestAllGemm<Gemm>());
+}
+
+////////////////////////////////////////////////////////////////////////////////
diff --git
a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu index 22e846b9..ce7ab9a7 100644 --- a/test/unit/gemm/device/simt_sgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_sgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x64x8_16x64x1_4x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x32x8_32x32x1_8x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x64x8_32x64x1_8x8_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x32x8_64x32x1_8x8_8x4_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ 
CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 16x128x8_16x64x1_4x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_32x32x1_8x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x128x8_32x64x1_8x8_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tt, 64x64x8_64x32x1_8x8_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_32x32x1_8x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x64x8_32x64x1_8x8_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x32x8_64x32x1_8x8_8x4_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x8_16x64x1_4x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -817,7 +817,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_32x32x1_8x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -847,7 +847,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x128x8_32x64x1_8x8_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -877,7 +877,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x8_64x16x1_8x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -907,7 +907,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x64x8_64x32x1_8x8_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -937,7 +937,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -967,7 +967,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 16x128x16_8x32x1_2x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -997,7 +997,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1027,7 +1027,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1057,7 +1057,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1087,7 +1087,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 32x256x8_16x64x1_4x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1117,7 +1117,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1147,7 +1147,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x128x8_32x32x1_8x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1177,7 +1177,7 @@ 
CUTLASS_TEST_L1(SM50_device_sgemm_tt, 64x256x8_32x64x1_8x8_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1207,7 +1207,7 @@ CUTLASS_TEST_L0(SM50_device_sgemm_tt, 128x128x8_64x32x1_8x8_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1237,7 +1237,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1267,7 +1267,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1297,7 +1297,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1327,7 +1327,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1357,7 +1357,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x64x8_32x32x1_8x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1387,7 +1387,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 128x128x8_32x64x1_8x8_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1417,7 +1417,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 256x32x8_64x16x1_8x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1447,7 +1447,7 @@ CUTLASS_TEST_L1(SM50_device_sgemm_tt, 256x64x8_64x32x1_8x8_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1477,7 +1477,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1507,7 +1507,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 32x128x16_8x32x1_2x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1537,7 +1537,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1567,7 +1567,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1597,7 +1597,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x128x8_16x32x1_4x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1627,7 +1627,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 64x256x8_16x64x1_4x8_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1657,7 +1657,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x32x16_32x8x1_4x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1687,7 +1687,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x64x8_32x16x1_4x4_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1717,7 +1717,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 128x128x8_32x32x1_8x4_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -1747,7 +1747,7 @@ CUTLASS_TEST_L2(SM50_device_sgemm_tt, 256x64x8_64x16x1_8x4_8x4_4x4, { cutlass::arch::Sm50, 
ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_sm50.py b/test/unit/gemm/device/simt_sm50.py index ba6ec3c2..f53dae27 100644 --- a/test/unit/gemm/device/simt_sm50.py +++ b/test/unit/gemm/device/simt_sm50.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -123,7 +123,7 @@ for precision in precisions: # write file header out.write("/***************************************************************************************************\n" -" * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.\n" +" * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.\n" " *\n" " * Redistribution and use in source and binary forms, with or without modification, are permitted\n" " * provided that the following conditions are met:\n" diff --git a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu index 7145b395..7731559a 100644 --- a/test/unit/gemm/device/simt_zgemm_nn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - 
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_16x8x1_2x2_8x4_2x4, { 
cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ 
-787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu index ffe8c0dd..17ea9820 100644 --- a/test/unit/gemm/device/simt_zgemm_nt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_nt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ 
CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_nt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 
2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_nt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_nt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu index 2d4799eb..175c3128 100644 --- a/test/unit/gemm/device/simt_zgemm_tn_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tn_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tn, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tn, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tn, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tn, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu index ba2447bc..544e626c 100644 --- a/test/unit/gemm/device/simt_zgemm_tt_sm50.cu +++ b/test/unit/gemm/device/simt_zgemm_tt_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -67,7 +67,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 8x32x8_8x32x1_2x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -97,7 +97,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tt, 16x32x8_16x32x1_4x4_4x8_1x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -127,7 +127,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 8x32x8_8x16x1_2x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -157,7 +157,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 8x64x8_8x32x1_2x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -187,7 +187,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 16x32x8_16x16x1_4x2_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -217,7 +217,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 16x64x8_16x32x1_4x4_4x8_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -247,7 +247,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x32x8_32x16x1_4x4_8x4_1x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -277,7 +277,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x32x1_4x4_4x8_2x1, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -307,7 +307,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x32x8_8x16x1_2x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -337,7 +337,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x64x8_8x32x1_2x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, 
InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -367,7 +367,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x16x1_4x2_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -397,7 +397,7 @@ CUTLASS_TEST_L0(SM50_device_zgemm_tt, 32x64x8_16x32x1_4x4_4x8_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -427,7 +427,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 64x32x8_32x16x1_4x4_8x4_2x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -457,7 +457,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 16x64x16_8x16x1_2x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -487,7 +487,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_16x8x1_2x2_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -517,7 +517,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x64x8_16x16x1_4x2_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -547,7 +547,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 32x128x8_16x32x1_4x4_4x8_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -577,7 +577,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 64x64x8_32x16x1_4x4_8x4_2x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -607,7 +607,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x32x8_8x16x1_2x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -637,7 +637,7 @@ 
CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x32x8_16x16x1_4x2_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -667,7 +667,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x64x8_16x32x1_4x4_4x8_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -697,7 +697,7 @@ CUTLASS_TEST_L1(SM50_device_zgemm_tt, 128x32x8_32x16x1_4x4_8x4_4x2, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -727,7 +727,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 32x64x16_8x16x1_2x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -757,7 +757,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x32x16_16x8x1_2x2_8x4_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); @@ -787,7 +787,7 @@ CUTLASS_TEST_L2(SM50_device_zgemm_tt, 64x64x8_16x16x1_4x2_4x8_4x4, { cutlass::arch::Sm50, ThreadblockShape, WarpShape, InstructionShape, EpilogueOutputOp, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, 2 // Stages >; EXPECT_TRUE(test::gemm::device::TestAllGemm()); diff --git a/test/unit/gemm/device/testbed.h b/test/unit/gemm/device/testbed.h index 57108530..b8c739a7 100644 --- a/test/unit/gemm/device/testbed.h +++ b/test/unit/gemm/device/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_complex.h b/test/unit/gemm/device/testbed_complex.h index 1eff58a2..65c0fdfb 100644 --- a/test/unit/gemm/device/testbed_complex.h +++ b/test/unit/gemm/device/testbed_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_interleaved.h b/test/unit/gemm/device/testbed_interleaved.h index 34d61383..3cbd720b 100644 --- a/test/unit/gemm/device/testbed_interleaved.h +++ b/test/unit/gemm/device/testbed_interleaved.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_planar_complex.h b/test/unit/gemm/device/testbed_planar_complex.h index 5642020b..0e4e561e 100644 --- a/test/unit/gemm/device/testbed_planar_complex.h +++ b/test/unit/gemm/device/testbed_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_sanity.h b/test/unit/gemm/device/testbed_sanity.h new file mode 100644 index 00000000..025fb387 --- /dev/null +++ b/test/unit/gemm/device/testbed_sanity.h @@ -0,0 +1,233 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + \brief Tests for device-wide GEMM interface +*/ + +#include +#include +#include + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/core_io.h" + +#include "testbed.h" + + +namespace test { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// +// List of Gemm internal parameters this testbed supports for user verification +// +enum class ParameterID { + + // Threadblock-level parameters + kSmemASize, + kSmemBSize, + + // Warp-level parameters + kWarpFragmentASize, + kWarpFragmentBSize, + kWarpFragmentCSize, + kInvalid +}; + +struct Reference { + ParameterID parameter_id; + + union { + int value; + + struct { + int m, n, k; + } gemm_shape; + + struct { + int row, column; + } matrix_shape; + }; + + std::string error_msg; + + Reference( + ParameterID parameter_id_, + int value_=-1, + std::string const &error_msg_="") : parameter_id(parameter_id_), value(value_), error_msg(error_msg_) {} +}; + + +template <typename Gemm> +struct TestbedSanity { + + // + // Type definitions (All Gemm types top down) + // + + // Unpacking Gemm types in the following order + // Kernel-level > Threadblock-level > Warp-level > Instruction-level + + // Kernel-level cutlass Gemm + using GemmKernel = typename Gemm::GemmKernel; + + // + // Threadblock-level gemm types + // + using MmaThreadBlock = typename GemmKernel::Mma; + + // Threadblock-level gemm shape covering one stage + using ThreadblockShape = typename MmaThreadBlock::Shape; + + // Shared memory size covering all stages + using SmemShapeA = typename MmaThreadBlock::Base::SharedStorage::ShapeA; + using SmemPaddingA = typename MmaThreadBlock::Policy::SmemPaddingA; + using SmemShapeB = typename MmaThreadBlock::Base::SharedStorage::ShapeB; + using SmemPaddingB = typename MmaThreadBlock::Policy::SmemPaddingB; + + + /// Number of stages + static int const kStages = MmaThreadBlock::Base::kStages; + + /// Number of warp-level GEMM operations + static int const kWarpGemmIterations = MmaThreadBlock::kWarpGemmIterations; + + + // + // Warp-level gemm types + // + + // Warp-level gemm operator + using MmaWarp = typename MmaThreadBlock::Operator; + + // Warp-level gemm shape covering all kgroups + using WarpShape = typename MmaWarp::Shape; + + // Warp-level fragments holding operands A & B and destination C + using WarpFragmentA = typename MmaWarp::FragmentA; + using WarpFragmentB = typename MmaWarp::FragmentB; + using WarpFragmentC = typename MmaWarp::FragmentC; + + // + // Instruction-level gemm types + // + + // Instruction-level gemm operator + using MmaInstruction = typename MmaWarp::Policy::Operator; + + // Instruction shape + using InstructionShape = typename MmaInstruction::Shape; + + // Instruction-level fragments holding operands A & B and destination C + using InstructionFragmentA = typename MmaInstruction::FragmentA; + using InstructionFragmentB = typename MmaInstruction::FragmentB; + using InstructionFragmentC = typename MmaInstruction::FragmentC; + + // + // Testbed types + // + + // Vector of values holding user provided references + using ReferenceVector = std::vector<Reference>; + 
+ // + // Data members + // + ReferenceVector references; + + // + // Methods + // + + TestbedSanity(ReferenceVector const &references_ = ReferenceVector()) : references(references_){ } + + // verify all parameters in ReferenceVector + bool verify() { + for(auto ref : references) + verify_parameter(ref); + return true; + } + + // verify parameter of type Reference + void verify_parameter(Reference const& ref) { + switch(ref.parameter_id) { + case ParameterID::kWarpFragmentASize : EXPECT_TRUE(WarpFragmentA::kElements == ref.value) << *this; break; + case ParameterID::kWarpFragmentBSize : EXPECT_TRUE(WarpFragmentB::kElements == ref.value) << *this; break; + case ParameterID::kWarpFragmentCSize : EXPECT_TRUE(WarpFragmentC::kElements == ref.value) << *this; break; + } + } + +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Overload output operators for TestbedSanity +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +template <typename Gemm> +std::ostream & operator<<(std::ostream &out, TestbedSanity<Gemm> const &test) { + + + out << "Gemm internal parameters" << std::endl + << " Threadblock-level parameters:" << std::endl + << " ThreadblockShape = " << typename TestbedSanity<Gemm>::ThreadblockShape() << std::endl + << " kStages = " << TestbedSanity<Gemm>::kStages << std::endl + << " kWarpGemmIterations = "<< TestbedSanity<Gemm>::kWarpGemmIterations << std::endl + <<" Shared memory sizes:" << std::endl + <<" SmemPaddingA = " << typename TestbedSanity<Gemm>::SmemPaddingA() << std::endl + <<" SmemPaddingB = " << typename TestbedSanity<Gemm>::SmemPaddingB() << std::endl + <<" SmemShapeA = " << typename TestbedSanity<Gemm>::SmemShapeA() << std::endl + <<" SmemShapeB = " << typename TestbedSanity<Gemm>::SmemShapeB() << std::endl + <<" Warp-level parameters" << std::endl + <<" WarpShape = " << typename TestbedSanity<Gemm>::WarpShape() << std::endl + <<" Fragment sizes:" << std::endl + <<" WarpFragmentA::kElements = " << TestbedSanity<Gemm>::WarpFragmentA::kElements << std::endl + <<" WarpFragmentB::kElements = " << TestbedSanity<Gemm>::WarpFragmentB::kElements << std::endl + <<" WarpFragmentC::kElements = " << TestbedSanity<Gemm>::WarpFragmentC::kElements << std::endl + <<" Instruction-level parameters" << std::endl + <<" InstructionShape = " << typename TestbedSanity<Gemm>::InstructionShape() << std::endl + <<" Fragment sizes:" << std::endl + <<" InstructionFragmentA::kElements = " << TestbedSanity<Gemm>::InstructionFragmentA::kElements << std::endl + <<" InstructionFragmentB::kElements = " << TestbedSanity<Gemm>::InstructionFragmentB::kElements << std::endl + <<" InstructionFragmentC::kElements = " << TestbedSanity<Gemm>::InstructionFragmentC::kElements << std::endl; + + return out; +} + +} // namespace device +} // namespace gemm +} // namespace test + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/test/unit/gemm/device/testbed_splitk.h b/test/unit/gemm/device/testbed_splitk.h index c8ae4b4a..792d7392 100644 --- a/test/unit/gemm/device/testbed_splitk.h +++ b/test/unit/gemm/device/testbed_splitk.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_universal.h b/test/unit/gemm/device/testbed_universal.h index 44503e0a..a83c27cd 100644 --- a/test/unit/gemm/device/testbed_universal.h +++ b/test/unit/gemm/device/testbed_universal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/device/testbed_utils.h b/test/unit/gemm/device/testbed_utils.h index 5a76c3be..9325b40f 100644 --- a/test/unit/gemm/device/testbed_utils.h +++ b/test/unit/gemm/device/testbed_utils.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -41,6 +41,7 @@ inline char const *to_string(cutlass::Status status) { case cutlass::Status::kErrorWorkspaceNull: return "kErrorWorkspaceNull"; case cutlass::Status::kErrorInternal: return "kErrorInternal"; case cutlass::Status::kInvalid: return "kInvalid"; + default: break; } return "invalid"; } diff --git a/test/unit/gemm/thread/CMakeLists.txt b/test/unit/gemm/thread/CMakeLists.txt index 11d450c7..48ca1157 100644 --- a/test/unit/gemm/thread/CMakeLists.txt +++ b/test/unit/gemm/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm50.cu b/test/unit/gemm/thread/gemm_sm50.cu index 969580f5..42659228 100644 --- a/test/unit/gemm/thread/gemm_sm50.cu +++ b/test/unit/gemm/thread/gemm_sm50.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm60.cu b/test/unit/gemm/thread/gemm_sm60.cu index 19b84619..b0b9fdb5 100644 --- a/test/unit/gemm/thread/gemm_sm60.cu +++ b/test/unit/gemm/thread/gemm_sm60.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/gemm_sm61.cu b/test/unit/gemm/thread/gemm_sm61.cu index f8cbf2b8..f6e7724d 100644 --- a/test/unit/gemm/thread/gemm_sm61.cu +++ b/test/unit/gemm/thread/gemm_sm61.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/CMakeLists.txt b/test/unit/gemm/thread/host/CMakeLists.txt index 75f76c92..c5854026 100644 --- a/test/unit/gemm/thread/host/CMakeLists.txt +++ b/test/unit/gemm/thread/host/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/gemm_sm60_host.cu b/test/unit/gemm/thread/host/gemm_sm60_host.cu index df2d233a..346b80cb 100644 --- a/test/unit/gemm/thread/host/gemm_sm60_host.cu +++ b/test/unit/gemm/thread/host/gemm_sm60_host.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/host/testbed_host.h b/test/unit/gemm/thread/host/testbed_host.h index d2835efe..4d5e441d 100644 --- a/test/unit/gemm/thread/host/testbed_host.h +++ b/test/unit/gemm/thread/host/testbed_host.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/thread/testbed.h b/test/unit/gemm/thread/testbed.h index 1b1082a5..bdfb8278 100644 --- a/test/unit/gemm/thread/testbed.h +++ b/test/unit/gemm/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/CMakeLists.txt b/test/unit/gemm/threadblock/CMakeLists.txt index 7ec75510..f208b9ef 100644 --- a/test/unit/gemm/threadblock/CMakeLists.txt +++ b/test/unit/gemm/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/batched_gemv.cu b/test/unit/gemm/threadblock/batched_gemv.cu index 79b5ac4e..94ae947b 100644 --- a/test/unit/gemm/threadblock/batched_gemv.cu +++ b/test/unit/gemm/threadblock/batched_gemv.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/epilogue_workspace.cu b/test/unit/gemm/threadblock/epilogue_workspace.cu index c1967e43..1301aeb4 100644 --- a/test/unit/gemm/threadblock/epilogue_workspace.cu +++ b/test/unit/gemm/threadblock/epilogue_workspace.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_simt.cu b/test/unit/gemm/threadblock/mma_pipelined_simt.cu index b5c1a58b..522b029a 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_simt.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_simt.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu index b9302ef3..c9c714bc 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu index 5585f23f..e4125eb4 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -231,6 +231,7 @@ TEST(SM75_gemm_threadblock_congruous, } //////////////////////////////////////////////////////////////////////////////// + TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x32_64x64x32_16x8x8) { using ElementA = cutlass::half_t; using LayoutA = cutlass::layout::RowMajor; @@ -562,6 +563,7 @@ TEST(SM75_gemm_threadblock_crosswise, } //////////////////////////////////////////////////////////////////////////////// + TEST(SM75_gemm_threadblock_interleaved, tensor_op_32x32x64_16x16x64_8x8x16) { using ElementA = uint8_t; using LayoutA = cutlass::layout::ColumnMajorInterleaved<32>; @@ -1785,4 +1787,337 @@ TEST(SM75_gemm_threadblock_interleaved, } //////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 1, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_32x32x512_16x16x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(32, 32, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 512>; + using WarpShape = cutlass::gemm::GemmShape<16, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x32x512_32x16x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + 
cutlass::gemm::GemmCoord problem_size(64, 32, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_32x64x512_16x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(32, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<16, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x64x512_32x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 64, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_128x64x512_64x32x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(128, 64, 2048); + + using ThreadBlockShape = 
cutlass::gemm::GemmShape<128, 64, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore component + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_64x128x512_32x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(64, 128, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<32, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, tensor_op_128x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(128, 128, 2048); + + using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(1, 1); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, + multicta_256x256x1536_128x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(256, 256, 1536); + + using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 512>; + using 
WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(2, 2); + dim3 block(32, 4, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM75_gemm_threadblock_crosswise, + multicta_512x256x6144_256x128x512_64x64x512_8x8x128) { + using ElementA = cutlass::uint1b_t; + using LayoutA = cutlass::layout::RowMajor; + using ElementB = cutlass::uint1b_t; + using LayoutB = cutlass::layout::ColumnMajor; + using ElementC = int32_t; + using LayoutC = cutlass::layout::ColumnMajor; + + cutlass::gemm::GemmCoord problem_size(512, 256, 6144); + + using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 512>; + using WarpShape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 128>; + + float alpha = 1.f; + float beta = 0.f; + + // Define the MmaCore components + using MmaCore = typename cutlass::gemm::threadblock::DefaultMmaCore< + ThreadBlockShape, WarpShape, InstructionShape, ElementA, LayoutA, + ElementB, LayoutB, ElementC, LayoutC, cutlass::arch::OpClassTensorOp, 2, + cutlass::arch::OpXorPopc>; + + dim3 grid(2, 2); + dim3 block(32, 8, 1); + + test::gemm::threadblock::Testbed(problem_size.m(), problem_size.n(), + problem_size.k(), alpha, beta) + .run(grid, block); +} + +//////////////////////////////////////////////////////////////////////////////// + #endif diff --git a/test/unit/gemm/threadblock/mma_pipelined_testbed.h b/test/unit/gemm/threadblock/mma_pipelined_testbed.h index 498ca496..8190c50a 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_testbed.h +++ b/test/unit/gemm/threadblock/mma_pipelined_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu index 3c1720a1..4fb964c1 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu index e3d900d5..fd2ae356 100644 --- a/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_pipelined_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h index 5838e4f3..148e34d9 100644 --- a/test/unit/gemm/threadblock/mma_planar_complex_testbed.h +++ b/test/unit/gemm/threadblock/mma_planar_complex_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without *modification, are permitted provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu index ba54249d..8c687f88 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu index d1c06083..262269b7 100644 --- a/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu +++ b/test/unit/gemm/threadblock/mma_singlestage_wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/CMakeLists.txt b/test/unit/gemm/warp/CMakeLists.txt index 600d1d8e..695508fa 100644 --- a/test/unit/gemm/warp/CMakeLists.txt +++ b/test/unit/gemm/warp/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -27,6 +27,9 @@ cutlass_test_unit_add_executable( gemm_sm61.cu gemm_sm70.cu gemm_sm75.cu + gemm_sm80.cu + gemm_complex_sm80.cu + gemm_gaussian_complex_sm80.cu wmma_sm70.cu wmma_sm72.cu wmma_sm75.cu diff --git a/test/unit/gemm/warp/gemm_complex_sm80.cu b/test/unit/gemm/warp/gemm_complex_sm80.cu new file mode 100644 index 00000000..3fcd70c8 --- /dev/null +++ b/test/unit/gemm/warp/gemm_complex_sm80.cu @@ -0,0 +1,635 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! 
\file + + \brief Unit tests for thread-level GEMM +*/ + +#include "cutlass/cutlass.h" +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/half.h" + +#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex +// Math instruction: MMA.884.F64.F64 +// Output data type: complex +/////////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_warp_gemm_complex_tensor_op_f64, 8x8x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<8, 8, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x16x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + 
cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_nh) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 32x32x4_8x8x4_ct) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 8x8x4_8x8x4_tn) { + + using Shape = cutlass::gemm::GemmShape<8, 8, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f64, 16x16x4_8x8x4_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor 
+ >::Type; + + test::gemm::warp::TestbedComplex >().run(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex +// Shared memory layout: Congrous +//////////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x16_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 16> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x32x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<16, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x16x8_16x16x8_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 16, 8> >() + .run(); +} + + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 
32x32x8_16x8x8_nt) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_nh) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kNone, + cutlass::ComplexTransform::kConjugate + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_ct) { + + using Shape = cutlass::gemm::GemmShape<32, 32, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor, + cutlass::ComplexTransform::kConjugate, + cutlass::ComplexTransform::kNone + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >() + .run(); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// complex * complex => complex +// Input data type: complex +// Math instruction: MMA.1688.F32.TF32 +// Output data type: complex +// Shared memory layout: Crosswise +//////////////////////////////////////////////////////////////////////////////////////////////////// +TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x8_16x8x8_tn) { + + using Shape = cutlass::gemm::GemmShape<16, 16, 8>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + + using Element = cutlass::complex; + using ElementC = cutlass::complex; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp< + Shape, + InstructionShape, + Element, + LayoutA, + Element, + LayoutB, + ElementC, + cutlass::layout::RowMajor + >::Type; + + test::gemm::warp::TransformedTestbedComplex< + MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 8> >() + .run(); +} + +// TEST FAILS crosswise complex TN MMA.1688.F32.TF32 test fails for k = 2*8 = 16 
+TEST(SM80_warp_gemm_complex_tensor_op_f32, 16x16x16_16x8x8_tn) {
+
+  using Shape = cutlass::gemm::GemmShape<16, 16, 16>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Element = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+
+  using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor
+  >::Type;
+
+  test::gemm::warp::TransformedTestbedComplex<
+      MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 16> >()
+      .run();
+}
+
+TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x32x8_16x8x8_tn) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Element = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+
+  using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor
+  >::Type;
+
+  test::gemm::warp::TransformedTestbedComplex<
+      MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 8> >()
+      .run();
+}
+
+TEST(SM80_warp_gemm_complex_tensor_op_f32, 32x64x8_16x8x8_tn) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 64, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Element = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+
+  using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor
+  >::Type;
+
+  test::gemm::warp::TransformedTestbedComplex<
+      MmaTensorOp, cutlass::gemm::GemmShape<32, 64, 8> >()
+      .run();
+}
+
+TEST(SM80_warp_gemm_complex_tensor_op_f32, 64x32x8_16x8x8_tn) {
+
+  using Shape = cutlass::gemm::GemmShape<64, 32, 8>;
+  using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>;
+
+  using Element = cutlass::complex<float>;
+  using ElementC = cutlass::complex<float>;
+
+  using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise;
+  using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor
+  >::Type;
+
+  test::gemm::warp::TransformedTestbedComplex<
+      MmaTensorOp, cutlass::gemm::GemmShape<64, 32, 8> >()
+      .run();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
diff --git a/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
new file mode 100644
index 00000000..43ad2dfd
--- /dev/null
+++ b/test/unit/gemm/warp/gemm_gaussian_complex_sm80.cu
@@ -0,0 +1,281 @@
+/***************************************************************************************************
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification, are permitted
+ * provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright notice, this list of
+ *       conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright notice, this list of
+ *       conditions and the following disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
+ *       to endorse or promote products derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+/*! \file
+
+    \brief Unit tests for warp-level GEMM
+*/
+
+#include "cutlass/cutlass.h"
+#include "../../common/cutlass_unit_test.h"
+
+#include "cutlass/aligned_buffer.h"
+#include "cutlass/half.h"
+
+#include "cutlass/gemm/warp/default_mma_complex_tensor_op.h"
+
+#include "cutlass/core_io.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/tensor_view_io.h"
+
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/gemm.h"
+
+#include "testbed.h"
+
+#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 8x8x4_8x8x4_nt) {
+
+  using Shape = cutlass::gemm::GemmShape<8, 8, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<8, 8, 4> >().run();
+}
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x16x4_8x8x4_nt) {
+
+  using Shape = cutlass::gemm::GemmShape<16, 16, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 4> >().run();
+}
+
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x32x4_8x8x4_nt) {
+
+  using Shape = cutlass::gemm::GemmShape<16, 32, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<16, 32, 4> >().run();
+}
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x16x4_8x8x4_nt) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 16, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<32, 16, 4> >().run();
+}
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_nt) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 32, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 4> >().run();
+}
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_nh) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 32, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 4> >().run();
+}
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 32x32x4_8x8x4_ct) {
+
+  using Shape = cutlass::gemm::GemmShape<32, 32, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous128b;
+  using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous128b;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kConjugate,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<32, 32, 4> >().run();
+}
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+TEST(SM80_warp_gemm_gaussian_complex_tensor_op, 16x16x4_8x8x4_tn) {
+
+  using Shape = cutlass::gemm::GemmShape<16, 16, 4>;
+  using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>;
+
+  using Element = cutlass::complex<double>;
+  using ElementC = cutlass::complex<double>;
+
+  using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise128x4;
+  using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise128x4;
+
+  using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaComplexTensorOp<
+    Shape,
+    InstructionShape,
+    Element,
+    LayoutA,
+    Element,
+    LayoutB,
+    ElementC,
+    cutlass::layout::RowMajor,
+    cutlass::ComplexTransform::kNone,
+    cutlass::ComplexTransform::kNone,
+    cutlass::arch::OpMultiplyAddGaussianComplex
+  >::Type;
+
+  test::gemm::warp::TestbedComplex<MmaTensorOp, cutlass::gemm::GemmShape<16, 16, 4> >().run();
+}
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif // #if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED)
+
diff --git a/test/unit/gemm/warp/gemm_sm50.cu b/test/unit/gemm/warp/gemm_sm50.cu
index f6410d1d..bb4ba5be 100644
--- a/test/unit/gemm/warp/gemm_sm50.cu
+++ b/test/unit/gemm/warp/gemm_sm50.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/test/unit/gemm/warp/gemm_sm60.cu b/test/unit/gemm/warp/gemm_sm60.cu
index cf59d442..4f2f3f15 100644
--- a/test/unit/gemm/warp/gemm_sm60.cu
+++ b/test/unit/gemm/warp/gemm_sm60.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/test/unit/gemm/warp/gemm_sm61.cu b/test/unit/gemm/warp/gemm_sm61.cu
index 98a16046..63e07165 100644
--- a/test/unit/gemm/warp/gemm_sm61.cu
+++ b/test/unit/gemm/warp/gemm_sm61.cu
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm70.cu b/test/unit/gemm/warp/gemm_sm70.cu index d97effea..16f1427e 100644 --- a/test/unit/gemm/warp/gemm_sm70.cu +++ b/test/unit/gemm/warp/gemm_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/gemm_sm75.cu b/test/unit/gemm/warp/gemm_sm75.cu index 7c32de4a..144475ca 100644 --- a/test/unit/gemm/warp/gemm_sm75.cu +++ b/test/unit/gemm/warp/gemm_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -109,6 +109,8 @@ TEST(SM75_warp_gemm_tensor_op_congruous_f16, 128x128x32_32x32x32_16x8x8) { .run(); } +//////////////////////////////////////////////////////////////////////////////// + TEST(SM75_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x64x32_16x8x8) { using Shape = cutlass::gemm::GemmShape<64, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; @@ -317,6 +319,8 @@ TEST(SM75_warp_gemm_tensor_op_crosswise_f16, 128x128x64_16x16x64_16x8x8) { .run(); } +//////////////////////////////////////////////////////////////////////////////// + TEST(SM75_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x64x64_8x8x16) { using Shape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; diff --git a/test/unit/gemm/warp/gemm_sm80.cu b/test/unit/gemm/warp/gemm_sm80.cu new file mode 100644 index 00000000..377e760c --- /dev/null +++ b/test/unit/gemm/warp/gemm_sm80.cu @@ -0,0 +1,1782 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ +/*! \file + + \brief Unit tests for thread-level GEMM +*/ + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/aligned_buffer.h" +#include "cutlass/half.h" + +#include "cutlass/gemm/warp/default_mma_tensor_op.h" + +#include "cutlass/core_io.h" +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" + +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/gemm.h" + +#include "testbed.h" + +#if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x64x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_64x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_32x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + 
test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_32x16x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x32_16x16x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_64x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using 
ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_32x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f16, 128x128x64_16x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_64x64x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_64x32x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, 
LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_32x32x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_32x16x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x16_16x16x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 16>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_64x64x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_64x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 32, 32>; + using InstructionShape = 
cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_32x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_32x16x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_tf32, 128x128x32_16x16x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<16, 16, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x32_64x64x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename 
cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x32_32x32x32_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f16, 128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = cutlass::half_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x16_64x64x16_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x16_32x32x16_16x8x8) { + using 
Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x32_64x64x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_tf32, 128x128x32_32x32x32_16x8x8) { + using Shape = cutlass::gemm::GemmShape<32, 32, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = cutlass::tfloat32_t; + using ElementC = float; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor>::Type; + + test::gemm::warp::Testbed >() + .run(); +} +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_tn, tf32_round_128x128x32_16x16x32_16x8x8) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = float; + using ElementC = float; + + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::TransformTestbed >() + .run(); +} + +TEST(SM80_warp_gemm_tensor_op_nt, tf32_round_128x128x32_16x16x32_16x8x8) { + + using Shape = cutlass::gemm::GemmShape<64, 64, 32>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 8>; + using Element = float; + using ElementC = float; + + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = 
typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::TransformTestbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_16x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x16x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x32x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x64x64_16x8x16) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 16>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_16x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_32x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 
32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x64_64x64x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 32>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x64x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_64x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_32x32x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + 
cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_32x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x64_16x16x64_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 64>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_64x64x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_64x32x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + 
test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_32x32x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_32x16x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i8, 128x128x128_16x16x128_16x8x32) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 32>; + using Element = int8_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_64x64x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_64x32x128_16x8x64) { + using Shape = 
cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_32x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_32x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x128_16x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 128>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_64x64x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = 
cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_64x32x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 32, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_32x32x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_32x16x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_i4, 128x128x256_16x16x256_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 256>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 256>; + + 
using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_64x64x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 64, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_64x32x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_32x32x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 32, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_32x16x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + 
test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x512_16x16x512_16x8x256) { + using Shape = cutlass::gemm::GemmShape<16, 16, 512>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 512>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_64x64x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 64, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_64x32x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<64, 32, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_32x32x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<32, 32, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_32x16x1024_16x8x256) { + 
using Shape = cutlass::gemm::GemmShape<32, 16, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_b1, 128x128x1024_16x16x1024_16x8x256) { + using Shape = cutlass::gemm::GemmShape<16, 16, 1024>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 256>; + using Element = cutlass::uint1b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 1024>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 16x16x4_16x16x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<16, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x16x4_32x16x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 16, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x32x4_32x32x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 32, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, 
InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_congruous_f64, 32x64x4_32x64x4_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 64, 4>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::ColumnMajorTensorOpMultiplicandCongruous64b; + using LayoutB = cutlass::layout::RowMajorTensorOpMultiplicandCongruous64b; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 16x16x16_16x16x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<16, 16, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 32x32x16_32x32x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 64x32x16_64x32x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<64, 32, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_crosswise_f64, 32x64x16_32x64x16_8x8x4) { + using Shape = cutlass::gemm::GemmShape<32, 64, 16>; + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 4>; + using Element = double; + using ElementC = double; + using LayoutA = 
cutlass::layout::RowMajorTensorOpMultiplicand64bCrosswise; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicand64bCrosswise; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAdd>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_16x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<16, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_32x16x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 16, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_32x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<32, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_64x32x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 32, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, 
InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +TEST(SM80_warp_gemm_tensor_op_interleaved, 128x128x128_64x64x128_16x8x64) { + using Shape = cutlass::gemm::GemmShape<64, 64, 128>; + using InstructionShape = cutlass::gemm::GemmShape<16, 8, 64>; + using Element = cutlass::int4b_t; + using ElementC = int; + using LayoutA = cutlass::layout::RowMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + using LayoutB = cutlass::layout::ColumnMajorTensorOpMultiplicandCrosswise< + cutlass::sizeof_bits::value, 64>; + + using MmaTensorOp = typename cutlass::gemm::warp::DefaultMmaTensorOp< + Shape, InstructionShape, Element, LayoutA, Element, LayoutB, ElementC, + cutlass::layout::RowMajor, cutlass::arch::OpMultiplyAddSaturate>::Type; + + test::gemm::warp::Testbed >() + .run(); +} + +//////////////////////////////////////////////////////////////////////////////// + +#endif // if defined(CUTLASS_ARCH_MMA_SM80_SUPPORTED) + diff --git a/test/unit/gemm/warp/testbed.h b/test/unit/gemm/warp/testbed.h index 9560b910..8a565fd9 100644 --- a/test/unit/gemm/warp/testbed.h +++ b/test/unit/gemm/warp/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -996,7 +996,6 @@ struct TransformedTestbedComplex { ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace warp } // namespace gemm } // namespace test diff --git a/test/unit/gemm/warp/wmma_sm70.cu b/test/unit/gemm/warp/wmma_sm70.cu index d5e1107c..5b9ce63d 100644 --- a/test/unit/gemm/warp/wmma_sm70.cu +++ b/test/unit/gemm/warp/wmma_sm70.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm72.cu b/test/unit/gemm/warp/wmma_sm72.cu index 4f81bbe2..89bfbb59 100644 --- a/test/unit/gemm/warp/wmma_sm72.cu +++ b/test/unit/gemm/warp/wmma_sm72.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/gemm/warp/wmma_sm75.cu b/test/unit/gemm/warp/wmma_sm75.cu index a041610d..3818793e 100644 --- a/test/unit/gemm/warp/wmma_sm75.cu +++ b/test/unit/gemm/warp/wmma_sm75.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/CMakeLists.txt b/test/unit/layout/CMakeLists.txt index ab34df0c..29ebdbdd 100644 --- a/test/unit/layout/CMakeLists.txt +++ b/test/unit/layout/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/layout/matrix.cu b/test/unit/layout/matrix.cu index 0adddb89..2f8d0ea2 100644 --- a/test/unit/layout/matrix.cu +++ b/test/unit/layout/matrix.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor.cu b/test/unit/layout/tensor.cu index a6b3f7cf..b4a43fb3 100644 --- a/test/unit/layout/tensor.cu +++ b/test/unit/layout/tensor.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/layout/tensor_nhwc.cu b/test/unit/layout/tensor_nhwc.cu index 697f753d..46482b2b 100644 --- a/test/unit/layout/tensor_nhwc.cu +++ b/test/unit/layout/tensor_nhwc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** -* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/CMakeLists.txt b/test/unit/nvrtc/CMakeLists.txt index 7261da96..668ea35e 100644 --- a/test/unit/nvrtc/CMakeLists.txt +++ b/test/unit/nvrtc/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/cutlass/nvrtc/environment.h b/test/unit/nvrtc/cutlass/nvrtc/environment.h index e3d493ab..27e99934 100644 --- a/test/unit/nvrtc/cutlass/nvrtc/environment.h +++ b/test/unit/nvrtc/cutlass/nvrtc/environment.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/kernel/thread/testbed_kernel.h b/test/unit/nvrtc/kernel/thread/testbed_kernel.h index c7582351..50087058 100644 --- a/test/unit/nvrtc/kernel/thread/testbed_kernel.h +++ b/test/unit/nvrtc/kernel/thread/testbed_kernel.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/stdlib/stdint.h b/test/unit/nvrtc/stdlib/stdint.h index 50ed027d..38021681 100644 --- a/test/unit/nvrtc/stdlib/stdint.h +++ b/test/unit/nvrtc/stdlib/stdint.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/CMakeLists.txt b/test/unit/nvrtc/thread/CMakeLists.txt index f1d2b7a1..2e12ccfa 100644 --- a/test/unit/nvrtc/thread/CMakeLists.txt +++ b/test/unit/nvrtc/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/gemm_nvrtc.cu b/test/unit/nvrtc/thread/gemm_nvrtc.cu index bf57f1d3..785ebcb2 100644 --- a/test/unit/nvrtc/thread/gemm_nvrtc.cu +++ b/test/unit/nvrtc/thread/gemm_nvrtc.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/nvrtc/thread/testbed.h b/test/unit/nvrtc/thread/testbed.h index 69bf81f4..41ba503a 100644 --- a/test/unit/nvrtc/thread/testbed.h +++ b/test/unit/nvrtc/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/CMakeLists.txt b/test/unit/reduction/CMakeLists.txt index ba1b2a99..7b4f2670 100644 --- a/test/unit/reduction/CMakeLists.txt +++ b/test/unit/reduction/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/CMakeLists.txt b/test/unit/reduction/kernel/CMakeLists.txt index 9ef27c84..e1983153 100644 --- a/test/unit/reduction/kernel/CMakeLists.txt +++ b/test/unit/reduction/kernel/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk.cu b/test/unit/reduction/kernel/reduce_splitk.cu index f4a7f07d..b169cb60 100644 --- a/test/unit/reduction/kernel/reduce_splitk.cu +++ b/test/unit/reduction/kernel/reduce_splitk.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/kernel/reduce_splitk_testbed.h b/test/unit/reduction/kernel/reduce_splitk_testbed.h index c5cbbd58..8e704070 100644 --- a/test/unit/reduction/kernel/reduce_splitk_testbed.h +++ b/test/unit/reduction/kernel/reduce_splitk_testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/CMakeLists.txt b/test/unit/reduction/thread/CMakeLists.txt index f42276f7..0641590e 100644 --- a/test/unit/reduction/thread/CMakeLists.txt +++ b/test/unit/reduction/thread/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/reduction/thread/reduction_thread.cu b/test/unit/reduction/thread/reduction_thread.cu index ece49345..f71e30f5 100644 --- a/test/unit/reduction/thread/reduction_thread.cu +++ b/test/unit/reduction/thread/reduction_thread.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/reduction/thread/testbed.h b/test/unit/reduction/thread/testbed.h index 3646e5bf..919839b3 100644 --- a/test/unit/reduction/thread/testbed.h +++ b/test/unit/reduction/thread/testbed.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. 
All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/test_unit.cpp b/test/unit/test_unit.cpp index fc386250..3bb8ac13 100644 --- a/test/unit/test_unit.cpp +++ b/test/unit/test_unit.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/CMakeLists.txt b/test/unit/transform/CMakeLists.txt index ee865cd4..a7b881ae 100644 --- a/test/unit/transform/CMakeLists.txt +++ b/test/unit/transform/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/CMakeLists.txt b/test/unit/transform/threadblock/CMakeLists.txt index e849dc8a..0d5e5c44 100644 --- a/test/unit/transform/threadblock/CMakeLists.txt +++ b/test/unit/transform/threadblock/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/predicated_tile_iterator.cu b/test/unit/transform/threadblock/predicated_tile_iterator.cu index 70502f73..562c7888 100644 --- a/test/unit/transform/threadblock/predicated_tile_iterator.cu +++ b/test/unit/transform/threadblock/predicated_tile_iterator.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu index e032383e..e52af8ed 100644 --- a/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu +++ b/test/unit/transform/threadblock/regular_tile_iterator_tensor_op.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/test/unit/util/complex.cu b/test/unit/util/complex.cu index e4867e19..319bbb2a 100644 --- a/test/unit/util/complex.cu +++ b/test/unit/util/complex.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 0aa594b0..5c140a9a 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/library/CMakeLists.txt b/tools/library/CMakeLists.txt index 8c8c5c47..37bb8990 100644 --- a/tools/library/CMakeLists.txt +++ b/tools/library/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: @@ -22,7 +22,7 @@ include(GNUInstallDirs) -find_package(Python3 3.6 COMPONENTS Interpreter REQUIRED) +find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) add_library(cutlass_library_includes INTERFACE) add_library(nvidia::cutlass::library::includes ALIAS cutlass_library_includes) @@ -59,7 +59,7 @@ cutlass_add_library( src/operation_table.cu src/singleton.cu src/util.cu - + ) file(GLOB_RECURSE GENERATOR_PYTHON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/scripts/*.py) diff --git a/tools/library/include/cutlass/library/handle.h b/tools/library/include/cutlass/library/handle.h index 1b60eb7d..58c6b30c 100644 --- a/tools/library/include/cutlass/library/handle.h +++ b/tools/library/include/cutlass/library/handle.h @@ -45,6 +45,9 @@ private: /// Host workspace static int const kHostWorkspaceSize = (4 << 10); + /// Provider of operations + Provider provider_; + /// CUDA device properties cudaDeviceProp device_; @@ -90,6 +93,12 @@ public: /// Gets the current CUDA stream cudaStream_t get_stream() const; + /// Gets the current provider + Provider get_provider() const; + + /// Sets the provider of operations + void set_provider(Provider provider); + /// Gets the device workspace size size_t get_workspace_size() const; @@ -149,6 +158,56 @@ public: void * ptr_D, /// Pointer to D matrix int ldd /// Leading dimension of D matrix ); + + /// Executes a GEMM computation: D <= alpha * A*B + beta * C. + // + // Supports batched-strided, batched array or split-K serial or split-K parallel. 
+ // + Status gemm_universal( + + GemmUniversalMode mode, /// indicates the mode in which the kUniversal GEMM is launched + + int M, /// GEMM M dimension + int N, /// GEMM N dimension + int K, /// GEMM K dimension + + NumericTypeID element_compute, /// Data type of internal accumulation + + NumericTypeID element_scalar, /// Data type of alpha/beta scalars + + void const *alpha, /// Pointer to alpha scalar + + NumericTypeID element_A, /// Data type of A matrix elements + LayoutTypeID layout_A, /// Layout of A matrix + ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices + + void const * ptr_A, /// Pointer to A matrix in Global Memory + int lda, /// Leading dimension of A matrix + + NumericTypeID element_B, /// Data type of B matrix elements + LayoutTypeID layout_B, /// Layout of B matrix + ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices + + void const * ptr_B, /// Pointer to B matrix in Global Memory + int ldb, /// Leading dimension of B matrix + + void const * beta, /// Pointer to beta scalar + + NumericTypeID element_C, /// Data type of C and D matrices + + void const * ptr_C, /// Pointer to C matrix + int ldc, /// Leading dimension of C matrix + + void * ptr_D, /// Pointer to D matrix + int ldd, /// Leading dimension of D matrix + + int batch_count = 1, /// Batch count or number of split-K slices + + int64_t batch_stride_A = 0, /// Batch stride of A operand + int64_t batch_stride_B = 0, /// Batch stride of B operand + int64_t batch_stride_C = 0, /// Batch stride of C operand + int64_t batch_stride_D = 0 /// Batch stride of D operand + ); /// Planar complex GEMM /// @@ -276,7 +335,6 @@ public: using HandlePtr = std::unique_ptr; ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace library } // namespace cutlass diff --git a/tools/library/include/cutlass/library/library.h b/tools/library/include/cutlass/library/library.h index f58e3a45..d093b611 100644 --- a/tools/library/include/cutlass/library/library.h +++ b/tools/library/include/cutlass/library/library.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "cutlass/cutlass.h" @@ -93,10 +94,14 @@ enum class NumericTypeID { kS32, kS64, kF16, + kBF16, + kTF32, kF32, kF64, kCF16, + kCBF16, kCF32, + kCTF32, kCF64, kCS4, kCS8, @@ -120,6 +125,7 @@ enum class ComplexTransform { /// Providers enum class Provider { + kNone, kCUTLASS, kReferenceHost, kReferenceDevice, @@ -132,6 +138,8 @@ enum class Provider { /// Enumeration indicating the kind of operation enum class OperationKind { kGemm, + kEqGemm, + kReduction, kInvalid }; @@ -160,9 +168,11 @@ enum class OpcodeClassID { }; enum class MathOperationID { + kAdd, kMultiplyAdd, kMultiplyAddSaturate, kMultiplyAddComplex, + kMultiplyAddGaussianComplex, kXorPopc, kInvalid }; @@ -180,12 +190,17 @@ enum class GemmKind { kInvalid }; -/// Mode of GEMM -enum class GemmUniversalMode { - kGemm, - kGemmSplitKParallel, - kBatched, - kArray, +/// Mode of Universal GEMM +using GemmUniversalMode = cutlass::gemm::GemmUniversalMode; + +enum class EpilogueKind { + kUnknown, + kConversion, + kLinearCombination, + kLinearCombinationClamp, + kLinearCombinationPlanarComplex, + kLinearCombinationRelu, + kLinearCombinationSigmoid, kInvalid }; @@ -220,6 +235,22 @@ struct MathInstructionDescription { opcode_class(opcode_class), math_operation(math_operation) {} + // Equality operator + inline + bool 
operator==(MathInstructionDescription const& rhs) const{ + return ( + (instruction_shape == rhs.instruction_shape) && + (element_accumulator == rhs.element_accumulator) && + (opcode_class == rhs.opcode_class) && + (math_operation == rhs.math_operation)); + } + + // Inequality operator + inline + bool operator!=(MathInstructionDescription const& rhs) const { + return !(*this == rhs); + } + }; /// Structure describing the tiled structure of a GEMM-like computation @@ -261,6 +292,24 @@ struct TileDescription { math_instruction(math_instruction), minimum_compute_capability(minimum_compute_capability), maximum_compute_capability(maximum_compute_capability) { } + + // Equality operator + inline + bool operator==(TileDescription const& rhs) const{ + return ( + (threadblock_shape == rhs.threadblock_shape) && + (threadblock_stages == rhs.threadblock_stages) && + (warp_count == rhs.warp_count) && + (math_instruction == rhs.math_instruction) && + (minimum_compute_capability == rhs.minimum_compute_capability) && + (maximum_compute_capability == rhs.maximum_compute_capability)); + } + + // Inequality operator + inline + bool operator!=(TileDescription const& rhs) const { + return !(*this == rhs); + } }; /// High-level description of an operation @@ -379,6 +428,20 @@ struct GemmDescription : public OperationDescription { transform_B(transform_B) {} }; + +/// Description of all Reduction operations +struct ReductionDescription : public OperationDescription { + + /// Describes the data type of workspace + NumericTypeID element_workspace; + + /// Describes the data type of final output + NumericTypeID element_output; + + /// Describes the data type of the scalars passed to the epilogue + NumericTypeID element_epilogue; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -549,6 +612,42 @@ struct GemmArrayArguments { ///////////////////////////////////////////////////////////////////////////////////////////////// +/// Universal GEMM supporting multiple split-K modes, multiple batched modes, real and complex +// +// OperationKind: Gemm +// GemmKind: Universal + +struct GemmUniversalConfiguration { + + GemmUniversalMode mode; + gemm::GemmCoord problem_size; + int batch_count; + + int64_t lda; + int64_t ldb; + int64_t ldc; + int64_t ldd; +}; + +struct GemmUniversalArguments { + + void const *A; + void const *B; + void const *C; + void *D; + + void const *alpha; + void const *beta; + ScalarPointerMode pointer_mode; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + /// Complex valued GEMM in which real and imaginary parts are separated by a stride // // OperationKind: Gemm @@ -648,7 +747,6 @@ struct GemmPlanarComplexArrayArguments { ScalarPointerMode pointer_mode; }; - ///////////////////////////////////////////////////////////////////////////////////////////////// } // namespace library diff --git a/tools/library/include/cutlass/library/manifest.h b/tools/library/include/cutlass/library/manifest.h index eaa90b3a..54e51c1f 100644 --- a/tools/library/include/cutlass/library/manifest.h +++ b/tools/library/include/cutlass/library/manifest.h @@ -45,6 +45,13 @@ namespace cutlass { namespace library { /////////////////////////////////////////////////////////////////////////////////////////////////// +// 
Forward declaration +class Manifest; + +// init and insert all cutlass gemm and conv2d op in manifest object (procedurally generated using generator.py) +void initialize_all(Manifest &manifest); + +///////////////////////////////////////////////////////////////////////////////////////////////////////// /// List of operations using OperationVector = std::vector>; diff --git a/tools/library/include/cutlass/library/operation_table.h b/tools/library/include/cutlass/library/operation_table.h index 80ce1e15..3821f65a 100644 --- a/tools/library/include/cutlass/library/operation_table.h +++ b/tools/library/include/cutlass/library/operation_table.h @@ -29,24 +29,28 @@ */ #pragma once - +#include #include #include #include #include "cutlass/library/library.h" #include "cutlass/library/manifest.h" - +#include "cutlass/library/util.h" ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { namespace library { +///////////////////////////////////////////////////////////////////////////////////////////////// +// Data Structures for Gemm Functional Maps ///////////////////////////////////////////////////////////////////////////////////////////////// -/// Tuple uniquely identifying functional behavior +/// Tuple uniquely identifying Gemm functional behavior struct GemmFunctionalKey { + Provider provider; + GemmKind gemm_kind; NumericTypeID element_compute; NumericTypeID element_scalar; NumericTypeID element_A; @@ -63,6 +67,8 @@ struct GemmFunctionalKey { inline GemmFunctionalKey( + Provider provider, + GemmKind gemm_kind = GemmKind::kGemm, NumericTypeID element_compute = NumericTypeID::kF32, NumericTypeID element_scalar = NumericTypeID::kF32, NumericTypeID element_A = NumericTypeID::kF16, @@ -73,6 +79,8 @@ struct GemmFunctionalKey { ComplexTransform transform_B = ComplexTransform::kNone, NumericTypeID element_C = NumericTypeID::kF16 ): + provider(provider), + gemm_kind(gemm_kind), element_compute(element_compute), element_scalar(element_scalar), element_A(element_A), @@ -87,6 +95,8 @@ struct GemmFunctionalKey { inline bool operator==(GemmFunctionalKey const &rhs) const { return + (provider == rhs.provider) && + (gemm_kind == rhs.gemm_kind) && (element_compute == rhs.element_compute) && (element_scalar == rhs.element_scalar) && (element_A == rhs.element_A) && @@ -104,6 +114,28 @@ struct GemmFunctionalKey { } }; + +///////////////////////////////////////////////////////////////////////////////////////////////// +inline +std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k) { + + out << "{\n" + << " provider: " << to_string(k.provider) << "\n" + << " gemm_kind: " << to_string(k.gemm_kind) << "\n" + << " element_compute: " << to_string(k.element_compute) << "\n" + << " element_scalar: " << to_string(k.element_scalar) << "\n" + << " element_A: " << to_string(k.element_A) << "\n" + << " layout_A: " << to_string(k.layout_A) << "\n" + << " transform_A: " << to_string(k.transform_A) << "\n" + << " element_B: " << to_string(k.element_B) << "\n" + << " layout_B: " << to_string(k.layout_B) << "\n" + << " transform_B: " << to_string(k.transform_B) << "\n" + << " element_C: " << to_string(k.element_C) << "\n" + << "}"; + + return out; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Hash function for GemmFunctionalKey @@ -120,15 +152,17 @@ struct GemmFunctionalKeyHasher { IntHash hash; return - rotl(hash(int(key.element_compute)), 2) ^ - 
rotl(hash(int(key.element_scalar)), 3) ^ - rotl(hash(int(key.element_A)), 4) ^ - rotl(hash(int(key.layout_A)), 5) ^ - rotl(hash(int(key.transform_A)), 6) ^ - rotl(hash(int(key.element_B)), 7) ^ - rotl(hash(int(key.layout_B)), 8) ^ - rotl(hash(int(key.transform_B)), 9) ^ - rotl(hash(int(key.element_C)), 10); + rotl(hash(int(key.provider)), 1) ^ + rotl(hash(int(key.gemm_kind)), 2) ^ + rotl(hash(int(key.element_compute)), 3) ^ + rotl(hash(int(key.element_scalar)), 4) ^ + rotl(hash(int(key.element_A)), 5) ^ + rotl(hash(int(key.layout_A)), 6) ^ + rotl(hash(int(key.transform_A)), 7) ^ + rotl(hash(int(key.element_B)), 8) ^ + rotl(hash(int(key.layout_B)), 9) ^ + rotl(hash(int(key.transform_B)), 10) ^ + rotl(hash(int(key.element_C)), 11); } }; @@ -172,6 +206,7 @@ using GemmOperationFunctionalMap = std::unordered_map< GemmOperationVectorMap, GemmFunctionalKeyHasher >; +///////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -179,15 +214,10 @@ using GemmOperationFunctionalMap = std::unordered_map< class OperationTable { public: - /// Map of all operations of type kGemm and gemm_kind of type kGemm + /// Map of all operations of type kGemm + // provider (kCUTLASS) GemmOperationFunctionalMap gemm_operations; - /// Map of all operations of type kGemm and gemm_kind of type kPlanarComplex - GemmOperationFunctionalMap gemm_planar_complex_operations; - - /// Map of all operations of type kGemm and gemm_kind of type kPlanarComplexArray - GemmOperationFunctionalMap gemm_planar_complex_array_operations; - public: void append(Manifest const &manifest); @@ -202,4 +232,3 @@ public: ///////////////////////////////////////////////////////////////////////////////////////////////// std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k); - diff --git a/tools/library/include/cutlass/library/util.h b/tools/library/include/cutlass/library/util.h index 5ff678e8..526f836b 100644 --- a/tools/library/include/cutlass/library/util.h +++ b/tools/library/include/cutlass/library/util.h @@ -49,6 +49,9 @@ char const *to_string(Provider provider, bool pretty = false); /// Parses a Provider enumerant from a string template <> Provider from_string(std::string const &str); +/// Converts a GemmKind enumerant to a string +char const *to_string(GemmKind type, bool pretty = false); + /// Converts a NumericType enumerant to a string char const *to_string(OperationKind type, bool pretty = false); @@ -111,6 +114,14 @@ char const *to_string(ComplexTransform type, bool pretty = false); template <> ComplexTransform from_string(std::string const &str); + +/// Converts a SplitKMode enumerant to a string +char const *to_string(SplitKMode split_k_mode, bool pretty = false); + +/// Converts a SplitKMode enumerant from a string +template <> +SplitKMode from_string(std::string const &str); + /// Lexical cast from int64_t to string std::string lexical_cast(int64_t int_value); diff --git a/tools/library/scripts/gemm_operation.py b/tools/library/scripts/gemm_operation.py index cc7d35d2..66ecc05e 100644 --- a/tools/library/scripts/gemm_operation.py +++ b/tools/library/scripts/gemm_operation.py @@ -23,7 +23,7 @@ from library import * class GemmOperation: # def __init__(self, gemm_kind, arch, tile_description, A, B, C, element_epilogue, \ - epilogue_functor = EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Cohort): + epilogue_functor = 
EpilogueFunctor.LinearCombination, swizzling_functor = SwizzlingFunctor.Identity8): self.operation_kind = OperationKind.Gemm self.arch = arch @@ -40,6 +40,7 @@ class GemmOperation: def is_complex(self): complex_operators = [ MathOperation.multiply_add_complex, + MathOperation.multiply_add_complex_gaussian ] return self.tile_description.math_instruction.math_operation in complex_operators @@ -58,6 +59,8 @@ class GemmOperation: # def short_math_name(self): + if self.tile_description.math_instruction.math_operation == MathOperation.multiply_add_complex_gaussian: + return "g%s" % ShortDataTypeNames[self.accumulator_type()] return ShortDataTypeNames[self.accumulator_type()] @@ -259,6 +262,135 @@ class EmitGemmInstance: ################################################################################################### +# +class EmitGemmUniversalInstance: + ''' Responsible for emitting a CUTLASS template definition''' + + def __init__(self): + self.gemm_template = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_b}, ${layout_b}, ${transform_b}, ${align_b}, // transposed B operand + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, // transposed A operand + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + self.gemm_template_interleaved = """ +// Gemm operator ${operation_name} +using ${operation_name}_base = + typename cutlass::gemm::kernel::DefaultGemmUniversal< + ${element_a}, ${layout_a}, ${transform_a}, ${align_a}, + ${element_b}, ${layout_b}, ${transform_b}, ${align_b}, + ${element_c}, ${layout_c}, + ${element_accumulator}, + ${opcode_class}, + ${arch}, + cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, + cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, + cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, + ${epilogue_functor}< + ${element_c}, + ${epilogue_vector_length}, + ${element_accumulator}, + ${element_epilogue} + >, + ${swizzling_functor}, + ${stages}, + ${math_operation} +>::GemmKernel; + +// Define named type +struct ${operation_name} : + public ${operation_name}_base { }; +""" + + def emit(self, operation): + + threadblock_shape = operation.tile_description.threadblock_shape + warp_count = operation.tile_description.warp_count + + warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)] + warp_shape[2] = operation.tile_description.threadblock_shape[2] + + epilogue_vector_length = int(min(operation.C.alignment * DataTypeSize[operation.C.element], 128) / DataTypeSize[operation.C.element]) + + transpose_layouts = { + LayoutType.ColumnMajor: LayoutType.RowMajor, + LayoutType.RowMajor: LayoutType.ColumnMajor + } + + if operation.A.layout in transpose_layouts.keys() and \ + operation.B.layout in transpose_layouts.keys() and \ + operation.C.layout in 
transpose_layouts.keys(): + + instance_layout_A = transpose_layouts[operation.A.layout] + instance_layout_B = transpose_layouts[operation.B.layout] + instance_layout_C = transpose_layouts[operation.C.layout] + + gemm_template = self.gemm_template + else: + instance_layout_A, instance_layout_B, instance_layout_C = \ + (operation.A.layout, operation.B.layout, operation.C.layout) + + gemm_template = self.gemm_template_interleaved + # + + values = { + 'operation_name': operation.procedural_name(), + 'element_a': DataTypeTag[operation.A.element], + 'layout_a': LayoutTag[instance_layout_A], + 'element_b': DataTypeTag[operation.B.element], + 'layout_b': LayoutTag[instance_layout_B], + 'element_c': DataTypeTag[operation.C.element], + 'layout_c': LayoutTag[instance_layout_C], + 'element_accumulator': DataTypeTag[operation.accumulator_type()], + 'opcode_class': OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], + 'arch': "cutlass::arch::Sm%d" % operation.arch, + 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), + 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), + 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'warp_shape_m': str(warp_shape[0]), + 'warp_shape_n': str(warp_shape[1]), + 'warp_shape_k': str(warp_shape[2]), + 'instruction_shape_m': str(operation.tile_description.math_instruction.instruction_shape[0]), + 'instruction_shape_n': str(operation.tile_description.math_instruction.instruction_shape[1]), + 'instruction_shape_k': str(operation.tile_description.math_instruction.instruction_shape[2]), + 'epilogue_vector_length': str(epilogue_vector_length), + 'element_epilogue': str(DataTypeTag[operation.element_epilogue]), + 'epilogue_functor': EpilogueFunctorTag[operation.epilogue_functor], + 'swizzling_functor': SwizzlingFunctorTag[operation.swizzling_functor], + 'stages': str(operation.tile_description.stages), + 'align_a': str(operation.A.alignment), + 'align_b': str(operation.B.alignment), + 'transform_a': ComplexTransformTag[operation.A.complex_transform], + 'transform_b': ComplexTransformTag[operation.B.complex_transform], + 'math_operation': MathOperationTag[operation.tile_description.math_instruction.math_operation] + } + + return SubstituteTemplate(gemm_template, values) + +################################################################################################### + # class EmitGemmPlanarComplexInstance: ''' Responsible for emitting a CUTLASS template definition''' @@ -282,12 +414,13 @@ class EmitGemmPlanarComplexInstance: ${element_accumulator}, ${element_epilogue} >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, ${stages}, ${math_operator} >::GemmKernel; - struct ${operation_name} : public Operation_${operation_name} { }; + struct ${operation_name} : + public Operation_${operation_name} { }; """ def emit(self, operation): @@ -355,7 +488,7 @@ class EmitGemmPlanarComplexArrayInstance: ${element_accumulator}, ${element_epilogue} >, - cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle, + cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<>, ${stages}, ${math_operator} >::GemmArrayKernel; @@ -419,12 +552,14 @@ class EmitGemmConfigurationLibrary: self.instance_emitter = { GemmKind.Gemm: EmitGemmInstance, + GemmKind.Universal: EmitGemmUniversalInstance, GemmKind.PlanarComplex: EmitGemmPlanarComplexInstance, GemmKind.PlanarComplexArray: EmitGemmPlanarComplexArrayInstance } 
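# A minimal, self-contained sketch (hypothetical helper `transposed_problem`, not part
# of this patch) of the operand-swap/transpose trick that EmitGemmUniversalInstance.emit()
# appears to apply to the non-interleaved layouts above: since D = A * B implies
# D^T = B^T * A^T, instantiating the kernel for the transposed problem (operands swapped,
# every layout flipped) computes the same GEMM, so a single template body covers both
# operand orderings.
def transposed_problem(layout_a, layout_b, layout_c):
    flip = {'ColumnMajor': 'RowMajor', 'RowMajor': 'ColumnMajor'}
    # Swap the operands and flip each layout, mirroring the transpose_layouts map in emit().
    return (flip[layout_b], flip[layout_a], flip[layout_c])

# Example: a row-major-C problem maps onto a kernel instantiated with a column-major C.
assert transposed_problem('RowMajor', 'ColumnMajor', 'RowMajor') == ('RowMajor', 'ColumnMajor', 'ColumnMajor')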
self.gemm_kind_wrappers = { GemmKind.Gemm: 'GemmOperation', + GemmKind.Universal: 'GemmUniversalOperation', GemmKind.PlanarComplex: 'GemmPlanarComplexOperation', GemmKind.PlanarComplexArray: 'GemmPlanarComplexArrayOperation' } @@ -436,6 +571,13 @@ class EmitGemmConfigurationLibrary: ${compile_guard_start} manifest.append(new ${gemm_kind}("${operation_name}")); ${compile_guard_end} +""", + GemmKind.Universal: """ +${compile_guard_start} + manifest.append(new ${gemm_kind}< + cutlass::gemm::device::GemmUniversalAdapter<${operation_name}> + >("${operation_name}")); +${compile_guard_end} """, GemmKind.PlanarComplex: """ ${compile_guard_start} @@ -542,3 +684,4 @@ void initialize_${configuration_name}(Manifest &manifest) { ################################################################################################### ################################################################################################### + diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index 4b1a483e..29578645 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -18,7 +18,7 @@ from gemm_operation import * def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): # by default, use the latest CUDA Toolkit version - cuda_version = [10, 2, 82] + cuda_version = [11, 0, 132] # Update cuda_version based on parsed string if semantic_ver_string != '': @@ -36,7 +36,7 @@ def CudaToolkitVersionSatisfies(semantic_ver_string, major, minor, patch = 0): # def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ alignment_constraints, complex_transforms = None, epilogue_functor = EpilogueFunctor.LinearCombination, \ - swizzling_functor = SwizzlingFunctor.Cohort): + swizzling_functor = SwizzlingFunctor.Identity8): if complex_transforms is None: complex_transforms = [(ComplexTransform.none, ComplexTransform.none),] @@ -61,7 +61,7 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ B = TensorDescription(element_b, layout[1], alignment, complex_transform[1]) C = TensorDescription(element_c, layout[2], alignment_c) - new_operation = GemmOperation(GemmKind.Gemm, tile_description.minimum_compute_capability, \ + new_operation = GemmOperation(GemmKind.Universal, tile_description.minimum_compute_capability, \ tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor) manifest.append(new_operation) @@ -466,6 +466,9 @@ def GenerateSM70_WmmaTensorOp_161616(manifest, args): def GenerateSM70(manifest, args): GenerateSM70_TensorOp_884(manifest, args) GenerateSM70_PlanarComplexTensorOp_884(manifest, args) + + # To limit build size, WMMA GEMMs are disabled for now. 
+ # #GenerateSM70_WmmaTensorOp_161616(manifest, args) ################################################################################################### @@ -621,6 +624,11 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): DataType.s8, DataType.s8, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 16], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -654,7 +662,7 @@ def GenerateSM75_TensorOp_8816_TN(manifest, args): data_type_mixed = [ math_inst.element_a, math_inst.element_b, - math_inst.element_a, + DataType.s8, DataType.f32, ] @@ -687,6 +695,11 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): DataType.s8, DataType.s8, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 16], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -712,8 +725,7 @@ def GenerateSM75_TensorOp_8816_Interleaved(manifest, args): ] operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ - data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp, \ - SwizzlingFunctor.Identity) + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 8 @@ -736,6 +748,11 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): DataType.s4, DataType.s4, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 32], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -769,7 +786,7 @@ def GenerateSM75_TensorOp_8832_TN(manifest, args): data_type_mixed = [ math_inst.element_a, math_inst.element_b, - math_inst.element_a, + DataType.s4, DataType.f32, ] @@ -804,6 +821,11 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): DataType.s4, DataType.s4, DataType.s32, \ OpcodeClass.TensorOp, \ MathOperation.multiply_add_saturate), + MathInstruction( \ + [8, 8, 32], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), ] min_cc = 75 @@ -832,8 +854,7 @@ def GenerateSM75_TensorOp_8832_Interleaved(manifest, args): ] operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ - data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp, \ - SwizzlingFunctor.Identity) + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) for op in operations: op.C.alignment = 16 @@ -911,6 +932,831 @@ def GenerateSM75(manifest, args): ################################################################################################### ################################################################################################### +# +def GenerateSM80_TensorOp_16816(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + 
MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f16, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [8, 4, 2] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 3, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 64], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) + if math_inst.element_a != math_inst.element_accumulator: + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints) + +# + +# +def GenerateSM80_PlanarComplexTensorOp_16816(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 16], \ + DataType.f16, DataType.f16, DataType.f16, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [8, ] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([ 64, 128, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) + + # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) + if math_inst.element_a != math_inst.element_accumulator: + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmPlanarComplexOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, complex_transforms) + +# +def GenerateSM80_TensorOp_16832_TN(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 32], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 32], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [16,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 128], 3, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([64, 256, 128], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [math_inst.element_a, math_inst.element_b, DataType.s32, DataType.s32] + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s8, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + if op.tile_description.threadblock_shape[1] >= 128: + op.C.alignment = 16 + else: + op.C.alignment = 8 + +# + +# +def GenerateSM80_TensorOp_16832_Interleaved(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajorInterleaved32, LayoutType.RowMajorInterleaved32, LayoutType.ColumnMajorInterleaved32), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 32], \ + DataType.s8, DataType.s8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 32], \ + DataType.u8, DataType.u8, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = 
[16,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s8, DataType.f32] + + operations = CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + op.C.alignment = 8 + +# + +# +def GenerateSM80_TensorOp_16864_TN(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 64], \ + DataType.s4, DataType.s4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 64], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [32,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 256], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 256], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 256], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 256], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [math_inst.element_a, math_inst.element_b, DataType.s32, DataType.s32] + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s4, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + if op.tile_description.threadblock_shape[1] >= 128: + op.C.alignment = 8 + elif op.tile_description.threadblock_shape[1] == 64: + op.C.alignment = 8 + else: + op.C.alignment = 4 +# + +# +def GenerateSM80_TensorOp_16864_Interleaved(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajorInterleaved64, LayoutType.RowMajorInterleaved64, LayoutType.ColumnMajorInterleaved64), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 64], \ + DataType.s4, DataType.s4, DataType.s32, \ + 
OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + MathInstruction( \ + [16, 8, 64], \ + DataType.u4, DataType.u4, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_saturate), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [32,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 128], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 128], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 128], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 128], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type_mixed = [math_inst.element_a, math_inst.element_b, DataType.s4, DataType.f32] + + operations = [] + + operations += CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + + for op in operations: + op.C.alignment = 16 +# + +# +def GenerateSM80_TensorOp_168256(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 256], \ + DataType.b1, DataType.b1, DataType.s32, \ + OpcodeClass.TensorOp, \ + MathOperation.xor_popc), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [128,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 512], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 512], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 512], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 512], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 1024], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 1024], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 1024], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 1024], 5, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.b1, DataType.b1, DataType.s32, DataType.s32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, None, EpilogueFunctor.LinearCombinationClamp) + +# + +# +def GenerateSM80_TensorOp_1688(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 8], \ + DataType.tf32, DataType.tf32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add) + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [4, 2, 1] + + for math_inst in math_instructions: + tile_descriptions = 
[ + TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 3, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + data_type_mixed = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_a, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type_mixed, alignment_constraints) + +# + +# +def GenerateSM80_TensorOp_1688_fast_math(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [16, 8, 8], \ + DataType.tf32, DataType.tf32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add), + MathInstruction( \ + [16, 8, 8], \ + DataType.f16, DataType.f16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_fast_f16), + MathInstruction( \ + [16, 8, 8], \ + DataType.bf16, DataType.bf16, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_fast_bf16) + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [4, 2, 1] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 16], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 16], 4, [1, 4, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 16], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 6, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 3, [1, 2, 2], 
math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 3, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([128, 64, 32], 4, [2, 1, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 16], 10, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 4, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 32], 5, [1, 2, 2], math_inst, min_cc, max_cc), + TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 256, 32], 3, [1, 4, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.f32, DataType.f32, DataType.f32, DataType.f32] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) + +# + +# +def GenerateSM80_TensorOp_1688_complex(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = MathInstruction( \ + [16, 8, 8], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex) + + min_cc = 80 + max_cc = 1024 + + tile_descriptions = [ + TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 16], 4, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 16], 4, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + DataType.cf32, DataType.cf32, DataType.cf32, DataType.cf32 + ] + + alignment_constraints = [1,] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) +# + +# +def GenerateSM80_TensorOp_884(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([128, 128, 16], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 16], 3, [2, 2, 1], math_inst, min_cc, max_cc), + 
TileDescription([64, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 16], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 16], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 16], 5, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 16, 16], 5, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.f64, DataType.f64, DataType.f64, DataType.f64] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + +# +def GenerateSM80_TensorOp_884_complex(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([128, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 128, 8], 3, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([64, 64, 8], 3, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.cf64, DataType.cf64, DataType.cf64, DataType.cf64] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) + +# +def GenerateSM80_TensorOp_884_complex_gaussian(manifest, args): + + if not CudaToolkitVersionSatisfies(args.cuda_version, 11, 0): + return + + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_inst = \ + MathInstruction( \ + [8, 8, 4], \ + DataType.f64, DataType.f64, DataType.f64, \ + OpcodeClass.TensorOp, \ + MathOperation.multiply_add_complex_gaussian) + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + tile_descriptions = [ + TileDescription([64, 64, 8], 3, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([64, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 64, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 32, 8], 4, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([16, 32, 8], 4, [1, 2, 1], math_inst, min_cc, max_cc), + TileDescription([32, 16, 8], 4, [2, 1, 1], math_inst, min_cc, max_cc), + ] + + data_type = [DataType.cf64, DataType.cf64, 
DataType.cf64, DataType.cf64] + + complex_transforms = [ + (ComplexTransform.none, ComplexTransform.none), + (ComplexTransform.conj, ComplexTransform.none), + (ComplexTransform.none, ComplexTransform.conj), + (ComplexTransform.conj, ComplexTransform.conj) + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints, complex_transforms) +# + +################################################################################################### + +# +def GenerateSM80_Simt(manifest, args): + layouts = [ + (LayoutType.ColumnMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.ColumnMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.ColumnMajor), + (LayoutType.RowMajor, LayoutType.RowMajor, LayoutType.ColumnMajor), + ] + + math_instructions = [ + MathInstruction( \ + [1, 1, 1], \ + DataType.f32, DataType.f32, DataType.f32, \ + OpcodeClass.Simt, \ + MathOperation.multiply_add), + ] + + min_cc = 80 + max_cc = 1024 + + alignment_constraints = [1,] + + for math_inst in math_instructions: + tile_descriptions = [ + TileDescription([256, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 8], 5, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 8], 5, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([256, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 256, 8], 4, [2, 4, 1], math_inst, min_cc, max_cc), + TileDescription([128, 128, 8], 4, [4, 2, 1], math_inst, min_cc, max_cc), + TileDescription([128, 64, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 128, 8], 5, [2, 2, 1], math_inst, min_cc, max_cc), + TileDescription([ 64, 64, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([128, 32, 8], 5, [2, 1, 1], math_inst, min_cc, max_cc), + TileDescription([ 32, 128, 8], 5, [1, 2, 1], math_inst, min_cc, max_cc), + ] + + data_type = [ + math_inst.element_a, + math_inst.element_b, + math_inst.element_accumulator, + math_inst.element_accumulator, + ] + + CreateGemmOperator(manifest, layouts, tile_descriptions, \ + data_type, alignment_constraints) +# + +################################################################################################### + +# +def GenerateSM80(manifest, args): + + GenerateSM80_TensorOp_16816(manifest, args) + GenerateSM80_PlanarComplexTensorOp_16816(manifest, args) + GenerateSM80_TensorOp_1688(manifest, args) + GenerateSM80_TensorOp_1688_fast_math(manifest, args) + GenerateSM80_TensorOp_1688_complex(manifest, args) + GenerateSM80_TensorOp_884(manifest, args) + GenerateSM80_TensorOp_884_complex(manifest, args) + GenerateSM80_TensorOp_884_complex_gaussian(manifest, args) + GenerateSM80_TensorOp_16832_TN(manifest, args) + GenerateSM80_TensorOp_16832_Interleaved(manifest, args) + GenerateSM80_TensorOp_16864_TN(manifest, args) + GenerateSM80_TensorOp_16864_Interleaved(manifest, args) + GenerateSM80_TensorOp_168256(manifest, args) + GenerateSM80_Simt(manifest, args) +# + ################################################################################################### if __name__ == "__main__": @@ -920,7 +1766,7 @@ if __name__ == "__main__": parser.add_argument("--build-dir", default=".", required=False, help="CUTLASS top-level build directory") parser.add_argument("--curr-build-dir", default=".", help="CUTLASS current build directory. 
cmake files will be emitted in this directory") parser.add_argument("--generator-target", default='library', help="Target of CUTLASS Library Generator.") - parser.add_argument("--architectures", default='50;60;61;75', help="Target compute architectures") + parser.add_argument("--architectures", default='53;60;61;70;75;80', help="Target compute architectures") parser.add_argument("--kernels", default='', help='Comma delimited list to filter kernels by name.') parser.add_argument("--cuda-version", default="11.0.0", help="Semantic version string of CUDA Toolkit") @@ -933,6 +1779,8 @@ if __name__ == "__main__": GenerateSM61(manifest, args) GenerateSM70(manifest, args) GenerateSM75(manifest, args) + GenerateSM80(manifest, args) + if 'library' in args.generator_target.split(','): manifest.emit(GeneratorTarget.Library) diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index 71f521e6..bdc43483 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -4,14 +4,32 @@ # \brief Generates the CUTLASS Library's instances # -import enum import re ################################################################################################### +import enum + +# The following block implements enum.auto() for Python 3.5 variants that don't include it such +# as the default 3.5.2 on Ubuntu 16.04. +# +# https://codereview.stackexchange.com/questions/177309/reimplementing-pythons-enum-auto-for-compatibility + +try: + from enum import auto as enum_auto +except ImportError: + __cutlass_library_auto_enum = 0 + def enum_auto() -> int: + global __cutlass_library_auto_enum + i = __cutlass_library_auto_enum + __cutlass_library_auto_enum += 1 + return i + +################################################################################################### + # class GeneratorTarget(enum.Enum): - Library = enum.auto() + Library = enum_auto() # GeneratorTargetNames = { GeneratorTarget.Library: 'library' @@ -22,33 +40,37 @@ GeneratorTargetNames = { # class DataType(enum.Enum): - b1 = enum.auto() - u4 = enum.auto() - u8 = enum.auto() - u16 = enum.auto() - u32 = enum.auto() - u64 = enum.auto() - s4 = enum.auto() - s8 = enum.auto() - s16 = enum.auto() - s32 = enum.auto() - s64 = enum.auto() - f16 = enum.auto() - f32 = enum.auto() - f64 = enum.auto() - cf16 = enum.auto() - cf32 = enum.auto() - cf64 = enum.auto() - cs4 = enum.auto() - cs8 = enum.auto() - cs16 = enum.auto() - cs32 = enum.auto() - cs64 = enum.auto() - cu4 = enum.auto() - cu8 = enum.auto() - cu16 = enum.auto() - cu32 = enum.auto() - cu64 = enum.auto() + b1 = enum_auto() + u4 = enum_auto() + u8 = enum_auto() + u16 = enum_auto() + u32 = enum_auto() + u64 = enum_auto() + s4 = enum_auto() + s8 = enum_auto() + s16 = enum_auto() + s32 = enum_auto() + s64 = enum_auto() + f16 = enum_auto() + bf16 = enum_auto() + f32 = enum_auto() + tf32 = enum_auto() + f64 = enum_auto() + cf16 = enum_auto() + cbf16 = enum_auto() + cf32 = enum_auto() + ctf32 = enum_auto() + cf64 = enum_auto() + cs4 = enum_auto() + cs8 = enum_auto() + cs16 = enum_auto() + cs32 = enum_auto() + cs64 = enum_auto() + cu4 = enum_auto() + cu8 = enum_auto() + cu16 = enum_auto() + cu32 = enum_auto() + cu64 = enum_auto() # ShortDataTypeNames = { @@ -74,10 +96,14 @@ DataTypeNames = { DataType.s32: "s32", DataType.s64: "s64", DataType.f16: "f16", + DataType.bf16: "bf16", DataType.f32: "f32", + DataType.tf32: "tf32", DataType.f64: "f64", DataType.cf16: "cf16", + DataType.cbf16: "cbf16", DataType.cf32: "cf32", + DataType.ctf32: "ctf32", DataType.cf64: 
"cf64", DataType.cu4: "cu4", DataType.cu8: "cu8", @@ -104,10 +130,14 @@ DataTypeTag = { DataType.s32: "int32_t", DataType.s64: "int64_t", DataType.f16: "cutlass::half_t", + DataType.bf16: "cutlass::bfloat16_t", DataType.f32: "float", + DataType.tf32: "cutlass::tfloat32_t", DataType.f64: "double", DataType.cf16: "cutlass::complex", + DataType.cbf16: "cutlass::complex", DataType.cf32: "cutlass::complex", + DataType.ctf32: "cutlass::complex", DataType.cf64: "cutlass::complex", DataType.cu4: "cutlass::complex", DataType.cu8: "cutlass::complex", @@ -134,10 +164,14 @@ DataTypeSize = { DataType.s32: 32, DataType.s64: 64, DataType.f16: 16, + DataType.bf16: 16, DataType.f32: 32, + DataType.tf32: 32, DataType.f64: 64, DataType.cf16: 32, + DataType.cbf16: 32, DataType.cf32: 64, + DataType.ctf32: 32, DataType.cf64: 128, DataType.cu4: 8, DataType.cu8: 16, @@ -155,8 +189,8 @@ DataTypeSize = { # class ComplexTransform(enum.Enum): - none = enum.auto() - conj = enum.auto() + none = enum_auto() + conj = enum_auto() # ComplexTransformTag = { @@ -194,40 +228,47 @@ def get_real_from_complex(complex_type): # class ComplexMultiplyOp(enum.Enum): - multiply_add = enum.auto() - gaussian = enum.auto() + multiply_add = enum_auto() + gaussian = enum_auto() ################################################################################################### # class MathOperation(enum.Enum): - multiply_add = enum.auto() - multiply_add_saturate = enum.auto() - xor_popc = enum.auto() - multiply_add_complex = enum.auto() + multiply_add = enum_auto() + multiply_add_saturate = enum_auto() + xor_popc = enum_auto() + multiply_add_fast_bf16 = enum_auto() + multiply_add_fast_f16 = enum_auto() + multiply_add_complex = enum_auto() + multiply_add_complex_gaussian = enum_auto() + # MathOperationTag = { MathOperation.multiply_add: 'cutlass::arch::OpMultiplyAdd', MathOperation.multiply_add_saturate: 'cutlass::arch::OpMultiplyAddSaturate', MathOperation.xor_popc: 'cutlass::arch::OpXorPopc', + MathOperation.multiply_add_fast_bf16: 'cutlass::arch::OpMultiplyAddFastBF16', + MathOperation.multiply_add_fast_f16: 'cutlass::arch::OpMultiplyAddFastF16', MathOperation.multiply_add_complex: 'cutlass::arch::OpMultiplyAddComplex', + MathOperation.multiply_add_complex_gaussian: 'cutlass::arch::OpMultiplyAddGaussianComplex', } ################################################################################################### # class LayoutType(enum.Enum): - ColumnMajor = enum.auto() - RowMajor = enum.auto() - ColumnMajorInterleaved32 = enum.auto() - RowMajorInterleaved32 = enum.auto() - ColumnMajorInterleaved64 = enum.auto() - RowMajorInterleaved64 = enum.auto() - TensorNHWC = enum.auto() - TensorNCHW = enum.auto() - TensorNGHWC = enum.auto() - TensorNCxHW32 = enum.auto() - TensorNCxHW64 = enum.auto() + ColumnMajor = enum_auto() + RowMajor = enum_auto() + ColumnMajorInterleaved32 = enum_auto() + RowMajorInterleaved32 = enum_auto() + ColumnMajorInterleaved64 = enum_auto() + RowMajorInterleaved64 = enum_auto() + TensorNHWC = enum_auto() + TensorNCHW = enum_auto() + TensorNGHWC = enum_auto() + TensorNCxHW32 = enum_auto() + TensorNCxHW64 = enum_auto() # LayoutTag = { @@ -282,9 +323,9 @@ ShortComplexLayoutNames = { # class OpcodeClass(enum.Enum): - Simt = enum.auto() - TensorOp = enum.auto() - WmmaTensorOp = enum.auto() + Simt = enum_auto() + TensorOp = enum_auto() + WmmaTensorOp = enum_auto() OpcodeClassNames = { OpcodeClass.Simt: 'simt', @@ -302,7 +343,7 @@ OpcodeClassTag = { # class OperationKind(enum.Enum): - Gemm = enum.auto() + Gemm = enum_auto() # 
OperationKindNames = { OperationKind.Gemm: 'gemm' @@ -310,7 +351,7 @@ OperationKindNames = { # class Target(enum.Enum): - library = enum.auto() + library = enum_auto() ArchitectureNames = { 50: 'maxwell', @@ -318,6 +359,7 @@ ArchitectureNames = { 61: 'pascal', 70: 'volta', 75: 'turing', + 80: 'ampere', } ################################################################################################### @@ -340,27 +382,27 @@ def SubstituteTemplate(template, values): # class GemmKind(enum.Enum): - Gemm = enum.auto() - Batched = enum.auto() - Array = enum.auto() - Universal = enum.auto() - PlanarComplex = enum.auto() - PlanarComplexArray = enum.auto() + Gemm = enum_auto() + Batched = enum_auto() + Array = enum_auto() + Universal = enum_auto() + PlanarComplex = enum_auto() + PlanarComplexArray = enum_auto() # GemmKindNames = { GemmKind.Gemm: "gemm", GemmKind.Batched: "gemm_batched", GemmKind.Array: "gemm_array", - GemmKind.Universal: "gemm_universal", + GemmKind.Universal: "gemm", GemmKind.PlanarComplex: "gemm_planar_complex", GemmKind.PlanarComplexArray: "gemm_planar_complex_array", } # class EpilogueFunctor(enum.Enum): - LinearCombination = enum.auto() - LinearCombinationClamp = enum.auto() + LinearCombination = enum_auto() + LinearCombinationClamp = enum_auto() # EpilogueFunctorTag = { @@ -370,13 +412,17 @@ EpilogueFunctorTag = { # class SwizzlingFunctor(enum.Enum): - Cohort = enum.auto() - Identity = enum.auto() + Identity1 = enum_auto() + Identity2 = enum_auto() + Identity4 = enum_auto() + Identity8 = enum_auto() # SwizzlingFunctorTag = { - SwizzlingFunctor.Cohort: 'cutlass::gemm::threadblock::GemmCohortThreadblockSwizzle<${layout_a}, ${layout_b}>', - SwizzlingFunctor.Identity: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle', + SwizzlingFunctor.Identity1: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>', + SwizzlingFunctor.Identity2: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>', + SwizzlingFunctor.Identity4: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>', + SwizzlingFunctor.Identity8: 'cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>', } ################################################################################################### diff --git a/tools/library/scripts/manifest.py b/tools/library/scripts/manifest.py index 38182a1b..756ddc72 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -127,7 +127,7 @@ class Manifest: if args.kernels == 'all': self.kernel_names = [] else: - self.kernel_names = args.kernels.split(',') + self.kernel_names = [x for x in args.kernels.split(',') if x != ''] self.operation_count = 0 self.operations_by_name = {} diff --git a/tools/library/src/gemm_operation.h b/tools/library/src/gemm_operation.h index 102c549a..23781b25 100644 --- a/tools/library/src/gemm_operation.h +++ b/tools/library/src/gemm_operation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -29,13 +29,14 @@ #pragma once #include "cutlass/cutlass.h" -#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" #include "cutlass/gemm/device/gemm.h" #include "cutlass/gemm/device/gemm_complex.h" #include "cutlass/gemm/device/gemm_batched.h" #include "cutlass/gemm/device/gemm_array.h" #include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/default_gemm_universal.h" +#include "cutlass/gemm/kernel/default_gemm_planar_complex_universal.h" #include "cutlass/library/library.h" #include "library_internal.h" @@ -104,10 +105,10 @@ public: MathOperationMap::kId; description_.tile_description.minimum_compute_capability = - ArchMap::kMin; + ArchMap::kMin; description_.tile_description.maximum_compute_capability = - ArchMap::kMax; + ArchMap::kMax; description_.A = make_TensorDescription(Operator::kAlignmentA); description_.B = make_TensorDescription(Operator::kAlignmentB); @@ -698,6 +699,201 @@ public: } }; +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemmUniversalOperation : public GemmOperationBase { +public: + + using Operator = Operator_; + using ElementA = typename Operator::ElementA; + using LayoutA = typename Operator::LayoutA; + using ElementB = typename Operator::ElementB; + using LayoutB = typename Operator::LayoutB; + using ElementC = typename Operator::ElementC; + using LayoutC = typename Operator::LayoutC; + using ElementAccumulator = typename Operator::ElementAccumulator; + using ElementCompute = typename Operator::EpilogueOutputOp::ElementCompute; + + using OperatorArguments = typename Operator::Arguments; + +public: + + /// Constructor + GemmUniversalOperation(char const *name = "unknown_gemm"): + GemmOperationBase(name) { + + this->description_.gemm_kind = GemmKind::kUniversal; + } + +protected: + + /// Constructs the arguments structure given the configuration and arguments + static Status construct_arguments_( + OperatorArguments &operator_args, + GemmUniversalConfiguration const *configuration) { + + operator_args.mode = configuration->mode; + + operator_args.problem_size = configuration->problem_size; + operator_args.batch_count = configuration->batch_count; + + operator_args.lda = int(configuration->lda); + operator_args.ldb = int(configuration->ldb); + operator_args.ldc = int(configuration->ldc); + operator_args.ldd = int(configuration->ldd); + + return Status::kSuccess; + } + + /// Constructs the arguments structure given the configuration and arguments + static Status update_arguments_( + OperatorArguments &operator_args, + GemmUniversalArguments const *arguments) { + + if (arguments->pointer_mode == ScalarPointerMode::kHost) { + typename Operator::EpilogueOutputOp::Params params( + *static_cast(arguments->alpha), + *static_cast(arguments->beta) + ); + operator_args.epilogue = params; + } + else if (arguments->pointer_mode == ScalarPointerMode::kDevice){ + typename Operator::EpilogueOutputOp::Params params( + static_cast(arguments->alpha), + static_cast(arguments->beta) + ); + operator_args.epilogue = params; + } + else { + return Status::kErrorInvalidProblem; + } + + // update arguments + operator_args.ptr_A = arguments->A; + operator_args.ptr_B = arguments->B; + operator_args.ptr_C = arguments->C; + operator_args.ptr_D = arguments->D; + + operator_args.batch_stride_A = arguments->batch_stride_A; + 
operator_args.batch_stride_B = arguments->batch_stride_B; + operator_args.batch_stride_C = arguments->batch_stride_C; + operator_args.batch_stride_D = arguments->batch_stride_D; + + return Status::kSuccess; + } + +public: + + /// Returns success if the operation can proceed + virtual Status can_implement( + void const *configuration_ptr, + void const *arguments_ptr) const { + + GemmUniversalConfiguration const *configuration = + static_cast(configuration_ptr); + + GemmUniversalArguments const *arguments = + static_cast(arguments_ptr); + + OperatorArguments args; + + Status status = construct_arguments_(args, configuration); + + if (status != Status::kSuccess) { + return status; + } + + status = update_arguments_(args, arguments); + + if (status != Status::kSuccess) { + return status; + } + + return Operator::can_implement(args); + } + + /// Gets the host-side workspace + virtual uint64_t get_host_workspace_size( + void const *configuration) const { + + return sizeof(Operator); + } + + /// Gets the device-side workspace + virtual uint64_t get_device_workspace_size( + void const *configuration_ptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return 0; + } + + uint64_t size = Operator::get_workspace_size(args); + + return size; + } + + /// Initializes the workspace + virtual Status initialize( + void const *configuration_ptr, + void *host_workspace, + void *device_workspace, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = construct_arguments_( + args, + static_cast(configuration_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = new (host_workspace) Operator; + + status = op->initialize(args, device_workspace, stream); + + return status; + } + + /// Runs the kernel + virtual Status run( + void const *arguments_ptr, + void *host_workspace, + void *device_workspace = nullptr, + cudaStream_t stream = nullptr) const { + + OperatorArguments args; + + Status status = update_arguments_( + args, + static_cast(arguments_ptr)); + + if (status != Status::kSuccess) { + return status; + } + + Operator *op = static_cast(host_workspace); + + status = op->update(args, device_workspace); + + if (status != Status::kSuccess) { + return status; + } + + status = op->run(stream); + + return status; + } +}; + /////////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/tools/library/src/handle.cu b/tools/library/src/handle.cu index b2345932..bdddf2d7 100644 --- a/tools/library/src/handle.cu +++ b/tools/library/src/handle.cu @@ -26,7 +26,7 @@ /*! \file \brief CUTLASS Library handle. 
*/ - +#include #include #include @@ -43,7 +43,8 @@ namespace library { Handle::Handle( cudaStream_t stream, size_t workspace_size -): +): + provider_(Provider::kCUTLASS), stream_(stream), workspace_(nullptr), workspace_size_(0), @@ -95,6 +96,7 @@ Handle::Handle(Handle && handle) { /// Move assignment operator Handle & Handle::operator=(Handle && handle) { + provider_ = handle.provider_; device_ = handle.device_; workspace_size_ = handle.workspace_size_; workspace_ = handle.workspace_; @@ -121,6 +123,16 @@ cudaStream_t Handle::get_stream() const { return stream_; } +/// Gets the current provider +Provider Handle::get_provider() const { + return provider_; +} + +/// Sets the provider of operations +void Handle::set_provider(Provider provider) { + provider_ = provider; +} + /// Gets the device workspace size size_t Handle::get_workspace_size() const { return workspace_size_; @@ -351,6 +363,8 @@ Status Handle::gemm( // GemmFunctionalKey key( + provider_, + GemmKind::kGemm, element_compute, element_scalar, element_A, @@ -457,6 +471,188 @@ Status Handle::gemm( /////////////////////////////////////////////////////////////////////////////////////////////////// +/// Executes a GEMM computation: D <= alpha * A*B + beta * C. +// +// Supports batched-strided, batched array or split-K serial or split-K parallel. +// +Status Handle::gemm_universal( + + GemmUniversalMode mode, /// indicates the mode in which the kUniversal GEMM is launched + + int M, /// GEMM M dimension + int N, /// GEMM N dimension + int K, /// GEMM K dimension + + NumericTypeID element_compute, /// Data type of internal accumulation + + NumericTypeID element_scalar, /// Data type of alpha/beta scalars + + void const *alpha, /// Pointer to alpha scalar + + NumericTypeID element_A, /// Data type of A matrix elements + LayoutTypeID layout_A, /// Layout of A matrix + ComplexTransform transform_A, /// Complex transformation applied to A matrix - ignored for real-valued matrices + + void const * ptr_A, /// Pointer to A matrix in Global Memory + int lda, /// Leading dimension of A matrix + + NumericTypeID element_B, /// Data type of B matrix elements + LayoutTypeID layout_B, /// Layout of B matrix + ComplexTransform transform_B, /// Complex transformation applied to B matrix - ignored for real-valued matrices + + void const * ptr_B, /// Pointer to B matrix in Global Memory + int ldb, /// Leading dimension of B matrix + + void const * beta, /// Pointer to beta scalar + + NumericTypeID element_C, /// Data type of C and D matrices + + void const * ptr_C, /// Pointer to C matrix + int ldc, /// Leading dimension of C matrix + + void * ptr_D, /// Pointer to D matrix + int ldd, /// Leading dimension of D matrix + + int batch_count, /// Batch count or number of split-K slices + + int64_t batch_stride_A, /// Batch stride of A operand + int64_t batch_stride_B, /// Batch stride of B operand + int64_t batch_stride_C, /// Batch stride of C operand + int64_t batch_stride_D /// Batch stride of D operand +) { + + // + // Find the operation + // + + GemmFunctionalKey key( + provider_, + GemmKind::kUniversal, + element_compute, + element_scalar, + element_A, + layout_A, + transform_A, + element_B, + layout_B, + transform_B, + element_C + ); + + auto operators_it = Singleton::get().operation_table.gemm_operations.find(key); + + if (operators_it == Singleton::get().operation_table.gemm_operations.end()) { + return cutlass::Status::kErrorNotSupported; + } + + if (operators_it->second.empty()) { + return cutlass::Status::kErrorNotSupported; + } + + // + // 
Compute the largest alignment restriction the kernel can satisfy. + // + + // Maximum alignment expectation among all kernels (in units of bytes) + int const kMaximumAlignmentSize = 16; + + void const *ptr_A_check = ptr_A; + void const *ptr_B_check = ptr_B; + void const *ptr_C_check = ptr_C; + void * ptr_D_check = ptr_D; + + // Ignore alignment of pointers to pointers. We can't check this from the host, + // as each batch index has its own pointer in device memory. + if (mode == GemmUniversalMode::kArray) { + ptr_A_check = nullptr; + ptr_B_check = nullptr; + ptr_C_check = nullptr; + ptr_D_check = nullptr; + } + + int alignment = gemm_problem_alignment( + M, N, K, + element_A, ptr_A_check, lda, 0, + element_B, ptr_B_check, ldb, 0, + element_C, ptr_C_check, ldc, 0, + ptr_D_check, ldd, 0, kMaximumAlignmentSize + ); + + // + // Find the best kernel in descending order of preference. + // + + GemmPreferenceKey preference_key(compute_capability(), alignment); + + Operation const *operation = find_gemm_operation(operators_it, preference_key); + + if (!operation) { + return cutlass::Status::kErrorNotSupported; + } + + last_operation_ = operation; + + // + // Configure operation + // + + GemmUniversalConfiguration configuration{ + mode, + {M, N, K}, + batch_count, + lda, + ldb, + ldc, + ldd + }; + + // Query host work space size + uint64_t host_workspace_size_needed = operation->get_host_workspace_size(&configuration); + + if (uint64_t(kHostWorkspaceSize) < host_workspace_size_needed) { + return cutlass::Status::kErrorNotSupported; + } + + char host_workspace[kHostWorkspaceSize]; + + // Query device workspace size + uint64_t device_workspace_size_needed = operation->get_device_workspace_size(&configuration); + + if (uint64_t(workspace_size_) < device_workspace_size_needed) { + return cutlass::Status::kErrorNotSupported; + } + + // Initialize host and device workspaces + Status status = operation->initialize( + &configuration, + host_workspace, + workspace_, + stream_); + + if (status != cutlass::Status::kSuccess) { + return status; + } + + // Run the operator + GemmUniversalArguments arguments{ + ptr_A, + ptr_B, + ptr_C, + ptr_D, + alpha, + beta, + scalar_pointer_mode_, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + }; + + return operation->run(&arguments, host_workspace, workspace_, stream_); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// + /// Planar complex GEMM Status Handle::gemm_planar_complex( @@ -522,6 +718,8 @@ Status Handle::gemm_planar_complex( // GemmFunctionalKey key( + provider_, + GemmKind::kPlanarComplex, element_compute, element_scalar, element_A, @@ -533,9 +731,9 @@ Status Handle::gemm_planar_complex( element_C ); - auto operators_it = Singleton::get().operation_table.gemm_planar_complex_operations.find(key); + auto operators_it = Singleton::get().operation_table.gemm_operations.find(key); - if (operators_it == Singleton::get().operation_table.gemm_planar_complex_operations.end()) { + if (operators_it == Singleton::get().operation_table.gemm_operations.end()) { return cutlass::Status::kErrorNotSupported; } @@ -714,6 +912,8 @@ Status Handle::gemm_planar_complex_array( // GemmFunctionalKey key( + provider_, + GemmKind::kPlanarComplexArray, element_compute, element_scalar, element_A, @@ -725,9 +925,9 @@ Status Handle::gemm_planar_complex_array( element_C ); - auto operators_it = Singleton::get().operation_table.gemm_planar_complex_array_operations.find(key); + auto operators_it = 
Singleton::get().operation_table.gemm_operations.find(key); - if (operators_it == Singleton::get().operation_table.gemm_planar_complex_array_operations.end()) { + if (operators_it == Singleton::get().operation_table.gemm_operations.end()) { return cutlass::Status::kErrorNotSupported; } @@ -837,7 +1037,6 @@ Status Handle::gemm_planar_complex_array( } ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace library } // namespace cutlass diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 252d474e..73847b11 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -125,6 +125,14 @@ template <> struct NumericTypeMap > { static NumericTypeID const kId = NumericTypeID::kCF64; }; +template <> struct NumericTypeMap { + static NumericTypeID const kId = NumericTypeID::kBF16; +}; + +template <> struct NumericTypeMap { + static NumericTypeID const kId = NumericTypeID::kTF32; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// template struct MathOperationMap { @@ -143,6 +151,10 @@ template <> struct MathOperationMap { static MathOperationID const kId = MathOperationID::kMultiplyAddComplex; }; +template <> struct MathOperationMap { + static MathOperationID const kId = MathOperationID::kMultiplyAddGaussianComplex; +}; + template <> struct MathOperationMap { static MathOperationID const kId = MathOperationID::kXorPopc; }; @@ -217,33 +229,43 @@ template <> struct ComplexTransformMap { ///////////////////////////////////////////////////////////////////////////////////////////////// -template struct ArchMap; +template struct ArchMap; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 50; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 60; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { static int const kMin = 61; static int const kMax = 1024; }; -template <> struct ArchMap { +template <> struct ArchMap { + static int const kMin = 70; + static int const kMax = 1024; +}; + +template <> struct ArchMap { static int const kMin = 70; static int const kMax = 75; }; -template <> struct ArchMap { +template struct ArchMap { static int const kMin = 75; static int const kMax = 1024; }; +template struct ArchMap { + static int const kMin = 80; + static int const kMax = 1024; +}; + ///////////////////////////////////////////////////////////////////////////////////////////////// template diff --git a/tools/library/src/manifest.cpp b/tools/library/src/manifest.cpp index ca6d1781..d4e8a884 100644 --- a/tools/library/src/manifest.cpp +++ b/tools/library/src/manifest.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
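The ArchMap additions above are easier to read with a minimal sketch of the idea: every pair of architecture tag and opcode class advertises the compute-capability window in which a kernel built for that target may run. The tag and class names below are stand-ins rather than the real cutlass::arch types, and the reading of which Sm70 specialization is capped at 75 (the Tensor Core one) is a plausible interpretation of the table, not something the patch states explicitly.

// Hypothetical stand-ins for architecture tags and opcode classes.
struct Sm70 {};  struct Sm75 {};  struct Sm80 {};
struct OpClassSimt {};  struct OpClassTensorOp {};

template <typename Arch, typename OpClass> struct ArchMapSketch;

// SIMT kernels built for Volta keep running on later architectures.
template <> struct ArchMapSketch<Sm70, OpClassSimt> {
  static int const kMin = 70;
  static int const kMax = 1024;
};

// Volta Tensor Core kernels are limited to SM 7.0 through SM 7.5.
template <> struct ArchMapSketch<Sm70, OpClassTensorOp> {
  static int const kMin = 70;
  static int const kMax = 75;
};

// SM 7.5 and SM 8.0 targets accept any opcode class and have no compile-time upper bound.
template <typename OpClass> struct ArchMapSketch<Sm75, OpClass> {
  static int const kMin = 75;
  static int const kMax = 1024;
};

template <typename OpClass> struct ArchMapSketch<Sm80, OpClass> {
  static int const kMin = 80;
  static int const kMax = 1024;
};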
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -37,11 +37,6 @@ namespace library { ////////////////////////////////////////////////////////////////////////////////////////////////////////// -// init and insert all cutlass op in manifest object (procedurally generated using generator.py) -void initialize_all(Manifest &manifest); - -///////////////////////////////////////////////////////////////////////////////////////////////////////// - /// Top-level initialization Status Manifest::initialize() { @@ -49,13 +44,8 @@ Status Manifest::initialize() { operations_.clear(); } - switch(provider_) { - case Provider::kCUTLASS: - initialize_all(*this); break; - - default: - break; - } + // initialize procedurally generated cutlass op in manifest object + initialize_all(*this); return Status::kSuccess; } diff --git a/tools/library/src/operation_table.cu b/tools/library/src/operation_table.cu index 8fb0fe63..64e4f264 100644 --- a/tools/library/src/operation_table.cu +++ b/tools/library/src/operation_table.cu @@ -28,30 +28,7 @@ instances may be queried. */ -#include - -#include "cutlass/library/library.h" #include "cutlass/library/operation_table.h" -#include "cutlass/library/util.h" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -std::ostream & operator<<(std::ostream &out, cutlass::library::GemmFunctionalKey const &k) { - - out << "{\n" - << " element_compute: " << to_string(k.element_compute) << "\n" - << " element_scalar: " << to_string(k.element_scalar) << "\n" - << " element_A: " << to_string(k.element_A) << "\n" - << " layout_A: " << to_string(k.layout_A) << "\n" - << " transform_A: " << to_string(k.transform_A) << "\n" - << " element_B: " << to_string(k.element_B) << "\n" - << " layout_B: " << to_string(k.layout_B) << "\n" - << " transform_B: " << to_string(k.transform_B) << "\n" - << " element_C: " << to_string(k.element_C) << "\n" - << "}"; - - return out; -} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -67,85 +44,38 @@ void OperationTable::append(Manifest const &manifest) { OperationDescription const &desc = operation->description(); + // insert all gemm operation into operation table if (desc.kind == OperationKind::kGemm) { GemmDescription const &gemm_desc = static_cast(desc); - if (gemm_desc.gemm_kind == GemmKind::kGemm) { - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); + GemmFunctionalKey functional_key( + gemm_desc.provider, + gemm_desc.gemm_kind, + gemm_desc.tile_description.math_instruction.element_accumulator, + gemm_desc.element_epilogue, + gemm_desc.A.element, + gemm_desc.A.layout, + gemm_desc.transform_A, + gemm_desc.B.element, + gemm_desc.B.layout, + gemm_desc.transform_B, + gemm_desc.C.element + ); - Operation const *op = operation.get(); + Operation const *op = operation.get(); - int cc = gemm_desc.tile_description.minimum_compute_capability; + int cc = gemm_desc.tile_description.minimum_compute_capability; - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); + int alignment = std::max(std::max( + gemm_desc.A.alignment, gemm_desc.B.alignment), 
gemm_desc.C.alignment); - GemmPreferenceKey preference_key(cc, alignment); + GemmPreferenceKey preference_key(cc, alignment); - gemm_operations[functional_key][preference_key].push_back(op); - } - else if (gemm_desc.gemm_kind == GemmKind::kPlanarComplex) { - - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); - - Operation const *op = operation.get(); - - int cc = gemm_desc.tile_description.minimum_compute_capability; - - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); - - GemmPreferenceKey preference_key(cc, alignment); - - gemm_planar_complex_operations[functional_key][preference_key].push_back(op); - } - else if (gemm_desc.gemm_kind == GemmKind::kPlanarComplexArray) { - - GemmFunctionalKey functional_key( - gemm_desc.tile_description.math_instruction.element_accumulator, - gemm_desc.element_epilogue, - gemm_desc.A.element, - gemm_desc.A.layout, - gemm_desc.transform_A, - gemm_desc.B.element, - gemm_desc.B.layout, - gemm_desc.transform_B, - gemm_desc.C.element - ); - - Operation const *op = operation.get(); - - int cc = gemm_desc.tile_description.minimum_compute_capability; - - int alignment = std::max(std::max( - gemm_desc.A.alignment, gemm_desc.B.alignment), gemm_desc.C.alignment); - - GemmPreferenceKey preference_key(cc, alignment); - - gemm_planar_complex_array_operations[functional_key][preference_key].push_back(op); - } + gemm_operations[functional_key][preference_key].push_back(op); } + + } } diff --git a/tools/library/src/util.cu b/tools/library/src/util.cu index 12757292..427f0a2c 100644 --- a/tools/library/src/util.cu +++ b/tools/library/src/util.cu @@ -45,6 +45,7 @@ static struct { Provider enumerant; } Provider_enumerants[] = { + {"none", "None", Provider::kNone}, {"cutlass", "CUTLASS", Provider::kCUTLASS}, {"host", "reference_host", Provider::kReferenceHost}, {"device", "reference_device", Provider::kReferenceDevice}, @@ -83,6 +84,38 @@ Provider from_string(std::string const &str) { } +/////////////////////////////////////////////////////////////////////////////////////////////////// + +static struct { + char const *text; + char const *pretty; + GemmKind enumerant; +} +GemmKind_enumerants[] = { + {"gemm", "", GemmKind::kGemm}, + {"batched", "", GemmKind::kBatched}, + {"array", "", GemmKind::kArray}, + {"universal", "", GemmKind::kUniversal}, + {"planar_complex", "", GemmKind::kPlanarComplex}, + {"planar_complex_array", "", GemmKind::kPlanarComplexArray}, +}; + +/// Converts a ConvKind enumerant to a string +char const *to_string(GemmKind type, bool pretty) { + + for (auto const & possible : GemmKind_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? 
"Invalid" : "invalid"; +} ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -92,6 +125,7 @@ static struct { OperationKind enumerant; } OperationKind_enumerants[] = { + {"eq_gemm", "EqGemm", OperationKind::kEqGemm}, {"gemm", "Gemm", OperationKind::kGemm}, }; @@ -194,10 +228,14 @@ NumericTypeID_enumerants[] = { {"s32", "S32", NumericTypeID::kS32}, {"s64", "S64", NumericTypeID::kS64}, {"f16", "F16", NumericTypeID::kF16}, + {"bf16", "BF16", NumericTypeID::kBF16}, {"f32", "F32", NumericTypeID::kF32}, + {"tf32", "TF32", NumericTypeID::kTF32}, {"f64", "F64", NumericTypeID::kF64}, {"cf16", "CF16", NumericTypeID::kCF16}, + {"cbf16", "CBF16", NumericTypeID::kCBF16}, {"cf32", "CF32", NumericTypeID::kCF32}, + {"ctf32", "CTF32", NumericTypeID::kCTF32}, {"cf64", "CF64", NumericTypeID::kCF64}, {"cu4", "CU4", NumericTypeID::kCU4}, {"cu8", "CU8", NumericTypeID::kCU8}, @@ -249,10 +287,14 @@ NumericTypeID from_string(std::string const &str) { int sizeof_bits(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return 16; + case NumericTypeID::kBF16: return 16; + case NumericTypeID::kTF32: return 32; case NumericTypeID::kF32: return 32; case NumericTypeID::kF64: return 64; case NumericTypeID::kCF16: return 32; + case NumericTypeID::kCBF16: return 32; case NumericTypeID::kCF32: return 64; + case NumericTypeID::kCTF32: return 64; case NumericTypeID::kCF64: return 128; case NumericTypeID::kS4: return 4; case NumericTypeID::kS8: return 8; @@ -276,6 +318,8 @@ bool is_complex_type(NumericTypeID type) { case NumericTypeID::kCF16: return true; case NumericTypeID::kCF32: return true; case NumericTypeID::kCF64: return true; + case NumericTypeID::kCBF16: return true; + case NumericTypeID::kCTF32: return true; default: break; } return false; @@ -287,6 +331,8 @@ NumericTypeID get_real_type(NumericTypeID type) { case NumericTypeID::kCF16: return NumericTypeID::kF16; case NumericTypeID::kCF32: return NumericTypeID::kF32; case NumericTypeID::kCF64: return NumericTypeID::kF64; + case NumericTypeID::kCBF16: return NumericTypeID::kBF16; + case NumericTypeID::kCTF32: return NumericTypeID::kTF32; default: break; } return type; @@ -314,6 +360,8 @@ bool is_integer_type(NumericTypeID type) { bool is_signed_type(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return true; + case NumericTypeID::kBF16: return true; + case NumericTypeID::kTF32: return true; case NumericTypeID::kF32: return true; case NumericTypeID::kF64: return true; case NumericTypeID::kS4: return true; @@ -340,9 +388,13 @@ bool is_unsigned_integer(NumericTypeID type) { bool is_float_type(NumericTypeID type) { switch (type) { case NumericTypeID::kF16: return true; + case NumericTypeID::kBF16: return true; + case NumericTypeID::kTF32: return true; case NumericTypeID::kF32: return true; case NumericTypeID::kF64: return true; case NumericTypeID::kCF16: return true; + case NumericTypeID::kCBF16: return true; + case NumericTypeID::kCTF32: return true; case NumericTypeID::kCF32: return true; case NumericTypeID::kCF64: return true; default: break; @@ -431,7 +483,7 @@ OpcodeClassID_enumerants[] = { {"simt", "", OpcodeClassID::kSimt}, {"tensorop", "", OpcodeClassID::kTensorOp}, {"wmmatensorop", "", OpcodeClassID::kWmmaTensorOp}, - {"wmma", "", OpcodeClassID::kWmmaTensorOp} + {"wmma", "", OpcodeClassID::kWmmaTensorOp}, }; /// Converts a OpcodeClassID enumerant to a string @@ -509,6 +561,47 @@ ComplexTransform from_string(std::string const &str) { } +static struct { + char const *text; + char const 
*pretty; + SplitKMode enumerant; +} +SplitKMode_enumerants[] = { + {"serial", "", SplitKMode::kSerial}, + {"parallel", "", SplitKMode::kParallel}, +}; + +/// Converts a SplitKMode enumerant to a string +char const *to_string(SplitKMode type, bool pretty) { + + for (auto const & possible : SplitKMode_enumerants) { + if (type == possible.enumerant) { + if (pretty) { + return possible.pretty; + } + else { + return possible.text; + } + } + } + + return pretty ? "Invalid" : "invalid"; +} + +/// Converts a SplitKMode enumerant from a string +template <> +SplitKMode from_string(std::string const &str) { + + for (auto const & possible : SplitKMode_enumerants) { + if ((str.compare(possible.text) == 0) || + (str.compare(possible.pretty) == 0)) { + return possible.enumerant; + } + } + + return SplitKMode::kInvalid; +} + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Lexical cast a string to a byte array. Returns true if cast is successful or false if invalid. bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string const &str) { @@ -570,6 +663,20 @@ bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string c *reinterpret_cast(bytes.data()) = static_cast(tmp); } break; + case NumericTypeID::kBF16: + { + float tmp; + ss >> tmp; + *reinterpret_cast(bytes.data()) = static_cast(tmp); + } + break; + case NumericTypeID::kTF32: + { + float tmp; + ss >> tmp; + *reinterpret_cast(bytes.data()) = static_cast(tmp); + } + break; case NumericTypeID::kF32: { ss >> *reinterpret_cast(bytes.data()); @@ -589,11 +696,29 @@ bool lexical_cast(std::vector &bytes, NumericTypeID type, std::string c x->imag() = static_cast(std::imag(tmp)); } break; + case NumericTypeID::kCBF16: + { + std::complex tmp; + ss >> tmp; + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(std::real(tmp)); + x->imag() = static_cast(std::imag(tmp)); + } + break; case NumericTypeID::kCF32: { ss >> *reinterpret_cast*>(bytes.data()); } break; + case NumericTypeID::kCTF32: + { + std::complex tmp; + ss >> tmp; + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(std::real(tmp)); + x->imag() = static_cast(std::imag(tmp)); + } + break; case NumericTypeID::kCF64: { ss >> *reinterpret_cast*>(bytes.data()); @@ -674,6 +799,18 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { ss << tmp; } break; + case NumericTypeID::kBF16: + { + float tmp = *reinterpret_cast(bytes.data());; + ss << tmp; + } + break; + case NumericTypeID::kTF32: + { + float tmp = *reinterpret_cast(bytes.data());; + ss << tmp; + } + break; case NumericTypeID::kF32: { ss << *reinterpret_cast(bytes.data()); @@ -696,6 +833,18 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { } } break; + case NumericTypeID::kCBF16: + { + cutlass::complex const *x = + reinterpret_cast const *>(bytes.data()); + + ss << float(x->real()); + + if (x->imag() != cutlass::bfloat16_t()) { + ss << "+i" << float(x->imag()); + } + } + break; case NumericTypeID::kCF32: { cutlass::complex const * x = reinterpret_cast const *>(bytes.data()); @@ -707,6 +856,17 @@ std::string lexical_cast(std::vector &bytes, NumericTypeID type) { } } break; + case NumericTypeID::kCTF32: + { + cutlass::complex const * x = reinterpret_cast const *>(bytes.data()); + + ss << float(x->real()); + + if (x->imag() != tfloat32_t()) { + ss << "+i" << float(x->imag()); + } + } + break; case NumericTypeID::kCF64: { cutlass::complex const * x = reinterpret_cast const 
*>(bytes.data()); @@ -780,6 +940,16 @@ bool cast_from_int64(std::vector &bytes, NumericTypeID type, int64_t sr *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { *reinterpret_cast(bytes.data()) = static_cast(src); @@ -870,6 +1040,16 @@ bool cast_from_uint64(std::vector &bytes, NumericTypeID type, uint64_t *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { *reinterpret_cast(bytes.data()) = static_cast(src); @@ -961,6 +1141,16 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr *reinterpret_cast(bytes.data()) = static_cast(float(src)); } break; + case NumericTypeID::kBF16: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; + case NumericTypeID::kTF32: + { + *reinterpret_cast(bytes.data()) = static_cast(float(src)); + } + break; case NumericTypeID::kF32: { *reinterpret_cast(bytes.data()) = static_cast(src); @@ -978,11 +1168,23 @@ bool cast_from_double(std::vector &bytes, NumericTypeID type, double sr x->imag() = static_cast(float(0)); } break; + case NumericTypeID::kCBF16: + { + cutlass::complex *x = reinterpret_cast *>(bytes.data()); + x->real() = static_cast(bfloat16_t(src)); + x->imag() = static_cast(bfloat16_t(0)); + } + break; case NumericTypeID::kCF32: { *reinterpret_cast*>(bytes.data()) = std::complex(float(src), float(0)); } break; + case NumericTypeID::kCTF32: + { + *reinterpret_cast*>(bytes.data()) = std::complex(tfloat32_t(src), tfloat32_t(0)); + } + break; case NumericTypeID::kCF64: { *reinterpret_cast*>(bytes.data()) = std::complex(src, double(0)); diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 6e822c68..a47c8314 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/profiler/src/cublas_helpers.cpp b/tools/profiler/src/cublas_helpers.cpp index 5e5e2cb0..05262a22 100644 --- a/tools/profiler/src/cublas_helpers.cpp +++ b/tools/profiler/src/cublas_helpers.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
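As a usage note for the conversions above, a short hedged sketch: alpha and beta scalars are carried type-erased in byte vectors sized from sizeof_bits(), filled by cast_from_double() (or lexical_cast() when they arrive as strings), and handed to the library as void const * pointers. The function name and the element_epilogue parameter below are illustrative; whether cast_from_double resizes the buffer itself is not assumed, so the sketch resizes up front.

#include <cstdint>
#include <vector>
#include "cutlass/library/library.h"
#include "cutlass/library/util.h"

void set_host_scalars_sketch(
    cutlass::library::GemmUniversalArguments &arguments,
    cutlass::library::NumericTypeID element_epilogue,
    std::vector<uint8_t> &alpha_bytes,
    std::vector<uint8_t> &beta_bytes) {

  alpha_bytes.resize(cutlass::library::sizeof_bits(element_epilogue) / 8);
  beta_bytes.resize(cutlass::library::sizeof_bits(element_epilogue) / 8);

  // Works for f16, bf16, tf32, f32, f64 and the complex variants handled above.
  cutlass::library::cast_from_double(alpha_bytes, element_epilogue, 1.0);
  cutlass::library::cast_from_double(beta_bytes,  element_epilogue, 0.0);

  arguments.alpha = alpha_bytes.data();
  arguments.beta  = beta_bytes.data();
  arguments.pointer_mode = cutlass::library::ScalarPointerMode::kHost;
}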
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -52,15 +52,35 @@ Status get_cutlass_status(cublasStatus_t cublas) { } /// Maps a CUTLASS tensor layout to a cuBLAS transpose operation -cublasOperation_t get_cublas_transpose_operation(library::LayoutTypeID layout) { +bool get_cublas_transpose_operation( + cublasOperation_t &operation, + library::LayoutTypeID layout, + library::ComplexTransform transform) { + switch (layout) { case library::LayoutTypeID::kColumnMajor: - return CUBLAS_OP_N; + if (transform == library::ComplexTransform::kNone) { + operation = CUBLAS_OP_N; + return true; + } + else { + return false; + } + break; case library::LayoutTypeID::kRowMajor: - return CUBLAS_OP_T; + if (transform == library::ComplexTransform::kNone) { + operation = CUBLAS_OP_T; + return true; + } + else if (transform == library::ComplexTransform::kConjugate) { + operation = CUBLAS_OP_C; + return true; + } + break; default: break; } - throw std::runtime_error("CUTLASS layout type does not correspond to cublas type"); + + return false; } /// Maps a CUTLASS numeric type to a cuBLAS data type enumeration @@ -114,6 +134,14 @@ bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID ele case library::NumericTypeID::kB1: break; + + case library::NumericTypeID::kCF32: + data_type = CUDA_C_32F; + return true; + + case library::NumericTypeID::kCF64: + data_type = CUDA_C_64F; + return true; case library::NumericTypeID::kInvalid: @@ -157,6 +185,104 @@ Status cublas_satisfies(library::GemmDescription const &desc) { ///////////////////////////////////////////////////////////////////////////////////////////////// +namespace detail { + +cublasGemmExDispatcher::cublasGemmExDispatcher( + library::GemmDescription const &op_desc, + library::GemmUniversalConfiguration configuration_, + library::GemmUniversalArguments arguments_, + cublasGemmAlgo_t algorithm +): + configuration(configuration_), arguments(arguments_), algo(algorithm), status(Status::kSuccess) { + + bool good = true; + + good = (good && get_cublas_transpose_operation(trans_A, op_desc.A.layout, op_desc.transform_A)); + good = (good && get_cublas_transpose_operation(trans_B, op_desc.B.layout, op_desc.transform_B)); + good = (good && get_cublas_datatype(data_type_A, op_desc.A.element)); + good = (good && get_cublas_datatype(data_type_B, op_desc.B.element)); + good = (good && get_cublas_datatype(data_type_C, op_desc.C.element)); + + good = (good && get_cublas_datatype( + compute_data_type, + op_desc.tile_description.math_instruction.element_accumulator)); + + // cuBLAS introduces a separate cublasComputeType enumerant to more precisely describe + // internal numerical data types used in the computation. +#if (__CUDA_VER_MAJOR__ >= 11) + library::OpcodeClassID const & opcode_class = + op_desc.tile_description.math_instruction.opcode_class; + + if (good && + op_desc.A.element == library::NumericTypeID::kF32 && + op_desc.B.element == library::NumericTypeID::kF32 && + opcode_class == library::OpcodeClassID::kTensorOp) { + + compute_type = CUBLAS_COMPUTE_32F_FAST_TF32; + } + else if (good) { + bool const isPedantic = false; + switch (compute_data_type) { + case CUDA_R_32F: + case CUDA_C_32F: + compute_type = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F; + break; + case CUDA_R_64F: + case CUDA_C_64F: + compute_type = isPedantic ? 
CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F; + break; + case CUDA_R_16F: + compute_type = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F; + break; + case CUDA_R_32I: + compute_type = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I; + break; + default: + good = false; + break; + } + } +#endif // __CUDA_VER_MAJOR__ >= 11 + + if (!good) { + status = Status::kErrorNotSupported; + } +} + +/// Executes GEMM using these arguments +cublasStatus_t cublasGemmExDispatcher::operator()(cublasHandle_t handle) { + + return cublasGemmEx( + handle, + trans_A, + trans_B, + configuration.problem_size.m(), + configuration.problem_size.n(), + configuration.problem_size.k(), + arguments.alpha, + arguments.A, + data_type_A, + int(configuration.lda), + arguments.B, + data_type_B, + int(configuration.ldb), + arguments.beta, + arguments.D, + data_type_C, + int(configuration.ldc), +#if (__CUDA_VER_MAJOR__ >= 11) + compute_type, +#else + compute_data_type, +#endif + algo + ); +} + + +///////////////////////////////////////////////////////////////////////////////////////////////// +} // namespace detail + } // namespace profiler } // namespace cutlass diff --git a/tools/profiler/src/cublas_helpers.h b/tools/profiler/src/cublas_helpers.h index 0ade0961..9c807846 100644 --- a/tools/profiler/src/cublas_helpers.h +++ b/tools/profiler/src/cublas_helpers.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -33,7 +33,10 @@ #include "cutlass/cutlass.h" #include "cutlass/library/library.h" +#include "cutlass/library/util.h" + #include "options.h" + ///////////////////////////////////////////////////////////////////////////////////////////////// namespace cutlass { @@ -45,7 +48,10 @@ namespace profiler { Status get_cutlass_status(cublasStatus_t cublas); /// Maps a CUTLASS tensor layout to a cuBLAS transpose operation -cublasOperation_t get_cublas_transpose_operation(library::LayoutTypeID layout); +bool get_cublas_transpose_operation( + cublasOperation_t &operation, + library::LayoutTypeID layout, + library::ComplexTransform transform = library::ComplexTransform::kNone); /// Maps a CUTLASS numeric type to a cuBLAS data type enumeration bool get_cublas_datatype(cublasDataType_t &data_type, library::NumericTypeID element_type); @@ -168,8 +174,8 @@ struct cublasGemmExDispatcher { // // Data members // - library::GemmConfiguration configuration; - library::GemmArguments arguments; + library::GemmUniversalConfiguration configuration; + library::GemmUniversalArguments arguments; // cublass-specific data structures to fill cublas API call arguments cublasOperation_t trans_A; @@ -177,7 +183,12 @@ struct cublasGemmExDispatcher { cudaDataType_t data_type_A; cudaDataType_t data_type_B; cudaDataType_t data_type_C; - cudaDataType_t compute_type; + cudaDataType_t compute_data_type; + +#if (__CUDA_VER_MAJOR__ >= 11) + cublasComputeType_t compute_type; +#endif + cublasGemmAlgo_t algo; Status status; @@ -187,54 +198,13 @@ struct cublasGemmExDispatcher { cublasGemmExDispatcher( library::GemmDescription const &op_desc, - library::GemmConfiguration configuration_, - library::GemmArguments arguments_, + library::GemmUniversalConfiguration configuration_, + 
library::GemmUniversalArguments arguments_, cublasGemmAlgo_t algorithm = CUBLAS_GEMM_DFALT - ): - configuration(configuration_), arguments(arguments_), algo(algorithm), status(Status::kSuccess) { - - trans_A = get_cublas_transpose_operation(op_desc.A.layout); - trans_B = get_cublas_transpose_operation(op_desc.B.layout); - - bool good = true; - good = (good && get_cublas_datatype(data_type_A, op_desc.A.element)); - good = (good && get_cublas_datatype(data_type_B, op_desc.B.element)); - good = (good && get_cublas_datatype(data_type_C, op_desc.C.element)); - - good = (good && get_cublas_datatype( - compute_type, - op_desc.tile_description.math_instruction.element_accumulator)); - - if (!good) { - status = Status::kErrorNotSupported; - } - } + ); /// Executes GEMM using these arguments - cublasStatus_t operator()(cublasHandle_t handle) { - - return cublasGemmEx( - handle, - trans_A, - trans_B, - configuration.problem_size.m(), - configuration.problem_size.n(), - configuration.problem_size.k(), - arguments.alpha, - arguments.A, - data_type_A, - int(configuration.lda), - arguments.B, - data_type_B, - int(configuration.ldb), - arguments.beta, - arguments.D, - data_type_C, - int(configuration.ldc), - compute_type, - algo - ); - } + cublasStatus_t operator()(cublasHandle_t handle); }; /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index b36f897b..90f4a959 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -44,7 +44,7 @@ CutlassProfiler::CutlassProfiler( ): options_(options) { - operation_profilers_.emplace_back(new GemmOperationProfiler); + operation_profilers_.emplace_back(new GemmOperationProfiler(options)); } @@ -108,13 +108,6 @@ void CutlassProfiler::enumerate_() { /// Profiles all operations int CutlassProfiler::profile_() { - library::Manifest manifest(library::Provider::kCUTLASS); - Status status = manifest.initialize(); - - if (status != Status::kSuccess) { - return -1; - } - int result = 0; DeviceContext device_context; @@ -124,7 +117,7 @@ int CutlassProfiler::profile_() { if (options_.operation_kind == library::OperationKind::kInvalid || options_.operation_kind == profiler->kind()) { - result = profiler->profile_all(options_, manifest, device_context); + result = profiler->profile_all(options_, library::Singleton::get().manifest, device_context); if (result) { return result; diff --git a/tools/profiler/src/cutlass_profiler.h b/tools/profiler/src/cutlass_profiler.h index eda24c5b..d3b592a4 100644 --- a/tools/profiler/src/cutlass_profiler.h +++ b/tools/profiler/src/cutlass_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
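To illustrate the reworked transpose mapping introduced above: the cuBLAS operation is now derived from both the CUTLASS layout and the complex transform, and unsupported combinations are reported by returning false instead of throwing. A hedged sketch follows; the layout and transform values are example inputs only.

#include <cublas_v2.h>
#include "cutlass/library/library.h"

void transpose_mapping_sketch() {
  cublasOperation_t trans;

  // Row-major with a conjugate transform maps onto CUBLAS_OP_C.
  bool ok = cutlass::profiler::get_cublas_transpose_operation(
      trans,
      cutlass::library::LayoutTypeID::kRowMajor,
      cutlass::library::ComplexTransform::kConjugate);     // ok == true

  // Column-major with a conjugate transform has no single cuBLAS operation; the helper
  // now returns false so the caller can mark cuBLAS verification as not supported.
  ok = cutlass::profiler::get_cublas_transpose_operation(
      trans,
      cutlass::library::LayoutTypeID::kColumnMajor,
      cutlass::library::ComplexTransform::kConjugate);     // ok == false
  (void)ok;
}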
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -30,6 +30,7 @@ // CUTLASS Library includes #include "cutlass/library/library.h" #include "cutlass/library/manifest.h" +#include "cutlass/library/singleton.h" #include "options.h" #include "operation_profiler.h" diff --git a/tools/profiler/src/debug.h b/tools/profiler/src/debug.h index 8aad2ee9..aed11ca1 100644 --- a/tools/profiler/src/debug.h +++ b/tools/profiler/src/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index c97f0de4..4045abfe 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -431,6 +431,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { dist ); break; + case library::NumericTypeID::kCF32: + cutlass::reference::device::BlockFillRandom>( + reinterpret_cast *>(pointer_), + capacity_, + seed, + dist + ); + break; case library::NumericTypeID::kF64: cutlass::reference::device::BlockFillRandom( reinterpret_cast(pointer_), @@ -548,6 +556,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { dist ); break; + case library::NumericTypeID::kCF32: + cutlass::reference::host::BlockFillRandom>( + reinterpret_cast *>(host_data.data()), + capacity_, + seed, + dist + ); + break; case library::NumericTypeID::kF64: cutlass::reference::host::BlockFillRandom( reinterpret_cast(host_data.data()), @@ -655,6 +671,12 @@ bool DeviceAllocation::block_compare_equal( reinterpret_cast(ptr_A), reinterpret_cast(ptr_B), capacity); + + case library::NumericTypeID::kCF32: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); case library::NumericTypeID::kCF16: return reference::device::BlockCompareEqual>( @@ -825,6 +847,23 @@ bool DeviceAllocation::block_compare_relatively_equal( static_cast(epsilon), static_cast(nonzero_floor)); + // No relatively equal comparison for complex numbers. + // + // As a simplification, we can require bitwise equality. This avoids false positives. + // (i.e. "pass" really means passing. "Fail" may not actually mean failure given appropriate epsilon.) 
+ // + case library::NumericTypeID::kCF32: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); + + case library::NumericTypeID::kCF64: + return reference::device::BlockCompareEqual >( + reinterpret_cast const *>(ptr_A), + reinterpret_cast const *>(ptr_B), + capacity); + default: throw std::runtime_error("Unsupported numeric type"); } @@ -970,6 +1009,14 @@ void DeviceAllocation::write_tensor_csv( case library::NumericTypeID::kU64: write_tensor_csv_static_type(out, *this); break; + + case library::NumericTypeID::kCF32: + write_tensor_csv_static_type >(out, *this); + break; + + case library::NumericTypeID::kCF64: + write_tensor_csv_static_type >(out, *this); + break; default: throw std::runtime_error("Unsupported numeric type"); diff --git a/tools/profiler/src/device_allocation.h b/tools/profiler/src/device_allocation.h index be69f037..f57cda14 100644 --- a/tools/profiler/src/device_allocation.h +++ b/tools/profiler/src/device_allocation.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_context.cu b/tools/profiler/src/device_context.cu index 780e0447..f9cfe9ab 100644 --- a/tools/profiler/src/device_context.cu +++ b/tools/profiler/src/device_context.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/device_context.h b/tools/profiler/src/device_context.h index 7be0349a..aea872ef 100644 --- a/tools/profiler/src/device_context.h +++ b/tools/profiler/src/device_context.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/enumerated_types.cpp b/tools/profiler/src/enumerated_types.cpp index 1acefb1f..29be6f8b 100644 --- a/tools/profiler/src/enumerated_types.cpp +++ b/tools/profiler/src/enumerated_types.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
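One small convention worth calling out from the lexical_cast additions above: complex values are printed as the real part followed by "+i" and the imaginary part, with the imaginary part omitted when it is zero. A minimal sketch (the value is an arbitrary example):

#include <sstream>
#include <string>
#include "cutlass/complex.h"

std::string format_complex_sketch() {
  cutlass::complex<float> x(1.5f, -2.0f);
  std::ostringstream ss;

  ss << float(x.real());
  if (x.imag() != float()) {
    ss << "+i" << float(x.imag());
  }
  return ss.str();   // "1.5+i-2"
}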
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/enumerated_types.h b/tools/profiler/src/enumerated_types.h index 051406d1..e7e713bd 100644 --- a/tools/profiler/src/enumerated_types.h +++ b/tools/profiler/src/enumerated_types.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -50,7 +50,7 @@ T from_string(std::string const &); enum class ExecutionMode { kProfile, ///< regular verification and profiling kDryRun, ///< no kernels are launched or workspaces allocated; used to assess what operators might be launched - kEnumerate, ///< no kernels launched or workspaces allocated; lists all function types and functions + kEnumerate, ///< no kernels launched or workspaces allocated; lists all operation kind and operations kTrace, ///< executes a single device-side computation with no other kernel launches kInvalid }; diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu index cb430968..f494eeee 100644 --- a/tools/profiler/src/gemm_operation_profiler.cu +++ b/tools/profiler/src/gemm_operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,8 @@ #include #include +#include "cutlass/core_io.h" + #include "cublas_helpers.h" #include "gemm_operation_profiler.h" #include "gpu_timer.h" @@ -44,22 +46,27 @@ namespace profiler { ///////////////////////////////////////////////////////////////////////////////////////////////// /// Ctor -GemmOperationProfiler::GemmOperationProfiler(): - OperationProfiler(library::OperationKind::kGemm,{ - {ArgumentTypeID::kEnumerated, {"Gemm_kind"}, "Variant of GEMM (e.g. 
gemm, planar complex, batched, ...)"}, - {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"}, - {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"}, - {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"}, - {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, - {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, - {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, - {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, - {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, - {ArgumentTypeID::kInteger, {"split_k_slices"}, "Number of partitions of K dimension"}, - {ArgumentTypeID::kInteger, {"batch_count"}, "Number of GEMMs computed in one batch"}, - }) { +GemmOperationProfiler::GemmOperationProfiler(Options const &options): + OperationProfiler( + options, + library::OperationKind::kGemm, + { + {ArgumentTypeID::kEnumerated, {"gemm_kind"}, "Variant of GEMM (gemm, batched, array, universal, planar_complex, planar_complex_array)"}, + {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the GEMM problem space"}, + {ArgumentTypeID::kInteger, {"n", "problem-size::n"}, "N dimension of the GEMM problem space"}, + {ArgumentTypeID::kInteger, {"k", "problem-size::k"}, "K dimension of the GEMM problem space"}, + {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, + {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, + {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, + {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, + {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, + {ArgumentTypeID::kInteger, {"split_k_slices", "split-k-slices"}, "Number of partitions of K dimension"}, + {ArgumentTypeID::kInteger, {"batch_count", "batch-count"}, "Number of GEMMs computed in one batch"}, + }, + { library::Provider::kCUBLAS} + ) { - description_ = "General matrix-matrix product. D = alpha * A*B + beta * C"; + description_ = " General matrix-matrix product. 
D = alpha * A*B + beta * C"; } /// Destructor @@ -107,6 +114,8 @@ void GemmOperationProfiler::print_examples(std::ostream &out) const { << " --providers=cutlass --output=functional-test.csv\n\n"; } +///////////////////////////////////////////////////////////////////////////////////////////////// + #if 0 // used this for debugging static std::string byte_string(std::vector const &bytes) { @@ -122,47 +131,34 @@ static std::string byte_string(std::vector const &bytes) { } #endif -///////////////////////////////////////////////////////////////////////////////////////////////// - -/// Extracts the problem dimensions -Status GemmOperationProfiler::initialize_configuration( - Options const &options, - PerformanceReport &report, - DeviceContext &device_context, - library::Operation const *operation, +Status GemmOperationProfiler::GemmProblem::parse( + library::GemmDescription const &operation_desc, ProblemSpace const &problem_space, ProblemSpace::Problem const &problem) { - - library::GemmDescription const &operation_desc = - static_cast(operation->description()); - - if (operation_desc.gemm_kind != library::GemmKind::kGemm) { - return Status::kErrorInvalidProblem; + + if (!arg_as_int(this->m, "m", problem_space, problem)) { + // default value + this->m = 1024; } - if (!arg_as_int(problem_.m, "m", problem_space, problem)) { + if (!arg_as_int(this->n, "n", problem_space, problem)) { // default value - problem_.m = 1024; - } - - if (!arg_as_int(problem_.n, "n", problem_space, problem)) { - // default value - problem_.n = 1024; + this->n = 1024; } - if (!arg_as_int(problem_.k, "k", problem_space, problem)) { + if (!arg_as_int(this->k, "k", problem_space, problem)) { // default value - problem_.k = 1024; + this->k = 1024; } - if (!arg_as_int(problem_.split_k_slices, "split_k_slices", problem_space, problem)) { + if (!arg_as_int(this->split_k_slices, "split_k_slices", problem_space, problem)) { // default value - problem_.split_k_slices = 1; + this->split_k_slices = 1; } - if (!arg_as_int(problem_.batch_count, "batch_count", problem_space, problem)) { + if (!arg_as_int(this->batch_count, "batch_count", problem_space, problem)) { // default value - problem_.batch_count = 1; + this->batch_count = 1; } if (!tensor_description_satisfies(operation_desc.A, "A", problem_space, problem)) { @@ -178,37 +174,97 @@ Status GemmOperationProfiler::initialize_configuration( } if (!arg_as_scalar( - problem_.alpha, + this->alpha, operation_desc.element_epilogue, "alpha", problem_space, problem)) { - if (!cast_from_double(problem_.alpha, operation_desc.element_epilogue, 1)) { + if (!cast_from_double(this->alpha, operation_desc.element_epilogue, 1)) { return Status::kErrorInternal; } } if (!arg_as_scalar( - problem_.beta, + this->beta, operation_desc.element_epilogue, "beta", problem_space, problem)) { - if (!cast_from_double(problem_.beta, operation_desc.element_epilogue, 0)) { + if (!cast_from_double(this->beta, operation_desc.element_epilogue, 0)) { return Status::kErrorInternal; } } - problem_.lda = DeviceAllocation::get_packed_layout( - operation_desc.A.layout, {int(problem_.m), int(problem_.k)}).front(); + this->lda = DeviceAllocation::get_packed_layout( + operation_desc.A.layout, {int(this->m), int(this->k)}).front(); - problem_.ldb = DeviceAllocation::get_packed_layout( - operation_desc.B.layout, {int(problem_.k), int(problem_.n)}).front(); + this->ldb = DeviceAllocation::get_packed_layout( + operation_desc.B.layout, {int(this->k), int(this->n)}).front(); - problem_.ldc = DeviceAllocation::get_packed_layout( - 
operation_desc.C.layout, {int(problem_.m), int(problem_.n)}).front(); + this->ldc = DeviceAllocation::get_packed_layout( + operation_desc.C.layout, {int(this->m), int(this->n)}).front(); + + return Status::kSuccess; +} + +/// Initializes a performance result +void GemmOperationProfiler::GemmProblem::initialize_result( + PerformanceResult &result, + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space) { + + result.arguments.resize(problem_space.rank()); + + set_argument(result, "gemm_kind", problem_space, library::to_string(operation_desc.gemm_kind)); + + set_argument(result, "A", problem_space, + std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout)); + + set_argument(result, "B", problem_space, + std::string(library::to_string(operation_desc.B.element)) + ":" + library::to_string(operation_desc.B.layout)); + + set_argument(result, "C", problem_space, + std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + + set_argument(result, "m", problem_space, m); + set_argument(result, "n", problem_space, n); + set_argument(result, "k", problem_space, k); + + set_argument(result, "split_k_slices", problem_space, split_k_slices); + set_argument(result, "batch_count", problem_space, batch_count); + + set_argument(result, "alpha", problem_space, + library::lexical_cast(alpha, operation_desc.element_epilogue)); + + set_argument(result, "beta", problem_space, + library::lexical_cast(beta, operation_desc.element_epilogue)); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +/// Extracts the problem dimensions +Status GemmOperationProfiler::initialize_configuration( + Options const &options, + PerformanceReport &report, + DeviceContext &device_context, + library::Operation const *operation, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + library::GemmDescription const &operation_desc = + static_cast(operation->description()); + + if (operation_desc.gemm_kind != library::GemmKind::kUniversal) { + return Status::kErrorInvalidProblem; + } + + Status status = problem_.parse(operation_desc, problem_space, problem); + + if (status != Status::kSuccess) { + return status; + } gemm_workspace_.configuration.problem_size.m() = int(problem_.m); gemm_workspace_.configuration.problem_size.n() = int(problem_.n); @@ -217,7 +273,8 @@ Status GemmOperationProfiler::initialize_configuration( gemm_workspace_.configuration.ldb = problem_.ldb; gemm_workspace_.configuration.ldc = problem_.ldc; gemm_workspace_.configuration.ldd = problem_.ldc; - gemm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices); + //gemm_workspace_.configuration.split_k_slices = int(problem_.split_k_slices); + gemm_workspace_.configuration.batch_count = int(problem_.split_k_slices); gemm_workspace_.arguments.A = nullptr; gemm_workspace_.arguments.B = nullptr; @@ -243,37 +300,24 @@ void GemmOperationProfiler::initialize_result_( result.disposition = Disposition::kNotRun; result.status = Status::kSuccess; result.operation_name = operation_desc.name; - - result.arguments.resize(problem_space.rank()); - - set_argument_(result, "A", problem_space, - std::string(library::to_string(operation_desc.A.element)) + ":" + library::to_string(operation_desc.A.layout)); - - set_argument_(result, "B", problem_space, - std::string(library::to_string(operation_desc.B.element)) + ":" + 
library::to_string(operation_desc.B.layout)); - - set_argument_(result, "C", problem_space, - std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); - - set_argument_(result, "m", problem_space, problem_.m); - set_argument_(result, "n", problem_space, problem_.n); - set_argument_(result, "k", problem_space, problem_.k); - - set_argument_(result, "split_k_slices", problem_space, problem_.split_k_slices); - set_argument_(result, "batch_count", problem_space, problem_.batch_count); - - set_argument_(result, "alpha", problem_space, - library::lexical_cast(problem_.alpha, operation_desc.element_epilogue)); - - set_argument_(result, "beta", problem_space, - library::lexical_cast(problem_.beta, operation_desc.element_epilogue)); + + problem_.initialize_result(result, operation_desc, problem_space); OperationProfiler::initialize_result_(result, operation_desc, problem_space); + // Input bytes read and Output bytes written for the gemm problem result.bytes = int64_t(library::sizeof_bits(operation_desc.A.element) * problem_.m / 8) * problem_.k + int64_t(library::sizeof_bits(operation_desc.B.element) * problem_.n / 8) * problem_.k + - int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n * 2; + int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n; + + // Set is_beta_zero true if beta is zero + bool is_beta_zero = std::all_of(problem_.beta.begin(), problem_.beta.end(), [](uint8_t i) { return i==0; }); + + // Output bytes read for the gemm problem for non-zero beta values + if (!is_beta_zero) { + result.bytes += int64_t(library::sizeof_bits(operation_desc.C.element) * problem_.m / 8) * problem_.n; + } result.flops = 2 * (problem_.m * problem_.n * problem_.k + problem_.m * problem_.n); result.runtime = 0; @@ -378,8 +422,9 @@ Status GemmOperationProfiler::initialize_workspace( results_.back().provider = library::Provider::kCUTLASS; results_.back().op_kind = library::OperationKind::kGemm; results_.back().disposition = Disposition::kNotRun; - for(auto &verification_provider : options.verification.providers) { - results_.back().verification_map[verification_provider] = Disposition::kNotRun; + + for(auto provider : verification_providers_) { + results_.back().verification_map[provider] = Disposition::kNotRun; } } @@ -559,8 +604,7 @@ bool GemmOperationProfiler::verify_with_cublas_( ); if (gemm_op.status != Status::kSuccess) { - - results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kFailed; + results_.back().verification_map[library::Provider::kCUBLAS] = Disposition::kNotRun; return true; } diff --git a/tools/profiler/src/gemm_operation_profiler.h b/tools/profiler/src/gemm_operation_profiler.h index 3bd0bb62..e4d23212 100644 --- a/tools/profiler/src/gemm_operation_profiler.h +++ b/tools/profiler/src/gemm_operation_profiler.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
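To make the revised performance model above easier to audit, here is a hedged restatement in plain C++ (M, N, K and the sizeof_bits_* values are assumed inputs): D is always counted as written, C is counted as read only when beta is nonzero, and the epilogue contributes 2*M*N flops on top of the 2*M*N*K multiply-accumulates.

#include <cstdint>

int64_t gemm_bytes_sketch(int64_t M, int64_t N, int64_t K,
                          int sizeof_bits_A, int sizeof_bits_B, int sizeof_bits_C,
                          bool is_beta_zero) {
  int64_t bytes =
      int64_t(sizeof_bits_A / 8) * M * K +        // A read
      int64_t(sizeof_bits_B / 8) * N * K +        // B read
      int64_t(sizeof_bits_C / 8) * M * N;         // D written

  if (!is_beta_zero) {
    bytes += int64_t(sizeof_bits_C / 8) * M * N;  // C is also read when beta != 0
  }
  return bytes;
}

int64_t gemm_flops_sketch(int64_t M, int64_t N, int64_t K) {
  return 2 * (M * N * K + M * N);                 // multiply-accumulate plus epilogue
}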
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ #include #include #include +#include #include // CUTLASS Library includes @@ -75,6 +76,18 @@ public: GemmProblem(): m(16), n(16), k(16), lda(0), ldb(0), ldc(0), split_k_slices(1), batch_count(1) { } + + /// Parses the problem + Status parse( + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + + /// Initializes a performance result + void initialize_result( + PerformanceResult &result, + library::GemmDescription const &operation_desc, + ProblemSpace const &problem_space); }; /// Workspace used @@ -86,8 +99,8 @@ public: DeviceAllocation *Computed; DeviceAllocation *Reference; - library::GemmConfiguration configuration; - library::GemmArguments arguments; + library::GemmUniversalConfiguration configuration; + library::GemmUniversalArguments arguments; /// Buffer used for the operation's host workspace std::vector host_workspace; @@ -122,7 +135,7 @@ public: // /// Ctor - GemmOperationProfiler(); + GemmOperationProfiler(Options const &options); /// Destructor virtual ~GemmOperationProfiler(); diff --git a/tools/profiler/src/gpu_timer.cpp b/tools/profiler/src/gpu_timer.cpp index 218e09d3..eb3a8411 100644 --- a/tools/profiler/src/gpu_timer.cpp +++ b/tools/profiler/src/gpu_timer.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/gpu_timer.h b/tools/profiler/src/gpu_timer.h index ca00ad7a..5cd4b003 100644 --- a/tools/profiler/src/gpu_timer.h +++ b/tools/profiler/src/gpu_timer.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/main.cpp b/tools/profiler/src/main.cpp index a76fcf9a..a1e52311 100644 --- a/tools/profiler/src/main.cpp +++ b/tools/profiler/src/main.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index 6d21f87e..754118a7 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
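The workspace now carries library::GemmUniversalConfiguration / GemmUniversalArguments, and the split-K slice count is routed through the configuration's batch_count field (see the commented-out split_k_slices assignment earlier in gemm_operation_profiler.cu). A hedged sketch of populating such a configuration; the field names come from this patch, while the local variables are placeholders:

    // Illustrative only: fill a GemmUniversal-style configuration from a parsed problem.
    library::GemmUniversalConfiguration configuration;

    configuration.problem_size = cutlass::gemm::GemmCoord(int(m), int(n), int(k));
    configuration.lda = lda;
    configuration.ldb = ldb;
    configuration.ldc = ldc;
    configuration.ldd = ldc;                           // D shares C's leading dimension
    configuration.batch_count = int(split_k_slices);   // batch_count carries the split-K slice count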
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef __unix__ #include @@ -55,30 +56,41 @@ OperationProfiler::OperationProfiler(): kind_(library::OperationKind::kInvalid) /// Ctor OperationProfiler::OperationProfiler( + Options const &options, library::OperationKind kind, ArgumentDescriptionVector const &arguments, - ProviderVector const & reference_providers + ProviderVector const & verification_providers ): - kind_(kind), arguments_(arguments), reference_providers_(reference_providers) { + kind_(kind), arguments_(arguments) { ArgumentDescriptionVector tile_description_arguments{ - {ArgumentTypeID::kEnumerated, {"op_class", "opcode-class"}, "Class of math instruction (SIMT or TensorOp)."}, - {ArgumentTypeID::kEnumerated, {"accum", "accumulator-type"}, "Math instruction accumulator data type."}, - {ArgumentTypeID::kInteger, {"cta_m", "threadblock-shape::m"}, "Threadblock shape in the M dimension."}, - {ArgumentTypeID::kInteger, {"cta_n", "threadblock-shape::n"}, "Threadblock shape in the N dimension."}, - {ArgumentTypeID::kInteger, {"cta_k", "threadblock-shape::k"}, "Threadblock shape in the K dimension."}, - {ArgumentTypeID::kInteger, {"stages", "threadblock-stages"}, "Number of stages of threadblock-scoped matrix multiply."}, - {ArgumentTypeID::kInteger, {"warps_m", "warp-count::m"}, "Number of warps within threadblock along the M dimension."}, - {ArgumentTypeID::kInteger, {"warps_n", "warp-count::n"}, "Number of warps within threadblock along the N dimension."}, - {ArgumentTypeID::kInteger, {"warps_k", "warp-count::k"}, "Number of warps within threadblock along the K dimension."}, - {ArgumentTypeID::kInteger, {"inst_m", "instruction-shape::m"}, "Math instruction shape in the M dimension."}, - {ArgumentTypeID::kInteger, {"inst_n", "instruction-shape::n"}, "Math instruction shape in the N dimension."}, - {ArgumentTypeID::kInteger, {"inst_k", "instruction-shape::k"}, "Math instruction shape in the K dimension."}, - {ArgumentTypeID::kInteger, {"min_cc", "minimum-compute-capability"}, "Minimum device compute capability."}, - {ArgumentTypeID::kInteger, {"max_cc", "maximum-compute-capability"}, "Maximum device compute capability."} + {ArgumentTypeID::kEnumerated, {"op_class", "opcode-class"}, "Class of math instruction (simt, tensorop, wmmatensorop, wmma)"}, + {ArgumentTypeID::kEnumerated, {"accum", "accumulator-type"}, "Math instruction accumulator data type"}, + {ArgumentTypeID::kInteger, {"cta_m", "threadblock-shape::m"}, "Threadblock shape in the M dimension"}, + {ArgumentTypeID::kInteger, {"cta_n", "threadblock-shape::n"}, "Threadblock shape in the N dimension"}, + {ArgumentTypeID::kInteger, {"cta_k", "threadblock-shape::k"}, "Threadblock shape in the K dimension"}, + {ArgumentTypeID::kInteger, {"stages", "threadblock-stages"}, "Number of stages of threadblock-scoped matrix multiply"}, + {ArgumentTypeID::kInteger, {"warps_m", "warp-count::m"}, "Number of warps within threadblock along the M dimension"}, + {ArgumentTypeID::kInteger, {"warps_n", "warp-count::n"}, "Number of warps within threadblock along the N dimension"}, + {ArgumentTypeID::kInteger, {"warps_k", "warp-count::k"}, "Number of warps within threadblock along the K dimension"}, + {ArgumentTypeID::kInteger, {"inst_m", "instruction-shape::m"}, "Math instruction shape in the M dimension"}, + {ArgumentTypeID::kInteger, {"inst_n", "instruction-shape::n"}, "Math instruction 
shape in the N dimension"}, + {ArgumentTypeID::kInteger, {"inst_k", "instruction-shape::k"}, "Math instruction shape in the K dimension"}, + {ArgumentTypeID::kInteger, {"min_cc", "minimum-compute-capability"}, "Minimum device compute capability"}, + {ArgumentTypeID::kInteger, {"max_cc", "maximum-compute-capability"}, "Maximum device compute capability"} }; arguments_.insert(arguments_.end(), tile_description_arguments.begin(), tile_description_arguments.end()); + + for (auto provider : verification_providers) { + if (std::find( + options.verification.providers.begin(), + options.verification.providers.end(), + provider) != options.verification.providers.end()) { + + verification_providers_.push_back(provider); + } + } } /// Destructor @@ -248,8 +260,9 @@ int OperationProfiler::profile_all( auto min_cc = operation->description().tile_description.minimum_compute_capability; auto max_cc = operation->description().tile_description.maximum_compute_capability; - // Execute compatible operations if they satisfy the current device's compute capability + // Execute compatible cutlass operations if they satisfy the current device's compute capability if (operation->description().kind == kind_ && + operation->description().provider == library::Provider::kCUTLASS && options.device.compute_capability() >= min_cc && options.device.compute_capability() <= max_cc) { @@ -259,7 +272,7 @@ int OperationProfiler::profile_all( if (!filtered_by_name) { for (auto const & op_name : options.operation_names) { - if (operation_name.find(op_name) !=std::string::npos) { + if (find_string_matches_(op_name, operation_name)) { filtered_by_name = true; break; } @@ -278,7 +291,7 @@ int OperationProfiler::profile_all( operation, problem_space, problem); - + if (status == Status::kErrorInternal) { // Stop profiling if there was an internal error return false; @@ -548,29 +561,28 @@ void OperationProfiler::initialize_result_( library::OperationDescription const &operation_desc, ProblemSpace const &problem_space) { - set_argument_(result, "op_class", problem_space, + set_argument(result, "op_class", problem_space, library::to_string(operation_desc.tile_description.math_instruction.opcode_class)); - set_argument_(result, "accum", problem_space, + set_argument(result, "accum", problem_space, library::to_string(operation_desc.tile_description.math_instruction.element_accumulator)); - set_argument_(result, "cta_m", problem_space, operation_desc.tile_description.threadblock_shape.m()); - set_argument_(result, "cta_n", problem_space, operation_desc.tile_description.threadblock_shape.n()); - set_argument_(result, "cta_k", problem_space, operation_desc.tile_description.threadblock_shape.k()); - set_argument_(result, "stages", problem_space, operation_desc.tile_description.threadblock_stages); - set_argument_(result, "warps_m", problem_space, operation_desc.tile_description.warp_count.m()); - set_argument_(result, "warps_n", problem_space, operation_desc.tile_description.warp_count.n()); - set_argument_(result, "warps_k", problem_space, operation_desc.tile_description.warp_count.k()); - set_argument_(result, "inst_m", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.m()); - set_argument_(result, "inst_n", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.n()); - set_argument_(result, "inst_k", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.k()); - set_argument_(result, "min_cc", problem_space, 
operation_desc.tile_description.minimum_compute_capability); - set_argument_(result, "max_cc", problem_space, operation_desc.tile_description.maximum_compute_capability); + set_argument(result, "cta_m", problem_space, operation_desc.tile_description.threadblock_shape.m()); + set_argument(result, "cta_n", problem_space, operation_desc.tile_description.threadblock_shape.n()); + set_argument(result, "cta_k", problem_space, operation_desc.tile_description.threadblock_shape.k()); + set_argument(result, "stages", problem_space, operation_desc.tile_description.threadblock_stages); + set_argument(result, "warps_m", problem_space, operation_desc.tile_description.warp_count.m()); + set_argument(result, "warps_n", problem_space, operation_desc.tile_description.warp_count.n()); + set_argument(result, "warps_k", problem_space, operation_desc.tile_description.warp_count.k()); + set_argument(result, "inst_m", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.m()); + set_argument(result, "inst_n", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.n()); + set_argument(result, "inst_k", problem_space, operation_desc.tile_description.math_instruction.instruction_shape.k()); + set_argument(result, "min_cc", problem_space, operation_desc.tile_description.minimum_compute_capability); + set_argument(result, "max_cc", problem_space, operation_desc.tile_description.maximum_compute_capability); } - /// Helper -void OperationProfiler::set_argument_( +void OperationProfiler::set_argument( PerformanceResult &result, char const *name, ProblemSpace const &problem_space, @@ -579,7 +591,7 @@ void OperationProfiler::set_argument_( result.arguments.at(problem_space.argument_index(name)) = make_pair(std::string(name), value); } -void OperationProfiler::set_argument_( +void OperationProfiler::set_argument( PerformanceResult &result, char const *name, ProblemSpace const &problem_space, @@ -588,6 +600,39 @@ void OperationProfiler::set_argument_( result.arguments.at(problem_space.argument_index(name)) = make_pair(std::string(name), library::lexical_cast(value)); } + +/// finds string matches filter_string in operation_name +bool OperationProfiler::find_string_matches_( + std::string const &filter_string, + std::string const &operation_name) { + // Returns true if all substrings appear in the operation_name in order + + // Split filter_string of the format "gemm*f32*nt" to tokens ["gemm", "f32", "nt"] + std::string item; + std::istringstream iss(filter_string); + std::vector filter_tokens; + while (std::getline(iss, item, '*')) { + filter_tokens.push_back(item); + } + + // Search filter_tokens in operation_name in order + size_t start = 0, idx = 0; + for(auto & token : filter_tokens) { + // Check if characters left to be parsed in operation_name + if (start < operation_name.length()) { + // Find token in operation_name[start:] + idx = operation_name.substr(start).find(token); + if (idx == std::string::npos) { + return false; + } + } + start += (idx + token.length()); + } + + // All tokens in filter_string found in operation_name + return true; +} + /////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace profiler diff --git a/tools/profiler/src/operation_profiler.h b/tools/profiler/src/operation_profiler.h index ce06b1c9..c7e20f36 100644 --- a/tools/profiler/src/operation_profiler.h +++ b/tools/profiler/src/operation_profiler.h @@ -1,5 +1,5 @@ 
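find_string_matches_ above implements the --kernels filter: the filter string is split on '*' and each token must appear in the operation name, in order, so "s1688*nt" matches any kernel name containing "s1688" followed later by "nt". A self-contained sketch of this in-order token matching, written as a free function purely for illustration:

    #include <sstream>
    #include <string>

    // Returns true if every '*'-separated token of filter occurs in name, in order.
    bool matches_in_order(std::string const &filter, std::string const &name) {
      std::istringstream iss(filter);
      std::string token;
      size_t start = 0;
      while (std::getline(iss, token, '*')) {
        size_t idx = name.find(token, start);
        if (idx == std::string::npos) {
          return false;
        }
        start = idx + token.length();
      }
      return true;
    }

    // matches_in_order("s1688*nt", "cutlass_tensorop_s1688gemm_256x128_nt") -> true
    // matches_in_order("s1688*tn", "cutlass_tensorop_s1688gemm_256x128_nt") -> false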
/*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -73,7 +73,7 @@ protected: ArgumentDescriptionVector arguments_; /// List of providers used to verify and compare each result - ProviderVector reference_providers_; + ProviderVector verification_providers_; /// Model performance result initailized by the operation profiler with workload statistics /// and reasonable default state. @@ -92,9 +92,10 @@ public: OperationProfiler(); OperationProfiler( + Options const &options, library::OperationKind kind, ArgumentDescriptionVector const &arguments = ArgumentDescriptionVector(), - ProviderVector const & reference_providers = ProviderVector()); + ProviderVector const & verification_providers = ProviderVector()); /// Destructor virtual ~OperationProfiler(); @@ -196,6 +197,20 @@ public: library::OperationDescription const &desc, library::Provider provider, library::Provider verification_provider = library::Provider::kInvalid); + + /// Helper to set a performance result member + static void set_argument( + PerformanceResult &result, + char const *name, + ProblemSpace const &problem_space, + std::string const &value); + + /// Helper to set a performance result member + static void set_argument( + PerformanceResult &result, + char const *name, + ProblemSpace const &problem_space, + int64_t value); protected: @@ -205,20 +220,6 @@ protected: library::OperationDescription const &operation_desc, ProblemSpace const &problem_space); - /// Helper to set a performance result member - static void set_argument_( - PerformanceResult &result, - char const *name, - ProblemSpace const &problem_space, - std::string const &value); - - /// Helper to set a performance result member - static void set_argument_( - PerformanceResult &result, - char const *name, - ProblemSpace const &problem_space, - int64_t value); - /// Method to profile an initialized CUTLASS operation virtual Status profile_cutlass_( double &runtime, @@ -227,6 +228,12 @@ protected: void const *arguments, void *host_workspace, void *device_workspace); + +private: + /// finds string matches filter_string in operation_name + bool find_string_matches_( + std::string const &filter_string, + std::string const &operation_name); }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index 946e536c..5f62a81e 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
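With the signature change above, each concrete profiler hands the base class the Options plus the providers it knows how to verify against, and the base constructor (shown earlier in operation_profiler.cu) keeps only those that are also enabled through --verification-providers. A hypothetical subclass illustrating the wiring; MyOperationProfiler and its single argument are invented for the example:

    // Hypothetical subclass, not part of the patch.
    class MyOperationProfiler : public OperationProfiler {
    public:
      MyOperationProfiler(Options const &options):
        OperationProfiler(
          options,
          library::OperationKind::kGemm,
          {
            {ArgumentTypeID::kInteger, {"m", "problem-size::m"}, "M dimension of the problem"}
          },
          // Providers this profiler can verify against; intersected with the
          // command-line selection by the base constructor.
          { library::Provider::kCUBLAS, library::Provider::kReferenceDevice }
        ) { }
    };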
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -76,7 +76,7 @@ Options::Device::Device(cutlass::CommandLine const &cmdline) { void Options::Device::print_usage(std::ostream &out) const { out << "Device:\n" - << " --device= " + << " --device= " << " CUDA Device ID\n\n"; int device_count = 0; @@ -106,7 +106,7 @@ void Options::Device::print_usage(std::ostream &out) const { } out - << " --compute-capability= " + << " --compute-capability= " << " Override the compute capability.\n\n"; } @@ -255,12 +255,6 @@ void Options::Initialization::get_distribution( continue; // next token } - // Casts as integer without scaling - if (it->first.compare("integer") == 0) { - dist.int_scale = 0; - continue; // next token - } - // initialize other members for (int m = 0; members[m].label; ++m) { if (it->first == members[m].label && !it->second.empty()) { @@ -276,19 +270,23 @@ void Options::Initialization::print_usage(std::ostream &out) const { out << "Initialization:\n" - << " --initialization= " + << " --initialization= " << " Enables initialization (default: true). If false, device memory is" << end_of_line - << "not initialized after allocation.\n\n" + << " not initialized after allocation.\n\n" - << " --initialization-provider= " - << " Selects 'device' or 'host' initialization.\n\n" + << " --initialization-provider= " + << " Selects initialization provider {host, device*}. (default: '*')\n\n" - << " --dist= " - << " Data distribution of input tensors\n\n" + << " --dist= " + << " Data distribution of input tensors {uniform*, gaussian, identity, sequential}" << end_of_line + << " --dist=uniform,min:,max:,scale:" << end_of_line + << " --dist=gaussian,mean:,stddev:,scale:" << end_of_line + << " --dist=sequential,start:,delta:,scale:" << end_of_line + << " --dist=identity\n\n" - << " --seed= " + << " --seed= " << " Random number generator seed. Used to enforce deterministic" << end_of_line - << "initialization.\n\n"; + << " initialization.\n\n"; } @@ -339,12 +337,12 @@ void Options::Library::print_usage(std::ostream &out) const { out << "Library:\n" - << " --library-algo-mode= " + << " --library-algo-mode= " << " Indicates algorithm mode used to call libraries such as cuBLAS and cuDNN.\n" - << " " + << " " << " mode={default*,matching,best}\n\n" - << " --library-algos= " + << " --library-algos= " << " If --algorithm-mode=best, permits specifying a selection of algorithms.\n\n"; } @@ -393,21 +391,25 @@ void Options::Profiling::print_usage(std::ostream &out) const { out << "Profiling:\n" - << " --profiling-iterations= " + << " --profiling-iterations= " << " Number of iterations to profile each kernel. If zero, kernels" << end_of_line - << "are launched up to the profiling duration.\n\n" + << " are launched up to the profiling duration.\n\n" - << " --warmup-iterations= " + << " --warmup-iterations= " << " Number of iterations to execute each kernel prior to profiling.\n\n" - << " --sleep-duration= " - << " Number of ms to sleep between profiling periods (ms)\n\n" + << " --sleep-duration= " + << " Number of ms to sleep between profiling periods (ms).\n\n" - << " --profiling-enabled= " + << " --profiling-enabled= " << " If true, profiling is actually conducted.\n\n" - << " --providers= " - << " List of providers to be profiled for performance\n\n"; + << " --providers= " + << " List of providers to be profiled for performance. 
(default: '*')" << end_of_line + << " Gemm providers {cutlass*" + << "}" << end_of_line + << "\n\n"; + } void Options::Profiling::print_options(std::ostream &out, int indent) const { @@ -477,6 +479,7 @@ Options::Verification::Verification(cutlass::CommandLine const &cmdline) { } else { providers.push_back(library::Provider::kCUBLAS); + providers.push_back(library::Provider::kReferenceDevice); } } @@ -484,22 +487,27 @@ void Options::Verification::print_usage(std::ostream &out) const { out << "Verification:\n" - << " --verification-enabled= " + << " --verification-enabled= " << " Whether to perform verification checks.\n\n" - << " --epsilon= " + << " --epsilon= " << " Error threshold. Setting to zero (default) requires" << end_of_line - << "bit-level equivalence.\n\n" + << " bit-level equivalence.\n\n" - << " --nonzero-floor= " + << " --nonzero-floor= " << " Results whose absolute value is less than this quantity" << end_of_line - << "are treated as zero for comparisons.\n\n" + << " are treated as zero for comparisons.\n\n" - << " --save-workspace={*never,incorrect,always}" - << " Specifies when to save the GEMM inputs and results to the filesystem.\n\n" + << " --save-workspace= " + << " Specifies when to save the GEMM inputs and results to the filesystem." << end_of_line + << " --save-workspace=never never save workspace (default)" << end_of_line + << " --save-workspace=incorrect save workspace for incorrect results" << end_of_line + << " --save-workspace=always always save workspace\n\n" - << " --verification-providers= " - << " List of providers used to verify result. (default: device)\n\n"; + << " --verification-providers= " + << " List of providers used to verify result. (default: '*')" << end_of_line + << " Gemm verification-providers {cublas*}" << end_of_line + << "\n\n"; } void Options::Verification::print_options(std::ostream &out, int indent) const { @@ -554,22 +562,22 @@ void Options::Report::print_usage(std::ostream &out) const { out << "Report:\n" - << " --append= " + << " --append= " << " If true, result is appended to possibly existing file. Otherwise, " << end_of_line - << "any existing file is overwritten.\n\n" + << " any existing file is overwritten.\n\n" - << " --output= " - << " Path to output file for machine readable results.\n\n" + << " --output= " + << " Path to output file for machine readable results. Operation kind and '.csv' is appended.\n\n" - << " --report-not-run= " + << " --report-not-run= " << " If true, reports the status of all kernels including those that" << end_of_line - << "do not satisfy the given arguments.\n\n" + << " do not satisfy the given arguments.\n\n" - << " --tags= " + << " --tags= " << " Inserts leading columns in output table and uniform values for each" << end_of_line - << "column. Useful for generating pivot tables.\n\n" + << " column. Useful for generating pivot tables.\n\n" - << " --verbose= " + << " --verbose= " << " Prints human-readable text to stdout. 
If false, nothing is written to stdout.\n\n"; } @@ -600,7 +608,7 @@ Options::About::About(cutlass::CommandLine const &cmdline) { void Options::About::print_usage(std::ostream &out) const { out << "About:\n" - << " --version "; + << " --version "; print_version(out); @@ -675,22 +683,29 @@ Options::Options(cutlass::CommandLine const &cmdline): void Options::print_usage(std::ostream &out) const { out - << "CUTLASS Performance Tool\n" + << "CUTLASS Profiler\n" << "usage:\n\n" << " cutlass_profiler [options]\n\n" << " --help\n\n" - << " --mode={profile*,single,dry,trace,enumerate} " - << " Regular profiling, single kernel mode only, or no profiling.\n\n" + << " --mode= " + << " Cutlass profiler execution mode." << end_of_line + << " --mode=profile regular verification and profiling (default)" << end_of_line + << " --mode=dry_run no kernels are launched or workspaces allocated" << end_of_line + << " --mode=enumerate lists all operation kind and operations" << end_of_line + << " --mode=trace executes a single device-side computation with" << end_of_line + << " no other kernel launches\n\n" - << " --device-info " + << " --device-info " << " Prints information on all GPUs present in the system\n\n" - << " --operation= " + << " --operation= " << " CUTLASS operation to profile.\n\n" - << " --kernels= " - << " List of substrings to filter operations by name.\n\n" + << " --kernels= " + << " Filter operations by kernel names. For example, call all kernels with" << end_of_line + << " (\"s1688\" and \"nt\") or (\"s844\" and \"tn\" and \"align8\") in their" << end_of_line + << " operation name using --kernels=\"s1688*nt, s884*tn*align8\"\n\n" ; // @@ -755,4 +770,3 @@ std::string Options::indent_str(int indent) { } // namespace profiler } // namespace cutlass - diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index 4f723fa5..f4b5f0a1 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_report.cpp b/tools/profiler/src/performance_report.cpp index 52a82099..0ab70449 100644 --- a/tools/profiler/src/performance_report.cpp +++ b/tools/profiler/src/performance_report.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -68,9 +68,11 @@ PerformanceReport::PerformanceReport( ): options_(options), argument_names_(argument_names), problem_index_(0), good_(true), op_kind_(op_kind) { - std::string file_name = options_.report.output_path.substr(0, options_.report.output_path.rfind(".")); - std::string file_extension = options_.report.output_path.substr(options_.report.output_path.rfind(".") + 1); - op_file_name_ = file_name + "." + to_string(op_kind_) + "." 
+ file_extension; + // Strip '.csv' if present + std::string base_path = options_.report.output_path.substr( + 0, options_.report.output_path.rfind(".csv")); + + op_file_name_ = base_path + "." + to_string(op_kind_) + ".csv"; // // Open output file for operation of PerformanceReport::op_kind @@ -166,6 +168,7 @@ void PerformanceReport::close() { static const char *disposition_status_color(Disposition disposition) { switch (disposition) { case Disposition::kPassed: return SHELL_COLOR_GREEN(); + case Disposition::kIncorrect: return SHELL_COLOR_RED(); case Disposition::kFailed: return SHELL_COLOR_RED(); default: break; @@ -195,16 +198,17 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n" - << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" - << " Operation: " << result.operation_name << "\n\n" - << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" - << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? "ON":"OFF") << SHELL_COLOR_END() << "\n" - << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; + << " Provider: " << SHELL_COLOR_BRIGHT() << library::to_string(result.provider, true) << SHELL_COLOR_END() << "\n" + << " OperationKind: " << SHELL_COLOR_BRIGHT() << library::to_string(result.op_kind) << SHELL_COLOR_END() << "\n" + << " Operation: " << result.operation_name << "\n\n" + << " Status: " << SHELL_COLOR_BRIGHT() << library::to_string(result.status, true) << SHELL_COLOR_END() << "\n" + << " Verification: " << SHELL_COLOR_BRIGHT() << (options_.verification.enabled ? "ON":"OFF") << SHELL_COLOR_END() << "\n" + << " Disposition: " << disposition_status_color(result.disposition) << to_string(result.disposition, true) << SHELL_COLOR_END() << "\n\n"; // Display individual verification results for each verification-provider if (options_.verification.enabled) { - static int const indent_spaces = 22; + static int const indent_spaces = 16; for(auto & m : result.verification_map) { out << std::right << std::setw(indent_spaces) << library::to_string(m.first, true) << ": " << to_string(m.second, true) << "\n"; @@ -212,15 +216,15 @@ std::ostream & PerformanceReport::print_result_pretty_( } out - << "\n Arguments: "; + << "\n Arguments:"; int column_idx = 0; for (auto const &arg : result.arguments) { if (!arg.second.empty()) { out << " --" << arg.first << "=" << arg.second; column_idx += int(4 + arg.first.size() + arg.second.size()); - if (column_idx > 90) { - out << " \\\n "; + if (column_idx > 98) { + out << " \\\n "; column_idx = 0; } } @@ -228,15 +232,15 @@ std::ostream & PerformanceReport::print_result_pretty_( out << "\n\n"; out - << " Bytes: " << result.bytes << " bytes\n" - << " FLOPs: " << result.flops << " flops\n\n"; + << " Bytes: " << result.bytes << " bytes\n" + << " FLOPs: " << result.flops << " flops\n\n"; if (result.good()) { out - << " Runtime: " << result.runtime << " ms\n" - << " Memory: " << result.gbytes_per_sec() << " GiB/s\n" - << "\n Math: " << result.gflops_per_sec() << " GFLOP/s\n"; + << " Runtime: " << result.runtime << " ms\n" + << " Memory: " << result.gbytes_per_sec() << " GiB/s\n" + << "\n Math: " << result.gflops_per_sec() << " GFLOP/s\n"; } @@ -256,7 +260,7 @@ std::ostream & PerformanceReport::print_csv_header_( out << (column_idx ? 
"," : "") << "Problem,Provider" - << ",Operation,Disposition,Status"; + << ",OperationKind,Operation,Disposition,Status"; for (auto const &arg_name : argument_names_) { out << "," << arg_name; @@ -289,6 +293,7 @@ std::ostream & PerformanceReport::print_result_csv_( << (column_idx ? "," : "") << result.problem_index << "," << to_string(result.provider, true) + << "," << to_string(result.op_kind) << "," << result.operation_name << "," << to_string(result.disposition) << "," << library::to_string(result.status); diff --git a/tools/profiler/src/performance_report.h b/tools/profiler/src/performance_report.h index 573a049e..1c086e61 100644 --- a/tools/profiler/src/performance_report.h +++ b/tools/profiler/src/performance_report.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/profiler/src/performance_result.cu b/tools/profiler/src/performance_result.cu new file mode 100644 index 00000000..86cabfb7 --- /dev/null +++ b/tools/profiler/src/performance_result.cu @@ -0,0 +1,55 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief +*/ + +#pragma once + +#include + +#include "cutlass/cutlass.h" + +// CUTLASS Profiler includes +#include "enumerated_types.h" +#include "performance_result.h" + +// CUTLASS Library includes +#include "cutlass/library/library.h" +#include "cutlass/library/util.h" + +namespace cutlass { +namespace profiler { + +///////////////////////////////////////////////////////////////////////////////////////////////// + + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace profiler +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// + diff --git a/tools/profiler/src/performance_result.h b/tools/profiler/src/performance_result.h index 23eb60f2..9e3ebeb5 100644 --- a/tools/profiler/src/performance_result.h +++ b/tools/profiler/src/performance_result.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -65,7 +65,7 @@ struct PerformanceResult { /// Outcome of verification (all verification results) DispositionMap verification_map; - /// Operation object + /// Operation name std::string operation_name; /// Stringified vector of argument values @@ -119,3 +119,4 @@ using PerformanceResultVector = std::vector; } // namespace profiler } // namespace cutlass + diff --git a/tools/profiler/src/problem_space.cpp b/tools/profiler/src/problem_space.cpp index e95b9e1b..adede0ea 100644 --- a/tools/profiler/src/problem_space.cpp +++ b/tools/profiler/src/problem_space.cpp @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -849,6 +849,47 @@ bool arg_as_OpcodeClassID( return arg_as_OpcodeClassID(opcode_class, value_ptr); } + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + KernelArgument::Value const *value_ptr) { + + if (value_ptr->not_null) { + if (value_ptr->argument->description->type == ArgumentTypeID::kEnumerated) { + + split_k_mode = library::from_string( + static_cast(value_ptr)->element); + + if (split_k_mode == library::SplitKMode::kInvalid) { + throw std::runtime_error( + "arg_as_SplitKModeID() - illegal cast."); + } + } + else { + + throw std::runtime_error( + "arg_as_SplitKModeID() - illegal cast."); + } + return true; + } + return false; +} + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. 
+bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem) { + + size_t idx = problem_space.argument_index(name); + KernelArgument::Value const *value_ptr = problem.at(idx).get(); + + return arg_as_SplitKModeID(split_k_mode, value_ptr); +} + + ///////////////////////////////////////////////////////////////////////////////////////////////// /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( @@ -939,7 +980,6 @@ bool tensor_description_satisfies( } ///////////////////////////////////////////////////////////////////////////////////////////////// - } // namespace profiler } // namespace cutlass diff --git a/tools/profiler/src/problem_space.h b/tools/profiler/src/problem_space.h index 8dfd216c..77a79ca2 100644 --- a/tools/profiler/src/problem_space.h +++ b/tools/profiler/src/problem_space.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -811,6 +811,17 @@ bool arg_as_OpcodeClassID( ProblemSpace const &problem_space, ProblemSpace::Problem const &problem); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_SplitKModeID(library::SplitKMode &split_k_mode, KernelArgument::Value const *value_ptr); + +/// Lexically casts an argument to an int64 if it is defined. Returns true if not null. +bool arg_as_SplitKModeID( + library::SplitKMode &split_k_mode, + char const *name, + ProblemSpace const &problem_space, + ProblemSpace::Problem const &problem); + /// Lexically casts an argument to a given type stored in a byte array. Returns true if not null. bool arg_as_scalar( std::vector &bytes, diff --git a/tools/util/CMakeLists.txt b/tools/util/CMakeLists.txt index 51be4b54..0d2f86fb 100644 --- a/tools/util/CMakeLists.txt +++ b/tools/util/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without modification, are permitted # provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/command_line.h b/tools/util/include/cutlass/util/command_line.h index 31fa7f34..c158ef97 100644 --- a/tools/util/include/cutlass/util/command_line.h +++ b/tools/util/include/cutlass/util/command_line.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/debug.h b/tools/util/include/cutlass/util/debug.h index 065a94e4..3ebbd4d8 100644 --- a/tools/util/include/cutlass/util/debug.h +++ b/tools/util/include/cutlass/util/debug.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
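The new arg_as_SplitKModeID overloads follow the same pattern as the other arg_as_* helpers: they return true only when the argument was supplied, leaving the output untouched otherwise. A small usage sketch; it assumes the profiler has registered a "split_k_mode" enumerated argument and that kSerial is a reasonable default:

    // Illustrative use inside a profiler's parse step.
    library::SplitKMode split_k_mode = library::SplitKMode::kSerial;  // assumed default

    if (!arg_as_SplitKModeID(split_k_mode, "split_k_mode", problem_space, problem)) {
      // argument not given on the command line; keep the default
    }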
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_dump.h b/tools/util/include/cutlass/util/device_dump.h index 2dd67c89..dac6029c 100644 --- a/tools/util/include/cutlass/util/device_dump.h +++ b/tools/util/include/cutlass/util/device_dump.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/device_memory.h b/tools/util/include/cutlass/util/device_memory.h index 52229425..79b12368 100644 --- a/tools/util/include/cutlass/util/device_memory.h +++ b/tools/util/include/cutlass/util/device_memory.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/distribution.h b/tools/util/include/cutlass/util/distribution.h index d9b61ca5..03377377 100644 --- a/tools/util/include/cutlass/util/distribution.h +++ b/tools/util/include/cutlass/util/distribution.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/exceptions.h b/tools/util/include/cutlass/util/exceptions.h index ab5623bf..b6cf2fcd 100644 --- a/tools/util/include/cutlass/util/exceptions.h +++ b/tools/util/include/cutlass/util/exceptions.h @@ -1,5 +1,5 @@ /****************************************************************************** - * Copyright (c) 2011-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are not permitted. diff --git a/tools/util/include/cutlass/util/host_reorder.h b/tools/util/include/cutlass/util/host_reorder.h index bb9ed621..d46d4594 100644 --- a/tools/util/include/cutlass/util/host_reorder.h +++ b/tools/util/include/cutlass/util/host_reorder.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/host_tensor.h b/tools/util/include/cutlass/util/host_tensor.h index b43186a0..c734a5f5 100644 --- a/tools/util/include/cutlass/util/host_tensor.h +++ b/tools/util/include/cutlass/util/host_tensor.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/host_tensor_planar_complex.h b/tools/util/include/cutlass/util/host_tensor_planar_complex.h index a5e990cf..ed85cf22 100644 --- a/tools/util/include/cutlass/util/host_tensor_planar_complex.h +++ b/tools/util/include/cutlass/util/host_tensor_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/detail/inner_product.h b/tools/util/include/cutlass/util/reference/detail/inner_product.h index 77a3076e..f75f8b88 100644 --- a/tools/util/include/cutlass/util/reference/detail/inner_product.h +++ b/tools/util/include/cutlass/util/reference/detail/inner_product.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm.h b/tools/util/include/cutlass/util/reference/device/gemm.h index 9dc66cca..5aef19ff 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h index 10ce474e..b3003409 100644 --- a/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/device/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h index 6e389102..4c8e361e 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h index cf47c9a4..4d9de515 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_elementwise.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h index b7c2f073..64cb37be 100644 --- a/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/kernel/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_compare.h b/tools/util/include/cutlass/util/reference/device/tensor_compare.h index dca50c2f..3323bed5 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_compare.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_compare.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_fill.h b/tools/util/include/cutlass/util/reference/device/tensor_fill.h index 34ba2475..962ded09 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_fill.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_fill.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h index aa6610e1..d03080b2 100644 --- a/tools/util/include/cutlass/util/reference/device/tensor_foreach.h +++ b/tools/util/include/cutlass/util/reference/device/tensor_foreach.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/device/tensor_relu.h b/tools/util/include/cutlass/util/reference/device/tensor_relu.h new file mode 100644 index 00000000..d78e1953 --- /dev/null +++ b/tools/util/include/cutlass/util/reference/device/tensor_relu.h @@ -0,0 +1,135 @@ +/*************************************************************************************************** + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted + * provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright notice, this list of + * conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of + * conditions and the following disclaimer in the documentation and/or other materials + * provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used + * to endorse or promote products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND + * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + **************************************************************************************************/ +/* \file + \brief Defines device-side elementwise operations on TensorView. Note, the operations defined + in this header are not specialized for any particular data layout and are therefore not + intended to offer the best possible performance. Rather, they are intended to be generic + reference implementations to support the CUTLASS unit tests. +*/ + +#pragma once + +// Cutlass includes +#include "cutlass/cutlass.h" +#include "cutlass/tensor_view.h" + +#include "cutlass/util/reference/device/tensor_foreach.h" + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace reference { +namespace device { + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +struct TensorReLuFunc { + + /// View type + using TensorView = TensorView; + + /// Coordinate in tensor's index space + using TensorCoord = typename TensorView::TensorCoord; + + /// Parameters structure + struct Params { + + // + // Data members + // + + TensorView view; + Element threshold; + + + // + // Methods + // + + Params( + TensorView view_ = TensorView(), + Element threshold_ = Element(0) + ): + view(view_), threshold(threshold_) { + + } + }; + + // + // Data members + // + + Params params; + + // + // Methods + // + + CUTLASS_DEVICE + TensorReLuFunc(Params const ¶ms): params(params) { + + } + + CUTLASS_DEVICE + void operator()(TensorCoord const &coord) { + + Element const & value = params.view.at(coord); + params.view.at(coord) = (value < params.threshold) ? params.threshold : value; + } +}; + +} // namespace detail + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +/// Apply ReLu on a tensor +template < + typename Element, ///< Element type + typename Layout> ///< Layout function +void TensorReLu( + TensorView view, ///< destination tensor + Element threshold = Element(0)) { ///< ReLu threshold + + using Func = detail::TensorReLuFunc; + using Params = typename Func::Params; + + TensorForEach( + view.extent(), + Params(view, threshold) + ); +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace reference +} // namespace cutlass diff --git a/tools/util/include/cutlass/util/reference/device/thread/gemm.h b/tools/util/include/cutlass/util/reference/device/thread/gemm.h index fefc4131..11485a91 100644 --- a/tools/util/include/cutlass/util/reference/device/thread/gemm.h +++ b/tools/util/include/cutlass/util/reference/device/thread/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
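TensorReLu above applies max(value, threshold) element-wise over a device-side TensorView by dispatching TensorReLuFunc through TensorForEach. A hedged usage sketch built on the HostTensor utility; the extent and element type are arbitrary for the example:

    #include "cutlass/util/host_tensor.h"
    #include "cutlass/util/reference/device/tensor_relu.h"

    // Illustrative: clamp a device-resident tensor at zero.
    void relu_example() {
      cutlass::HostTensor<float, cutlass::layout::RowMajor> tensor({128, 256});

      // ... launch a kernel or copy data into tensor.device_view() ...

      cutlass::reference::device::TensorReLu(tensor.device_view(), 0.0f);

      tensor.sync_host();  // copy the result back for inspection on the host
    }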
* * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm.h b/tools/util/include/cutlass/util/reference/host/gemm.h index 13dbd5cf..3e38886d 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm.h +++ b/tools/util/include/cutlass/util/reference/host/gemm.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: @@ -37,11 +37,41 @@ #include "cutlass/tensor_view.h" #include "cutlass/gemm/gemm.h" #include "cutlass/arch/mma.h" +#include "cutlass/util/host_tensor.h" namespace cutlass { namespace reference { namespace host { +template +struct CastIfScalar { + static Out cast(In in) { + return Out(in); + } +}; + +template +struct CastIfScalar, In> { + typedef cutlass::complex Out; + static Out cast(In in) { + return Out(static_cast(in)); + } +}; + +template +struct CastIfScalar, cutlass::complex> { + typedef cutlass::complex Out; + typedef cutlass::complex In; + static Out cast(In in) { + return Out(in); + } +}; + +template +Out cast_if_scalar(In in) { + return CastIfScalar::cast(in); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Computes a general matrix product among matrices (tensors of rank=2) pointed to by TensorRef @@ -107,7 +137,10 @@ void compute_gemm( ElementA a = tensor_a.at(MatrixCoord(row, k_block)); ElementB b = tensor_b.at(MatrixCoord(k_block, col)); - accum[i][j] = inner_product_op(ComputeType(a), ComputeType(b), accum[i][j]); + ComputeType compute_a(cast_if_scalar(a)); + ComputeType compute_b(cast_if_scalar(b)); + + accum[i][j] = inner_product_op(compute_a, compute_b, accum[i][j]); } } } diff --git a/tools/util/include/cutlass/util/reference/host/gemm_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_complex.h index 0f067691..27f36820 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, are permitted * provided that the following conditions are met: diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h index 4d02747d..2a23fd27 100644 --- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h +++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h @@ -1,5 +1,5 @@ /*************************************************************************************************** - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. 
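CastIfScalar lets the host reference GEMM promote real-valued A/B elements into a complex ComputeType while leaving the real-to-real and complex-to-complex paths as plain constructions; only the scalar-to-complex specialization wraps the value as the real part. A couple of illustrative calls (values are arbitrary):

    #include "cutlass/complex.h"
    #include "cutlass/util/reference/host/gemm.h"

    void cast_if_scalar_examples() {
      // real -> real: behaves like an ordinary cast
      float x = cutlass::reference::host::cast_if_scalar<float>(1.25);

      // real -> complex: the scalar becomes the real part, the imaginary part is zero
      cutlass::complex<float> y =
          cutlass::reference::host::cast_if_scalar<cutlass::complex<float>>(1.25f);

      (void)x; (void)y;
    }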
diff --git a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
index 4d02747d..2a23fd27 100644
--- a/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
+++ b/tools/util/include/cutlass/util/reference/host/gemm_planar_complex.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_compare.h b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
index bf05a099..2d7545e9 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_compare.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_compare.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_copy.h b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
index 737119e8..a81f0211 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_copy.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_copy.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
index 73eb328d..88bbb39f 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_elementwise.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_fill.h b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
index b298e4c2..87c14d61 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_fill.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_fill.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
index 23ee9f93..feb439d7 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_foreach.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/reference/host/tensor_norm.h b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
index 6c73d91f..1d494b9f 100644
--- a/tools/util/include/cutlass/util/reference/host/tensor_norm.h
+++ b/tools/util/include/cutlass/util/reference/host/tensor_norm.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/tensor_view_io.h b/tools/util/include/cutlass/util/tensor_view_io.h
index 590462f7..0043d745 100644
--- a/tools/util/include/cutlass/util/tensor_view_io.h
+++ b/tools/util/include/cutlass/util/tensor_view_io.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
-* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+* Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
diff --git a/tools/util/include/cutlass/util/type_traits.h b/tools/util/include/cutlass/util/type_traits.h
index 059a23ab..d97af0a4 100644
--- a/tools/util/include/cutlass/util/type_traits.h
+++ b/tools/util/include/cutlass/util/type_traits.h
@@ -1,5 +1,5 @@
 /***************************************************************************************************
- * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met: