From f115c3f85467d5d9619119d1dbeb9c03c3d73864 Mon Sep 17 00:00:00 2001 From: Kihiro Bando <39278362+bandokihiro@users.noreply.github.com> Date: Tue, 13 May 2025 15:55:29 -0400 Subject: [PATCH] Release v4.0.0 (#2294) --- CHANGELOG.md | 488 +- CMakeLists.txt | 23 +- CONTRIBUTORS.md | 51 +- EULA.txt | 188 + LICENSE.txt | 7 + README.md | 184 +- customConfigs.cmake | 12 +- .../56_hopper_ptr_array_batched_gemm.cu | 2 +- .../57_hopper_grouped_gemm.cu | 73 +- examples/58_ada_fp8_gemm/ada_fp8_gemm.cu | 4 - .../65_distributed_gemm.cu | 14 +- ...specialized_gemm_with_blockwise_scaling.cu | 8 - ...specialized_gemm_with_groupwise_scaling.cu | 8 - .../75_blackwell_grouped_gemm.cu | 16 +- .../75_blackwell_grouped_gemm_block_scaled.cu | 17 +- ..._fmha_fwd_mainloop_tma_warpspecialized.hpp | 4 +- ...m100_fmha_gen_mainloop_warpspecialized.hpp | 4 +- ...9d_blackwell_geforce_nvfp4_grouped_gemm.cu | 2 +- .../82_blackwell_distributed_gemm.cu | 14 +- examples/cute/tutorial/hopper/wgmma_sm90.cu | 8 +- .../cute/tutorial/hopper/wgmma_tma_sm90.cu | 4 +- .../python/CuTeDSL/ampere/elementwise_add.py | 392 + .../CuTeDSL/ampere/elementwise_apply.py | 395 + .../CuTeDSL/ampere/flash_attention_v2.py | 1353 ++++ examples/python/CuTeDSL/ampere/sgemm.py | 780 ++ .../python/CuTeDSL/ampere/tensorop_gemm.py | 968 +++ .../python/CuTeDSL/blackwell/dense_gemm.py | 1922 +++++ .../blackwell/dense_gemm_persistent.py | 2144 ++++++ examples/python/CuTeDSL/blackwell/fmha.py | 2984 ++++++++ .../python/CuTeDSL/blackwell/grouped_gemm.py | 2287 ++++++ examples/python/CuTeDSL/notebooks/README.md | 31 + .../CuTeDSL/notebooks/cuda_graphs.ipynb | 648 ++ .../notebooks/cute_layout_algebra.ipynb | 1001 +++ .../python/CuTeDSL/notebooks/data_types.ipynb | 310 + .../CuTeDSL/notebooks/elementwise_add.ipynb | 838 +++ .../CuTeDSL/notebooks/hello_world.ipynb | 173 + .../notebooks/images/cuda_graphs_image.png | Bin 0 -> 8586 bytes examples/python/CuTeDSL/notebooks/print.ipynb | 425 ++ .../python/CuTeDSL/notebooks/tensor.ipynb | 390 
+ .../python/CuTeDSL/notebooks/tensorssa.ipynb | 558 ++ .../{ => deprecated}/00_basic_gemm.ipynb | 0 .../python/{ => deprecated}/01_epilogue.ipynb | 0 .../02_pytorch_extension_grouped_gemm.ipynb | 0 .../{ => deprecated}/03_basic_conv2d.ipynb | 0 .../04_epilogue_visitor.ipynb | 0 examples/python/{ => deprecated}/README.md | 0 include/cute/arch/config.hpp | 1 + include/cute/arch/mma_sm100_umma.hpp | 32 + include/cute/atom/copy_traits_sm100_tma.hpp | 4 +- include/cute/atom/copy_traits_sm90_tma.hpp | 8 +- include/cute/tensor_impl.hpp | 6 +- include/cutlass/arch/config.h | 8 +- .../collective/builders/sm120_builder.inl | 40 + .../collective/builders/sm90_builder.inl | 49 +- .../collective/builders/sm90_common.inl | 26 +- .../epilogue/collective/default_epilogue.hpp | 4 +- ...100_epilogue_array_tma_warpspecialized.hpp | 20 +- .../collective/sm100_epilogue_nosmem.hpp | 4 +- .../sm100_epilogue_tma_warpspecialized.hpp | 8 +- ...m90_epilogue_array_tma_warpspecialized.hpp | 18 +- .../sm90_epilogue_tma_warpspecialized.hpp | 4 +- include/cutlass/epilogue/thread/activation.h | 34 +- include/cutlass/functional.h | 2 +- .../builders/sm100_blockwise_umma_builder.inl | 16 +- .../collective/builders/sm90_gmma_builder.inl | 17 +- ..._blockscaled_mma_array_warpspecialized.hpp | 7 +- ...rray_warpspecialized_blockwise_scaling.hpp | 14 +- ..._mma_warpspecialized_blockwise_scaling.hpp | 14 +- .../sm100_mma_warpspecialized_mixed_input.hpp | 824 --- .../sm120_blockscaled_mma_array_tma.hpp | 21 +- ...ma_gmma_rs_warpspecialized_mixed_input.hpp | 4 + ..._mma_array_tma_gmma_ss_warpspecialized.hpp | 7 +- ..._array_tma_gmma_ss_warpspecialized_fp8.hpp | 4 + ..._warpspecialized_fp8_blockwise_scaling.hpp | 277 +- ..._warpspecialized_fp8_blockwise_scaling.hpp | 17 +- include/cutlass/gemm/dispatch_policy.hpp | 2 + .../sm100_gemm_array_tma_warpspecialized.hpp | 8 +- ...ay_tma_warpspecialized_input_transform.hpp | 7 +- ...rray_tma_warpspecialized_mma_transform.hpp | 3 + 
.../kernel/sm100_gemm_tma_warpspecialized.hpp | 6 +- ...mm_tma_warpspecialized_input_transform.hpp | 7 +- ...gemm_tma_warpspecialized_mma_transform.hpp | 6 +- .../sm100_sparse_gemm_tma_warpspecialized.hpp | 5 +- ...specialized_cooperative_asymmetric_dma.hpp | 7 +- ..._array_tma_warpspecialized_cooperative.hpp | 34 +- ...emm_array_tma_warpspecialized_pingpong.hpp | 19 +- ...0_gemm_tma_warpspecialized_cooperative.hpp | 2 + include/cutlass/pipeline/sm100_pipeline.hpp | 6 +- include/cutlass/version.h | 6 +- media/docs/cpp/blackwell.rst | 10 + .../cpp/blackwell_cluster_launch_control.md | 8 +- media/docs/cpp/blackwell_functionality.md | 2 +- .../building_in_windows_with_visual_studio.md | 12 +- .../building_with_clang_as_host_compiler.md | 8 +- media/docs/cpp/build/index.rst | 10 + media/docs/cpp/code_organization.md | 2 +- media/docs/cpp/cute/02_layout_algebra.md | 44 +- media/docs/cpp/cute/03_tensor.md | 8 +- media/docs/cpp/cute/0t_mma_atom.md | 36 +- media/docs/cpp/cute/0x_gemm_tutorial.md | 12 +- media/docs/cpp/cute/0z_tma_tensors.md | 20 +- media/docs/cpp/cute/index.rst | 2 +- media/docs/cpp/cutlass_2x.rst | 12 + media/docs/cpp/cutlass_3x.rst | 11 + .../cpp/cutlass_3x_backwards_compatibility.md | 2 +- media/docs/cpp/cutlass_3x_design.md | 2 +- media/docs/cpp/functionality.md | 2 +- media/docs/cpp/fundamental_types.md | 2 +- media/docs/cpp/gemm_api.md | 18 +- media/docs/cpp/gemm_api_3x.md | 8 +- media/docs/cpp/getting_started.rst | 16 + media/docs/cpp/grouped_scheduler.md | 2 +- media/docs/cpp/ide_setup.md | 2 +- media/docs/cpp/layout.md | 6 +- media/docs/cpp/overview.md | 619 -- media/docs/cpp/profiler.md | 8 +- media/docs/cpp/programming_guidelines.md | 2 +- media/docs/cpp/quickstart.md | 6 +- media/docs/cpp/terminology.md | 5 +- media/docs/cpp/tile_iterator_concept.md | 2 +- media/docs/cpp/utilities.md | 2 +- media/docs/pythonDSL/cute_dsl.rst | 18 + media/docs/pythonDSL/cute_dsl_api.rst | 12 + media/docs/pythonDSL/cute_dsl_api/cute.rst | 11 + 
.../docs/pythonDSL/cute_dsl_api/cute_arch.rst | 24 + .../pythonDSL/cute_dsl_api/cute_nvgpu.rst | 18 + .../cute_dsl_api/cute_nvgpu_common.rst | 9 + .../cute_dsl_api/cute_nvgpu_cpasync.rst | 10 + .../cute_dsl_api/cute_nvgpu_tcgen05.rst | 10 + .../cute_dsl_api/cute_nvgpu_warp.rst | 10 + .../cute_dsl_api/cute_nvgpu_warpgroup.rst | 10 + media/docs/pythonDSL/cute_dsl_api/utils.rst | 9 + .../cute_dsl_general/autotuning_gemm.rst | 154 + .../pythonDSL/cute_dsl_general/debugging.rst | 133 + .../cute_dsl_general/dsl_code_generation.rst | 90 + .../cute_dsl_general/dsl_control_flow.rst | 140 + .../cute_dsl_general/dsl_dynamic_layout.rst | 198 + .../cute_dsl_general/dsl_introduction.rst | 128 + .../dsl_jit_arg_generation.rst | 196 + .../cute_dsl_general/dsl_jit_caching.rst | 152 + .../pythonDSL/cute_dsl_general/dsl_modes.png | Bin 0 -> 1134058 bytes .../framework_integration.rst | 412 ++ .../pythonDSL/cute_dsl_general/notebooks.rst | 16 + media/docs/pythonDSL/faqs.rst | 137 + media/docs/pythonDSL/functionality.rst | 34 + media/docs/pythonDSL/limitations.rst | 279 + media/docs/pythonDSL/overview.rst | 108 + media/docs/pythonDSL/quick_start.rst | 31 + python/CuTeDSL/EULA.txt | 188 + python/CuTeDSL/base_dsl/__init__.py | 17 + .../base_dsl/_mlir_helpers/__init__.py | 27 + .../CuTeDSL/base_dsl/_mlir_helpers/arith.py | 691 ++ python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py | 64 + .../base_dsl/_mlir_helpers/lru_cache_ir.py | 76 + python/CuTeDSL/base_dsl/_mlir_helpers/op.py | 34 + python/CuTeDSL/base_dsl/ast_helpers.py | 584 ++ python/CuTeDSL/base_dsl/ast_preprocessor.py | 1459 ++++ python/CuTeDSL/base_dsl/cache_helpers.py | 154 + python/CuTeDSL/base_dsl/common.py | 268 + python/CuTeDSL/base_dsl/compiler.py | 221 + python/CuTeDSL/base_dsl/dsl.py | 1637 +++++ python/CuTeDSL/base_dsl/env_manager.py | 303 + python/CuTeDSL/base_dsl/jit_executor.py | 301 + python/CuTeDSL/base_dsl/runtime/__init__.py | 29 + python/CuTeDSL/base_dsl/runtime/cuda.py | 470 ++ 
.../CuTeDSL/base_dsl/runtime/device_tensor.py | 121 + .../CuTeDSL/base_dsl/runtime/dlpack_types.py | 76 + .../base_dsl/runtime/jit_arg_adapters.py | 188 + .../base_dsl/runtime/tensor_descriptor.py | 201 + python/CuTeDSL/base_dsl/typing.py | 1897 +++++ python/CuTeDSL/base_dsl/utils/__init__.py | 19 + python/CuTeDSL/base_dsl/utils/logger.py | 80 + python/CuTeDSL/base_dsl/utils/stacktrace.py | 165 + python/CuTeDSL/base_dsl/utils/timer.py | 56 + python/CuTeDSL/cutlass/__init__.py | 57 + python/CuTeDSL/cutlass/cute/__init__.py | 310 + python/CuTeDSL/cutlass/cute/arch/__init__.py | 98 + python/CuTeDSL/cutlass/cute/arch/elect.py | 75 + python/CuTeDSL/cutlass/cute/arch/mbar.py | 208 + .../cutlass/cute/arch/nvvm_wrappers.py | 547 ++ python/CuTeDSL/cutlass/cute/arch/smem.py | 96 + python/CuTeDSL/cutlass/cute/arch/tmem.py | 142 + python/CuTeDSL/cutlass/cute/core.py | 6417 +++++++++++++++++ python/CuTeDSL/cutlass/cute/math.py | 354 + python/CuTeDSL/cutlass/cute/nvgpu/__init__.py | 26 + python/CuTeDSL/cutlass/cute/nvgpu/common.py | 143 + .../cutlass/cute/nvgpu/cpasync/__init__.py | 38 + .../cutlass/cute/nvgpu/cpasync/copy.py | 366 + .../cutlass/cute/nvgpu/cpasync/helpers.py | 327 + python/CuTeDSL/cutlass/cute/nvgpu/helpers.py | 159 + .../cutlass/cute/nvgpu/tcgen05/__init__.py | 57 + .../cutlass/cute/nvgpu/tcgen05/copy.py | 465 ++ .../cutlass/cute/nvgpu/tcgen05/helpers.py | 301 + .../CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py | 603 ++ .../cutlass/cute/nvgpu/warp/__init__.py | 25 + .../CuTeDSL/cutlass/cute/nvgpu/warp/copy.py | 189 + python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py | 78 + .../cutlass/cute/nvgpu/warpgroup/__init__.py | 29 + .../cutlass/cute/nvgpu/warpgroup/helpers.py | 109 + .../cutlass/cute/nvgpu/warpgroup/mma.py | 380 + python/CuTeDSL/cutlass/cute/runtime.py | 515 ++ python/CuTeDSL/cutlass/cute/testing.py | 285 + python/CuTeDSL/cutlass/cute/typing.py | 193 + python/CuTeDSL/cutlass/impl_utils.py | 32 + python/CuTeDSL/cutlass/torch.py | 169 + 
python/CuTeDSL/cutlass/utils/README.md | 9 + python/CuTeDSL/cutlass/utils/__init__.py | 78 + .../CuTeDSL/cutlass/utils/ampere_helpers.py | 26 + .../cutlass/utils/blackwell_helpers.py | 910 +++ .../grouped_gemm_tile_scheduler_helper.py | 466 ++ python/CuTeDSL/cutlass/utils/hardware_info.py | 174 + .../CuTeDSL/cutlass/utils/hopper_helpers.py | 195 + python/CuTeDSL/cutlass/utils/layout.py | 68 + python/CuTeDSL/cutlass/utils/pipeline.py | 984 +++ .../CuTeDSL/cutlass/utils/smem_allocator.py | 217 + .../utils/static_persistent_tile_scheduler.py | 384 + .../cutlass/utils/tensormap_manager.py | 140 + python/CuTeDSL/cutlass_dsl/__init__.py | 37 + python/CuTeDSL/cutlass_dsl/cutlass.py | 1322 ++++ .../cutlass_dsl/cutlass_ast_decorators.py | 515 ++ python/CuTeDSL/requirements.txt | 3 + python/cutlass/__init__.py | 2 +- python/cutlass/op/conv.py | 1 + python/cutlass/utils/lazy_import.py | 32 +- python/cutlass/utils/profiler.py | 1 + python/cutlass_library/emit_kernel_listing.py | 122 +- python/cutlass_library/gemm_operation.py | 51 +- python/cutlass_library/generator.py | 181 +- python/cutlass_library/manifest.py | 6 +- python/cutlass_library/sm90_utils.py | 16 +- python/setup_library.py | 2 +- python/setup_pycute.py | 2 +- test/unit/gemm/device/CMakeLists.txt | 25 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu | 72 +- ..._bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu | 96 +- ...0_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu | 72 +- 
..._bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu | 72 +- ...0_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu | 36 +- ...mm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu | 36 +- ...p_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu | 36 +- ...0_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu | 36 +- ..._bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu | 36 +- ..._gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu | 36 +- ..._gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu | 36 +- ...0_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu | 48 +- ...0_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu | 48 +- ...p_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu | 48 +- ...p_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu | 48 +- ...mm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu | 36 +- ...0_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu | 48 +- ...0_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu | 48 +- ..._bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu | 48 +- ..._gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu | 48 +- ..._gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu | 48 +- ...0_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu | 320 + .../sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu | 48 +- .../sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu | 48 +- 
.../sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu | 48 +- .../sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu | 48 +- .../sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu | 48 +- .../sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu | 132 +- ...sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu | 132 +- .../sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu | 132 +- .../sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu | 132 +- .../sm90_gemm_f8_f8_f32_tensor_op_fp32.cu | 72 + 299 files changed, 51495 insertions(+), 4413 deletions(-) create mode 100644 EULA.txt create mode 100644 examples/python/CuTeDSL/ampere/elementwise_add.py create mode 100644 examples/python/CuTeDSL/ampere/elementwise_apply.py create mode 100644 examples/python/CuTeDSL/ampere/flash_attention_v2.py create mode 100644 examples/python/CuTeDSL/ampere/sgemm.py create mode 100644 examples/python/CuTeDSL/ampere/tensorop_gemm.py create mode 100644 examples/python/CuTeDSL/blackwell/dense_gemm.py create mode 100644 examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py create mode 100644 examples/python/CuTeDSL/blackwell/fmha.py create mode 100644 examples/python/CuTeDSL/blackwell/grouped_gemm.py create mode 100644 examples/python/CuTeDSL/notebooks/README.md create mode 100644 examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/data_types.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/elementwise_add.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/hello_world.ipynb create mode 100644 
examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png create mode 100644 examples/python/CuTeDSL/notebooks/print.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/tensor.ipynb create mode 100644 examples/python/CuTeDSL/notebooks/tensorssa.ipynb rename examples/python/{ => deprecated}/00_basic_gemm.ipynb (100%) rename examples/python/{ => deprecated}/01_epilogue.ipynb (100%) rename examples/python/{ => deprecated}/02_pytorch_extension_grouped_gemm.ipynb (100%) rename examples/python/{ => deprecated}/03_basic_conv2d.ipynb (100%) rename examples/python/{ => deprecated}/04_epilogue_visitor.ipynb (100%) rename examples/python/{ => deprecated}/README.md (100%) delete mode 100644 include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp create mode 100644 media/docs/cpp/blackwell.rst create mode 100644 media/docs/cpp/build/index.rst create mode 100644 media/docs/cpp/cutlass_2x.rst create mode 100644 media/docs/cpp/cutlass_3x.rst create mode 100644 media/docs/cpp/getting_started.rst delete mode 100644 media/docs/cpp/overview.md create mode 100644 media/docs/pythonDSL/cute_dsl.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_arch.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst create mode 100644 media/docs/pythonDSL/cute_dsl_api/utils.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/debugging.rst create mode 100644 
media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/dsl_modes.png create mode 100644 media/docs/pythonDSL/cute_dsl_general/framework_integration.rst create mode 100644 media/docs/pythonDSL/cute_dsl_general/notebooks.rst create mode 100644 media/docs/pythonDSL/faqs.rst create mode 100644 media/docs/pythonDSL/functionality.rst create mode 100644 media/docs/pythonDSL/limitations.rst create mode 100644 media/docs/pythonDSL/overview.rst create mode 100644 media/docs/pythonDSL/quick_start.rst create mode 100644 python/CuTeDSL/EULA.txt create mode 100644 python/CuTeDSL/base_dsl/__init__.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/arith.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/lru_cache_ir.py create mode 100644 python/CuTeDSL/base_dsl/_mlir_helpers/op.py create mode 100644 python/CuTeDSL/base_dsl/ast_helpers.py create mode 100644 python/CuTeDSL/base_dsl/ast_preprocessor.py create mode 100644 python/CuTeDSL/base_dsl/cache_helpers.py create mode 100644 python/CuTeDSL/base_dsl/common.py create mode 100644 python/CuTeDSL/base_dsl/compiler.py create mode 100644 python/CuTeDSL/base_dsl/dsl.py create mode 100644 python/CuTeDSL/base_dsl/env_manager.py create mode 100644 python/CuTeDSL/base_dsl/jit_executor.py create mode 100644 python/CuTeDSL/base_dsl/runtime/__init__.py create mode 100644 python/CuTeDSL/base_dsl/runtime/cuda.py create mode 100644 
python/CuTeDSL/base_dsl/runtime/device_tensor.py create mode 100644 python/CuTeDSL/base_dsl/runtime/dlpack_types.py create mode 100644 python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py create mode 100644 python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py create mode 100644 python/CuTeDSL/base_dsl/typing.py create mode 100644 python/CuTeDSL/base_dsl/utils/__init__.py create mode 100644 python/CuTeDSL/base_dsl/utils/logger.py create mode 100644 python/CuTeDSL/base_dsl/utils/stacktrace.py create mode 100644 python/CuTeDSL/base_dsl/utils/timer.py create mode 100644 python/CuTeDSL/cutlass/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/elect.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/mbar.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/smem.py create mode 100644 python/CuTeDSL/cutlass/cute/arch/tmem.py create mode 100644 python/CuTeDSL/cutlass/cute/core.py create mode 100644 python/CuTeDSL/cutlass/cute/math.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/common.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py create mode 100644 
python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py create mode 100644 python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py create mode 100644 python/CuTeDSL/cutlass/cute/runtime.py create mode 100644 python/CuTeDSL/cutlass/cute/testing.py create mode 100644 python/CuTeDSL/cutlass/cute/typing.py create mode 100644 python/CuTeDSL/cutlass/impl_utils.py create mode 100644 python/CuTeDSL/cutlass/torch.py create mode 100644 python/CuTeDSL/cutlass/utils/README.md create mode 100644 python/CuTeDSL/cutlass/utils/__init__.py create mode 100644 python/CuTeDSL/cutlass/utils/ampere_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/blackwell_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py create mode 100644 python/CuTeDSL/cutlass/utils/hardware_info.py create mode 100644 python/CuTeDSL/cutlass/utils/hopper_helpers.py create mode 100644 python/CuTeDSL/cutlass/utils/layout.py create mode 100644 python/CuTeDSL/cutlass/utils/pipeline.py create mode 100644 python/CuTeDSL/cutlass/utils/smem_allocator.py create mode 100644 python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py create mode 100644 python/CuTeDSL/cutlass/utils/tensormap_manager.py create mode 100644 python/CuTeDSL/cutlass_dsl/__init__.py create mode 100644 python/CuTeDSL/cutlass_dsl/cutlass.py create mode 100644 python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py create mode 100644 python/CuTeDSL/requirements.txt create mode 100644 test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ca90d8e..813a04be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,51 +1,83 @@ -# NVIDIA CUTLASS Changelog +# Changelog -## [3.9.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.2) (2025-05-03) +# CUTLASS 4.x +## [4.0.0](https://github.com/NVIDIA/cutlass/tree/main) 
(2025-05-09) -* Fixed [Blockwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM hang issue when problem size K is 128. +### CuTe DSL +* CuTe DSL, a Python DSL centered around CuTe's abstractions + - [Core DSL implementation files](https://github.com/NVIDIA/cutlass/tree/main/python/CuTeDSL) + - [DSL quick start](./media/docs/pythonDSL/quick_start.rst) + - [DSL Overview](./media/docs/pythonDSL/overview.rst) +* [Overhauled documentation with an new dedicated website](https://docs.nvidia.com/cutlass) +* Set of examples demonstrating how to use CuTe DSL to write peak-performance kernels + - [Blackwell persistent dense GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py) + - [Blackwell grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/grouped_gemm.py) + - [Blackwell fused multi-head attention forward pass](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/fmha.py) + - [Ampere GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py) + - [FlashAttention-2 implementation targeting Ampere and Ada class GPUs (SM80, SM86, SM89)](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/flash_attention_v2.py) +* [Educational notebooks for getting started with CuTe DSL](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/notebooks) + +### CUTLASS C++ +* Support [Family Specific Architecture Features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/) which was introduced in CUDA 12.9 + - 100f, 101f, 120f were added to support Family Specific 
Architecture Features which allows running the same binary on different chips belonging to the same Family (e.g. sm100) without recompiling. +* Instruction shapes and redundant accumulation type have been removed from CUTLASS 3.x-style library kernel names to disambiguate kernels and shorten names. + - For example: + + `(old) cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + `(new) cutlass3x_sm90_tensorop_gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + - If you are using the CUTLASS library kernel names directly (e.g. to compile a subset of the CUTLASS library with `-DCUTLASS_LIBRARY_KERNELS`, filter kernels in the CUTLASS profiler with `--kernels`), please update your uses accordingly, this is a breaking change. +* Further improved [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMMs on Hopper and Blackwell. + - Added non-power-of-two tile sizes. + - Improved performance for K-major scale factors. + - The argument `mma_promotion_interval` has been removed from non-grouped GEMM to align with the grouped and Blackwell versions. +* Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! * Optimal code generation with CUDA toolkit versions 12.9. 
+# CUTLASS 3.x + +## [3.9.2](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.2) (2025-05-03) +* Fixed [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM hang issue when problem size K is 128. +* Optimal code generation with CUDA toolkit versions 12.9. ## [3.9.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.1) (2025-04-30) - * Fixed Group Gemm hang issue in CUTLASS 3.x -* Improved Hopper [Blockwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM performance. +* Improved Hopper [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMM performance. 
## [3.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.9.0) (2025-04-24) * Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). + * [Blockscaled datatypes with support for dense GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) + * [Blockscaled datatypes with support for sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) + - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. + - [Blackwell SM120 epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). 
* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). - - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). -* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. + - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). + - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). + - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). 
+ - [Grouped GEMM with nvfp4 datatype](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). + - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). + - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). +* Set of unit tests that demonstrate the usage of both [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. * Support for Blackwell SM100 Sparse kernels: - Collective mainloop that target for - * [SM100 Sparse GEMM](./include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) + * [SM100 Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) * Set of example that demonstrate the usage of the 3.x API for targeting Blackwell SM100 Sparse GEMM: - - [Sparse GEMM](./examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with NVFP4 input data type](./examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](./examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) -* Set of unit tests that demonstrate the usage of [sparse](./test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](./test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. 
-* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. -* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](./examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. -* A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. + - [Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) + - [Blockscaled Sparse GEMM with NVFP4 input data type](https://github.com/NVIDIA/cutlass/tree/main/examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) + - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](https://github.com/NVIDIA/cutlass/tree/main/examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) +* Set of unit tests that demonstrate the usage of [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. +* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. +* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. 
+* A new [distributed GEMM example](https://github.com/NVIDIA/cutlass/tree/main/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. * Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. + - Enhancement of [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. + - Enhancement of [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. 
+ - Support for [grouped GEMM with blockwise and groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. + - Support for [grouped-wise GEMM](https://github.com/NVIDIA/cutlass/tree/main/tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. + - Support for [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. + - Support for [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. + - Support for [grouped GEMM with blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. * Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. @@ -58,32 +90,32 @@ ## [3.8.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.8.0) (2025-01-25) * Support for new CuTe building blocks specifically for Blackwell SM100 architecture: - - [5th generation Blackwell Tensor Core instructions (TCGen05)](./include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms. 
- - Extensions to [Tensor Memory Accelerator](./include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms. - - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](./include/cute/pointer.hpp) across CuTe as a first class data locale. - - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](./include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe. - - [`make_tmem_copy()`](./include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms. - - Support for [new variants of LDSM on Blackwell](./include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms. + - [5th generation Blackwell Tensor Core instructions (TCGen05)](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/mma_traits_sm100.hpp) via CuTe MMA atoms. + - Extensions to [Tensor Memory Accelerator](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100_tma.hpp) via CuTe Copy atoms. + - Exposure of Blackwell's new tensor memory (note: distinct from TMA) as [`tmem`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/pointer.hpp) across CuTe as a first class data locale. + - Exposure of [`tmem->rmem`, `rmem->tmem` and `smem->tmem data movement instructions`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) as copy atoms in CuTe. + - [`make_tmem_copy()`](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) utility method to ease creation of tiled copies for tmem copy atoms. + - Support for [new variants of LDSM on Blackwell](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm100.hpp) via CuTe Copy atoms. 
* Support for new CUTLASS building blocks specifically for Blackwell SM100 architecture: - - Various narrow precision [FP4, FP6, and FP8](./include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](./include/cutlass/float_subbyte.h) - - [Pipelines that implement Blackwell specific synchronization](./include/cutlass/pipeline/sm100_pipeline.hpp). - - [Cluster launch control API supporting preferred and fallback cluster shapes](./include/cutlass/cluster_launch.hpp). + - Various narrow precision [FP4, FP6, and FP8](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/exmy_base.h) formats as well as their [block-scaled variants NVFP4, MXFP4, MXFP6, and MXFP8](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/float_subbyte.h) + - [Pipelines that implement Blackwell specific synchronization](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/pipeline/sm100_pipeline.hpp). + - [Cluster launch control API supporting preferred and fallback cluster shapes](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/cluster_launch.hpp). - Data types including NVFP4, MXFP4, MXFP6, and MXFP8 and all their supported element and scale factor types. - - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/cpp/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](./include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](./include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp). + - Tile schedulers using [Blackwell's Cluster Launch Control (CLC) feature](./media/docs/cpp/blackwell_cluster_launch_control.md) to implement dynamic persistence scheduling for [GEMMs](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_tile_scheduler.hpp), and [stream-K](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_tile_scheduler_stream_k.hpp). 
- Extensions to testbeds and reference check code for unit tests and CUTLASS profiler. * Full support for Blackwell SM100 kernels in CUTLASS 3.x API: - - [Blackwell specific kernel layers](./include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that + - [Blackwell specific kernel layers](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp) that + Implement a new warp-specialization recipe tuned specifically for Blackwell SM100 architecture. + Leverage all the new features such as CLC based tile scheduling, preferred cluster, and TMEM based double buffering of accumulators. + Support stream-K load balancing for all kernel types everywhere via composable scheduler support. - Blackwell collective mainloops that target the TCGen05 MMA instructions (both SS and TS) for - * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp) - * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp) - * [Block scaled data types without support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp) - * [Block scaled data types with support for pointer array and grouped GEMM with TMA](./include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp) - - Blackwell [collective mainloop for convolution kernels](./include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad. - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp), [convolution](./include/cutlass/conv/dispatch_policy.hpp), and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. 
- - [Blackwell epilogue that supports loading accumulators from `tmem`](./include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and [full set of EVT fusions](). + * [Non-block scaled data types without support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_warpspecialized.hpp) + * [Non-block scaled data types with support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized.hpp) + * [Block scaled data types without support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_blockscaled_mma_warpspecialized.hpp) + * [Block scaled data types with support for pointer array and grouped GEMM with TMA](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp) + - Blackwell [collective mainloop for convolution kernels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/collective/sm100_implicit_gemm_umma_warpspecialized.hpp) supporting non-block scaled data types for fprop, dgrad, and wgrad. + - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp), [convolution](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/dispatch_policy.hpp), and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. + - [Blackwell epilogue that supports loading accumulators from `tmem`](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp) and full set of EVT fusions. * CUTLASS library and profiler integration for block scaled data types for kernel emission, profiling, and verification. 
- Support for preferred and fallback cluster shapes via profiler command line arguments parsing to set dynamic cluster shapes. - Support for dynamic datatypes by parsing profiler via profiler command line arguments parsing to set dynamic datatype setting in TCGen05 MMA instruction descriptors. @@ -91,81 +123,81 @@ * New CUTLASS profiler flag `use-cuda-graphs` to reduce overheads when benchmarking launch-bound kernels. * A new 3.x version of grouped GEMM to the CUTLASS library and generates kernels for Hopper and Blackwell. Now grouped GEMM support is enabled in the CUTLASS profiler (`./cutlass_profiler --operation=GroupedGemm --help` for details). * Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM100 architecture: - - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](./examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. - - GEMM with [opt-in collective builder schedules showcasing available recipes](./examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. + - [Basic FP16 and FP8 GEMMs with minimal changes from Hopper examples](https://github.com/NVIDIA/cutlass/tree/main/examples/70_blackwell_gemm/), demonstrating ease of migration for off the shelf kernels using the 3.x collective builder API. + - GEMM with [opt-in collective builder schedules showcasing available recipes](https://github.com/NVIDIA/cutlass/tree/main/examples/71_blackwell_gemm_with_collective_builder/71_blackwell_gemm_with_collective_builder.cu) for Blackwell. 
- Block scaled data type GEMMs targeting Blackwell's native block scaled Tensor Cores: - + [NVFP4 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu) - + [NVFP4 inputs with NVFP4 output](./examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu) - + [Mixed MXFP8 and MXFP6 inputs with BF16 output](./examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu) - - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](./examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy. - - [GEMM with CLC based StreamK scheduler for load balancing](./examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu). - - Grouped GEMM for [vanilla FP8 data inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](./examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). - - Convolution kernels for [fprop](./examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](./examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](./examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). - - [Fused multi-head attention fprop kernel](./examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32,64, and 128. - - A new BF16x9 GEMM [kernel](./examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. 
+ + [NVFP4 inputs with BF16 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72a_blackwell_nvfp4_bf16_gemm.cu) + + [NVFP4 inputs with NVFP4 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72b_blackwell_nvfp4_nvfp4_gemm.cu) + + [Mixed MXFP8 and MXFP6 inputs with BF16 output](https://github.com/NVIDIA/cutlass/tree/main/examples/72_blackwell_narrow_precision_gemm/72c_blackwell_mixed_mxfp8_bf16_gemm.cu) + - GEMM example demonstrating [Blackwell's new preferred cluster support via dynamic cluster shapes](https://github.com/NVIDIA/cutlass/tree/main/examples/73_blackwell_gemm_preferred_cluster/blackwell_gemm_preferred_cluster.cu) for increased occupancy. + - [GEMM with CLC based StreamK scheduler for load balancing](https://github.com/NVIDIA/cutlass/tree/main/examples/74_blackwell_gemm_streamk/blackwell_gemm_streamk.cu). + - Grouped GEMM for [vanilla FP8 data inputs](https://github.com/NVIDIA/cutlass/tree/main/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu) and [NVFP4 block scaled inputs](https://github.com/NVIDIA/cutlass/tree/main/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu). + - Convolution kernels for [fprop](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_fprop.cu), [dgrad](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_dgrad.cu), and [wgrad](https://github.com/NVIDIA/cutlass/tree/main/examples/76_blackwell_conv/76_blackwell_conv_wgrad.cu). + - [Fused multi-head attention fprop kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/77_blackwell_fmha/77_blackwell_fmha.cu) supporting fp16/bf16/fp8 data types across head dims of 32, 64, and 128. 
+ - A new BF16x9 GEMM [kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/78_blackwell_emulated_bf16x9_gemm/78_blackwell_emulated_bf16x9_gemm.cu) that emulates FP32 GEMM (SGEMM) using BF16 operations. * Set of examples that demonstrate the usage of the 3.x API for targeting Hopper architecture: - - A set of new [Hopper grouped GEMM kernels](./examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. - - A new [Hopper FP8 GEMM with groupwise scaling](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). + - A set of new [Hopper grouped GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/examples/69_hopper_mixed_dtype_grouped_gemm/) that support mixed A and B datatypes. + - A new [Hopper FP8 GEMM with groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu). * Documentation updates: - - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/cpp/quickstart.md#instantiating-a-blackwell-gemm-kernel). + - [Quickstart - instantiating a Blackwell block-scaled GEMM](./media/docs/cpp/quickstart.md#instantiating-a-blackwell-sm100-gemm-kernel). - Detailed [Blackwell block-scaled GEMM functionality documentation](./media/docs/cpp/blackwell_functionality.md) - A new [functionality documentation](./media/docs/cpp/functionality.md) specifically for 3.x API comprehensively documenting all supported kernel types, data types, kernel features, minimum CUDA tookit support etc for 3.x supported architectures. - - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#Target-Architecture). 
+ - Updates to [compatibility](./README.md#compatibility) section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures, and [Target Architecture](./README.md#target-architecture). - Updates to [profiler documentation](./media/docs/cpp/profiler.md) for testing mixed input GEMM kernels on Hopper. ## [3.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.7.0) (2025-01-11) -- [Hopper blockwise scaling FP8 GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses 2D scaling tensor, assigning one value per threadblock. This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop. Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439). -- [Distributed GEMM](./examples/65_distributed_gemm/65_distributed_gemm.cu) is a new (experimental) API which can turn existing CUTLASS GEMM kernels into pipelined Tensor Parallel GEMMs that run efficiently on NVLink-based network of GPUs. Its pipelining schedules can hide most of the communication behind computation, and relies on point-to-point communication, which can simply use CUDA runtime's peer device access feature. It also utilizes remote TMA loads and memcopies with CUDA graphs to handle communication primarily through the Copy Engine, leaving all SMs free for Hopper's persistent kernels. For more details you can refer to the [DistGEMM blog post](https://blog.shi-labs.com/distributed-gemm-88be6a481e2b). -- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). 
+- [Hopper blockwise scaling FP8 GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) uses 2D scaling tensor, assigning one value per threadblock. This allows a finer-grained scaling to be applied for each output tile per gemm-k iteration. The operands and scaling tensors are loaded from global memory to shared memory using TMA and cp_async, respectively. The scaling is applied inside the mainloop. Details with figures are [here](https://github.com/NVIDIA/cutlass/pull/1932#issue-2645398439). +- [Distributed GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/65_distributed_gemm/65_distributed_gemm.cu) is a new (experimental) API which can turn existing CUTLASS GEMM kernels into pipelined Tensor Parallel GEMMs that run efficiently on NVLink-based network of GPUs. Its pipelining schedules can hide most of the communication behind computation, and relies on point-to-point communication, which can simply use CUDA runtime's peer device access feature. It also utilizes remote TMA loads and memcopies with CUDA graphs to handle communication primarily through the Copy Engine, leaving all SMs free for Hopper's persistent kernels. For more details you can refer to the [DistGEMM blog post](https://blog.shi-labs.com/distributed-gemm-88be6a481e2b). +- Improved persistent grid launch for Hopper kernels with large cluster sizes (>= size of 4) using the new `make_kernel_hardware_info` API as shown in [example 48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). - Enabled high precision accumulation for Hopper FP8 Sparse GEMM. - Potential API breaking changes: + Fix `cute::UniversalCopy` for type safety. + No longer implicitly select `cute::SM80_CP_ASYNC_*` based on input tensors. This avoids implicit downstream synchronization requirements. 
To use `SM80_CP_ASYNC`, users must explicitly select the appropriate CopyAtom. + Fix `cute::SM80_CP_ASYNC_CACHEALWAYS`, `cute::SM80_CP_ASYNC_CACHEGLOBAL`, `cute::SM80_CP_ASYNC_CACHEALWAYS_ZFILL`, `cute::SM80_CP_ASYNC_CACHEGLOBAL_ZFILL` to avoid implicitly selecting `ZFILL` behavior on predication. + Remove `cute::copy_vec` in favor of `cute::copy_aligned` and `cute::copy(AutoVectorizingCopyWithAssumedAlignment,...)`. - + A refactor of default epilogue struct `DefaultEpilogue` [API](./include/cutlass/epilogue/collective/default_epilogue.hpp) to avoid reading non-void `ElementC` value for `ElementC = void` kernel. + + A refactor of default epilogue struct `DefaultEpilogue` [API](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/default_epilogue.hpp) to avoid reading non-void `ElementC` value for `ElementC = void` kernel. - New CUTLASS profiler flags: `profiling-duration`, `min-iterations`, and `kernels-file` documented in [profiler.md](./media/docs/cpp/profiler.md#cutlass-profiler). - Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! - Optimal code generation with CUDA toolkit versions 12.6. ## [3.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.6.0) (2024-10-03) -- [Hopper structured sparse GEMM](./examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). - + [FP16](./test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) - + [FP8](./test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) - + [INT8](./test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) - + [TF32](./test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) -- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](./include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. 
-- [An improved mixed input GEMM](./examples/55_hopper_mixed_dtype_gemm/README.md) and a [lookup table implementation](./examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. -- [EVT nodes for Top-K selection and softmax](./include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](./examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). -- [Programmatic Dependent Launch](./include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/cpp/dependent_kernel_launch.md). -- [A new debugging tool, synclog](./include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/cpp/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. -- A new TMA-enabled [epilogue](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. -- A SIMT-enabled pointer-array [epilogue](./include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). -- A new [Ping-Pong kernel schedule for Grouped GEMM](./include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. -- [A new instantiation strategy for CUTLASS profiler kernels](./python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/cpp/profiler.md#instantiating-more-kernels-with-hopper). -- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](./include/cutlass/bfloat16.h) -- Fixed use of isnan on Windows for [`half_t`](./test/unit/core/functional.cu). 
+- [Hopper structured sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/62_hopper_sparse_gemm/62_hopper_sparse_gemm.cu). + + [FP16](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_f16_f16_f32_tensor_op_f32.cu) + + [FP8](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_f8_f8_f32_tensor_op_f32.cu) + + [INT8](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_s8_s8_s32_tensor_op_s32.cu) + + [TF32](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_sparse_gemm_tf32_tf32_f32_tensor_op_f32.cu) +- A refactor to the CUTLASS 3.x convolution `kernel::ConvUniversal` [API](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/kernel/sm90_implicit_gemm_tma_warpspecialized.hpp) to bring it in line with `gemm::GemmUniversal`. Now the 3.x convolution API is no longer considered as a beta API. +- [An improved mixed input GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm/cpp/README.md) and a [lookup table implementation](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm/55_hopper_int4_fp8_gemm.cu) for `INT4`x`FP8` scale-only mode. +- [EVT nodes for Top-K selection and softmax](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm90_visitor_topk_softmax.hpp) and [GEMM example using those](https://github.com/NVIDIA/cutlass/tree/main/examples/61_hopper_gemm_with_topk_and_softmax/61_hopper_gemm_with_topk_and_softmax.cu). +- [Programmatic Dependent Launch](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/grid_dependency_control.h) (PDL) that leverages a new Hopper feature to speedup two back-to-back kernels, and its corresponding [documentations](./media/docs/cpp/dependent_kernel_launch.md). 
+- [A new debugging tool, synclog](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/synclog.hpp), for dumping out all synchronization events from within a kernel to a file. Please see [synclog documentation](./media/docs/cpp/utilities.md#debugging-asynchronous-kernels-with-cutlasss-built-in-synclog-tool) for details. +- A new TMA-enabled [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for grouped GEMM that brings significant performance improvement, as well as its EVT support. +- A SIMT-enabled pointer-array [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm70_epilogue_vectorized_array.hpp). +- A new [Ping-Pong kernel schedule for Grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp) and some other optimizations. +- [A new instantiation strategy for CUTLASS profiler kernels](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/sm90_shapes.py) along with [improved documentation for instantiation level in CUTLASS profiler](./media/docs/cpp/profiler.md#instantiating-more-kernels-with-hopper). +- A new hardware support for comparisons and computations of [`cutlass::bfloat16_t`](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/bfloat16.h) +- Fixed use of isnan on Windows for [`half_t`](https://github.com/NVIDIA/cutlass/tree/main/test/unit/core/functional.cu). - Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! - Optimal code generation with CUDA toolkit versions 12.6. 
## [3.5.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.1) (2024-07-25) -- [Minimal SM90 WGMMA + TMA GEMM example in 100 lines of code](./examples/cute/tutorial/wgmma_sm90.cu) -- [Exposure of L2 `cache_hint`s in TMA copy atoms](./include/cute/arch/copy_sm90_tma.hpp#L48) -- Exposure of raster order and tile swizzle extent in [CUTLASS library profiler](./media/docs/cpp/profiler.md#GEMM), and -[example 48](./examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). -- [TMA store based and EVT supported epilogues](./include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for [Hopper pointer array batched kernels](./test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu). -- A new [`GemmSparseUniversal` API for CUTLASS 2.x Ampere kernels](./include/cutlass/gemm/device/gemm_sparse_universal.h) to enable serial and parallel split-k for sparse tensor cores and new tiny tile sizes to better support LLM inferrence: - + [FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu#L269-L393) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu#L269-L411). - + [int8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). - + [int4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). - + [FP32 TN](./test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu#L427-L642) and [NT](./test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu#L427-L456). -- [CUDA host adapter](./include/cutlass/cuda_host_adapter.hpp) extensions to support TMA descriptor construction driver APIs. -- Inclusion of more [Hopper fprop, dgrad, and wgrad convolution kernels in CUTLASS library and profiler](./python/cutlass_library/generator.py). 
+- [Minimal SM90 WGMMA + TMA GEMM example in 100 lines of code](https://github.com/NVIDIA/cutlass/tree/main/examples/cute/tutorial/wgmma_sm90.cu) +- [Exposure of L2 `cache_hint`s in TMA copy atoms](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/copy_sm90_tma.hpp#L48) +- Exposure of raster order and tile swizzle extent in [CUTLASS library profiler](./media/docs/cpp/profiler.md#gemm), and +[example 48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm/48_hopper_warp_specialized_gemm.cu). +- [TMA store based and EVT supported epilogues](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp) for [Hopper pointer array batched kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_ptr_array.cu). +- A new [`GemmSparseUniversal` API for CUTLASS 2.x Ampere kernels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/device/gemm_sparse_universal.h) to enable serial and parallel split-k for sparse tensor cores and new tiny tile sizes to better support LLM inference: + + [FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f32t_tensor_op_f32_sparse_sm80.cu#L269-L393) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16t_f32t_tensor_op_f32_sparse_sm80.cu#L269-L411). + + [int8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [int4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s32t_tensor_op_s32_sparse_sm80.cu#L264-L452). + + [FP32 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f32t_f32n_f32t_tensor_op_f32_sparse_sm80.cu#L427-L642) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f32n_f32t_f32t_tensor_op_f32_sparse_sm80.cu#L427-L456). 
+- [CUDA host adapter](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/cuda_host_adapter.hpp) extensions to support TMA descriptor construction driver APIs. +- Inclusion of more [Hopper fprop, dgrad, and wgrad convolution kernels in CUTLASS library and profiler](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/generator.py). - Support for residual add (beta != 0) in convolution kernels. -- A new convolution [epilogue](./examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu#L269) for CUTLASS 2.x to support non-packed NHWC output. -- A refactor of [include files throughout CUTLASS core directories](./include/cutlass/gemm/collective/collective_mma_decl.hpp) to reduce circular dependencies and [tests to guard against them](./test/self_contained_includes/CMakeLists.txt). +- A new convolution [epilogue](https://github.com/NVIDIA/cutlass/tree/main/examples/16_ampere_tensorop_conv2dfprop/ampere_tensorop_conv2dfprop.cu#L269) for CUTLASS 2.x to support non-packed NHWC output. +- A refactor of [include files throughout CUTLASS core directories](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/collective_mma_decl.hpp) to reduce circular dependencies and [tests to guard against them](https://github.com/NVIDIA/cutlass/tree/main/test/self_contained_includes/CMakeLists.txt). - [A guide for setting up VSCode to work well with CUTLASS](./media/docs/cpp/ide_setup.md) and [expanded code style guide](./media/docs/cpp/programming_guidelines.md). - Better support for MSVC as a host compiler. - Many performance optimizations, improvements, and bug fixes including fixes for FlashAttention-2. 
@@ -173,49 +205,49 @@ ## [3.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.5.0) (2024-04-09) -- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](./include/cute/atom/copy_traits_sm90_im2col.hpp) +- Implicit GEMM Convolutions targeting Hopper SM90A via WGMMA + [TMA im2col](https://github.com/NVIDIA/cutlass/tree/main/include/cute/atom/copy_traits_sm90_im2col.hpp) + Native implementation in CUTLASS 3.x using CuTe, mirroring the [same design hierarchy as that of GEMMs](./media/docs/cpp/gemm_api_3x.md). - + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](./include/cutlass/conv/convnd_problem_shape.hpp). - + Support for [Fprop](./test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](./test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](./test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms - + [CUTLASS profiler support](./python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API. + + Support for 1D, 2D, and 3D convolutions in a [rank-agnostic fashion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/convnd_problem_shape.hpp). + + Support for [Fprop](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/fprop/sm90_conv3d_fprop_implicit_gemm_s8_s8_s32_tensorop_s32.cu), [Dgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/dgrad/sm90_conv2d_dgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu), and [Wgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device_3x/wgrad/sm90_conv1d_wgrad_implicit_gemm_f16_f16_f32_tensorop_f16.cu) algorithms + + [CUTLASS profiler support](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/conv3x_emitter.py) for 2D and 3D convolutions implemented via the 3.x API. + NOTE: this is a beta release. 
Further updates to CUTLASS will include major performance improvements, feature enablement, and possible breaking changes to the API until 3.7 release. Your feedback is welcome on the design! -- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](./examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer. -- [Ampere gather/scatter convolution example](./examples/59_ampere_gather_scatter_conv/README.md) in CuTe and CUTLASS 3.x +- Support for [Ada (SM89) FP8 tensor cores via the 2.x API](https://github.com/NVIDIA/cutlass/tree/main/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu). Requires CUDA 12.4 or newer. +- [Ampere gather/scatter convolution example](https://github.com/NVIDIA/cutlass/tree/main/examples/59_ampere_gather_scatter_conv/cpp/README.md) in CuTe and CUTLASS 3.x + Showcasing how custom kernels can be written and optimized using CUTLASS 3.x and CuTe and the general strategy for implementing convolutions as specializations of GETTs. + Implementation of a coarse grained sparse gather/scatter kernel achieving peak performance on Ampere class tensor cores. - 32x and 16x tile sizes are added to CUTLASS 2.x to improve the performance of narrow-tall and wide-short matrices. - + [Ampere FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu) and [NT](./test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu#L227-L301), [Ampere INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu#L392-L1342), [Ampere INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu#L372-L934). - + [Turing FP16 TN](./test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu#L55-L394), [Turing INT8 TN](./test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu#L166-L537), [Turing INT4 TN](./test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu#L310-L564). 
-- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cpp/cute/03_tensor.md), [MMA atoms](./media/docs/cpp/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](./examples/cute/tutorial). -- Extensions to CuTe to support [L2 prefetching](./include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](./include/cute/arch/copy_sm90_tma.hpp#L1337). + + [Ampere FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm80.cu) and [NT](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16t_f16t_tensor_op_f32_sm80.cu#L227-L301), [Ampere INT8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm80.cu#L392-L1342), [Ampere INT4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm80.cu#L372-L934). + + [Turing FP16 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16t_f16n_f16t_tensor_op_f32_sm75.cu#L55-L394), [Turing INT8 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s8t_s8n_s8t_tensor_op_s32_sm75.cu#L166-L537), [Turing INT4 TN](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_s4t_s4n_s4t_tensor_op_s32_sm75.cu#L310-L564). +- Updates to CuTe documentation for [`cute::Tensor<>`](./media/docs/cpp/cute/03_tensor.md), [MMA atoms](./media/docs/cpp/cute/0t_mma_atom.md), and an overhauled [CuTe GEMM tutorial series](https://github.com/NVIDIA/cutlass/tree/main/examples/cute/tutorial). +- Extensions to CuTe to support [L2 prefetching](https://github.com/NVIDIA/cutlass/tree/main/include/cute/algorithm/prefetch.hpp) and [TMA store+reductions](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/copy_sm90_tma.hpp#L1337). - Remove C++11 requirement on a few CUTLASS 2.x API header files. All CUTLASS files now require C++17. - Fixes to greatly reduce build warnings. 
- Updates and bugfixes from the community (thanks!) ## [3.4.1](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.1) (2024-02-14) -- Statically available [CUTLASS Version macros](./include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side. -- Improvements for Hopper [Group-GEMMs](./examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm). +- Statically available [CUTLASS Version macros](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/version.h) that allow for handling API changes between CUTLASS releases on the users' side. +- Improvements for Hopper [Group-GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/57_hopper_grouped_gemm) and [Pointer-Array Batched GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/56_hopper_ptr_array_batched_gemm). - Updates and bugfixes from the community (thanks!). ## [3.4.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.4.0) (2024-01-12) -* Expanded [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors. -* Performance improvements to [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) -* Beta release of [Pointer-Array Batched GEMMs](./examples/56_hopper_ptr_array_batched_gemm) now available on Hopper GPUs utilizing TMA and WGMMA (requires CUDA 12.3 or above). -* Beta release of [Group-GEMM](./examples/57_hopper_grouped_gemm) utilizing TMA and WGMMA (requires CUDA 12.3 or above). -* [Ampere Sparse GEMM](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now. -* NamedBarriers usability improvement and list of [ReservedNamedBarriers](./include/cutlass/arch/barrier.h) has been officially released. 
-* Improved [CuTe documentation](./media/docs/cpp/cute/) including improved clarity and depth of [Quickstart](./media/docs/cute/00_quickstart.md), [CuTe Layout](./media/docs/cpp/cute/01_layout.md), and [CuTe Layout Algebra](./media/docs/cpp/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](./test/unit/cute/core/) also improved. +* Expanded [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) support covering {16-bit, 8-bit} x {8-bit, 4-bit} input types with fast numerical converters and group scaling factors. +* Performance improvements to [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) +* Beta release of [Pointer-Array Batched GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/56_hopper_ptr_array_batched_gemm) now available on Hopper GPUs utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* Beta release of [Group-GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/57_hopper_grouped_gemm) utilizing TMA and WGMMA (requires CUDA 12.3 or above). +* [Ampere Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm_with_visitor.cu) supports Epilogue Visitor Tree (EVT) now. +* NamedBarriers usability improvement and list of [ReservedNamedBarriers](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/barrier.h) has been officially released. +* Improved CuTe documentation including improved clarity and depth of [Quickstart](./media/docs/cpp/cute/00_quickstart.md), [CuTe Layout](./media/docs/cpp/cute/01_layout.md), and [CuTe Layout Algebra](./media/docs/cpp/cute/02_layout_algebra.md). Associated code comments, post-conditions, and details in [CuTe Core Unit Tests](./test/unit/cute/core/) also improved. 
## [3.3](https://github.com/NVIDIA/cutlass/releases/tag/v3.3.0) (2023-10-31) -* [Mixed-input Hopper GEMMs](./examples/55_hopper_mixed_dtype_gemm) support covering 16-bit x 8-bit input operand types. +* [Mixed-input Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/55_hopper_mixed_dtype_gemm) support covering 16-bit x 8-bit input operand types. * [Mixed-input Ampere GEMMs](https://github.com/NVIDIA/cutlass/pull/1084) with support for canonical layouts (TN). The implementation supports upcast on operandB {fp16, bf16} x {s8, u8}, and upcast on operandA {s8, u8} x {fp16, bf16}. -* [Copy Async based Hopper GEMMs](./test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors. +* [Copy Async based Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_bf16_bf16_bf16_alignx_tensor_op_f32_warpspecialized_cooperative.cu) - which support lower than 16B aligned input tensors. * Kernel schedules and Builder support for mixed precision and Copy Async GEMMs with < 16B aligned input tensors. * Profiler support for lower-aligned Hopper GEMMs. -* Performance Improvements to [Scatter-Gather Hopper Example](./examples/52_hopper_gather_scatter_fusion). +* Performance Improvements to [Scatter-Gather Hopper Example](https://github.com/NVIDIA/cutlass/tree/main/examples/52_hopper_gather_scatter_fusion). * Sub-Byte type fixes and improvements. -* EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](./include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. +* EVT Support for RELU with Aux bitmap tensor store (used in dRELU). See [SM90 EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm90_visitor_compute_tma_warpspecialized.hpp) for details. * Fusion support for backprop fusions including drelu, dgelu, and dbias. 
* Support for void-C kernels and SM80 mixed-input GEMMs in the CUTLASS Python interface @@ -227,7 +259,7 @@ * SM80 EVT support in C++ and Python. * Other SM90 epilogue improvements. * Splitting CUTLASS library into smaller units based on operation, arch and datatypes. See [1105](https://github.com/NVIDIA/cutlass/discussions/1105) for details. -* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](./python/README.md) for details. +* Making `tools/library/scripts` packageable - `tools/library/scripts` is now moving to `python/cutlass_library`. See the Python [README](https://github.com/NVIDIA/cutlass/tree/main/python/README.md) for details. * SM90 TF32 kernel improvements for all layouts. * SM90 rasterization direction support in the CUTLASS profiler. * Improvement for CUTLASS profiler build times. @@ -235,34 +267,34 @@ ## [3.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.2.0) (2023-08-03) -* New warp-specialized persistent FP8 GEMM kernel [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](./examples/54_hopper_fp8_warp_specialized_gemm). FP8 GEMMs come with a fast accumulation mode. When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results will not periodically be promoted to a higher precision. -* New [Epilogue Visitor Tree (EVT)](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allows for user-defined customized epilogue fusion patterns without having to write a new epilogue. 
-* [Stream-K](./include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. -* Improved CTA rasterization and support for CTA swizzling for Hopper kernels using the [Tile Scheduler](./include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). -* Improved performance for [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. -* [Hopper GEMM+Permute](./examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu), an example of fusing tensor reordering (permutation) with GEMM mainloop or epilogue. -* New CUTLASS 2D Convolution Python interface. New [example](./examples/python/03_basic_conv2d.ipynb) here. +* New warp-specialized persistent FP8 GEMM kernel [kernel schedules](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) and [mainloops](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. An example showcasing [Hopper warp-specialized FP8 GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/54_hopper_fp8_warp_specialized_gemm). FP8 GEMMs come with a fast accumulation mode. When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results will not periodically be promoted to a higher precision. +* New [Epilogue Visitor Tree (EVT)](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu) support for Hopper TMA epilogues. EVTs allow for user-defined customized epilogue fusion patterns without having to write a new epilogue. 
+* [Stream-K](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_tile_scheduler_stream_k.hpp) feature for Hopper. Note that this is only a functional implementation of stream-K, and should not be used for performance comparison. Optimizations are expected in a future release. +* Improved CTA rasterization and support for CTA swizzling for Hopper kernels using the [Tile Scheduler](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp). +* Improved performance for [warp-specialized TensorFloat-32 (TF32) GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. +* [Hopper GEMM+Permute](https://github.com/NVIDIA/cutlass/tree/main/examples/53_hopper_gemm_permute/53_hopper_gemm_permute.cu), an example of fusing tensor reordering (permutation) with GEMM mainloop or epilogue. +* New CUTLASS 2D Convolution Python interface. New [example](https://github.com/NVIDIA/cutlass/tree/main/examples/python/03_basic_conv2d.ipynb) here. * Support for Windows (MSVC) builds. Tested with Visual Studio 2019 v16.11.27 on Windows 10.0. * Optimal performance using [**CUDA 12.2u1**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) ## [3.1.0](https://github.com/NVIDIA/cutlass/releases/tag/v3.1.0) (2023-04-14) -* New CUTLASS Python interface that aims to provide an ease-of-use interface for instantiating, emitting, compiling, and running CUTLASS kernels via Python. More details [here](./python/README.md) and new [examples](./examples/python). -* New [efficient epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu#L783) using TMA for Hopper. 
-* Support for [fused epilogues](test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu), such Bias, ReLU and GELU, using the new efficient epilogues. -* New [warp-specialized TensorFloat-32 (TF32) GEMM kernels](test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. -* New [*warp-specialized persistent cooperative*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) kernel design that allows for larger tile sizes and improves performance on Hopper. -* An [example](./examples/51_hopper_gett) showcasing GEMM-Like Tensor-Tensor Contraction (GETT) capability on Hopper. -* Epilogue builders. Similar to mainloop builders (see [example 49](./examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu)), epilogue builders aim to generate the best-possible epilogue while exposing incremental opt-ins for greater customization. +* New CUTLASS Python interface that aims to provide an ease-of-use interface for instantiating, emitting, compiling, and running CUTLASS kernels via Python. More details [here](https://github.com/NVIDIA/cutlass/tree/main/python/cpp/README.md) and new [examples](https://github.com/NVIDIA/cutlass/tree/main/examples/python). +* New [efficient epilogues](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative.cu#L783) using TMA for Hopper. +* Support for [fused epilogues](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_f16_f16_f16_tensor_op_f32_cluster_warpspecialized_cooperative_bias_elementwise.cu), such as Bias, ReLU and GELU, using the new efficient epilogues. +* New [warp-specialized TensorFloat-32 (TF32) GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm90_gemm_tf32_tf32_f32_tensor_op_f32_gmma_rs_cluster_warpspecialized.cu) targeting Hopper TMA. 
+* New [*warp-specialized persistent cooperative*](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp) kernel design that allows for larger tile sizes and improves performance on Hopper. +* An [example](https://github.com/NVIDIA/cutlass/tree/main/examples/51_hopper_gett) showcasing GEMM-Like Tensor-Tensor Contraction (GETT) capability on Hopper. +* Epilogue builders. Similar to mainloop builders (see [example 49](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu)), epilogue builders aim to generate the best-possible epilogue while exposing incremental opt-ins for greater customization. * Profiler support for overriding kernel and epilogue builder auto schedules for 3.x API kernels, allowing specific policies to be run in the CUTLASS profiler. -* Performance optimizations for the [*warp-specialized persistent ping-pong*](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) kernel. +* Performance optimizations for the [*warp-specialized persistent ping-pong*](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp) kernel. * Changes to the [GEMM API 3.x](./media/docs/cpp/gemm_api_3x.md), involving the host-facing arguments and the underlying `Params` structs. -* [FMHA Backward Pass](./examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu) from Meta xFormers. -* [Streamk GEMM with Broadcast](./examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu) enables epilogue broadcast with StreamK GEMM. -* [Batched B2B GEMM](./examples/13_two_tensor_op_fusion) now can run multiple Back-to-Back GEMM with the same problem size in parallel. -* [Batched Strided GEMV](test/unit/gemm/device/gemv.cu) support both row major and column major input matrix. 
-* [Permute + GEMM fusion](./examples/39_gemm_permute) can fuse Permute with following GEMM now. Before, we only support fusing GEMM with Permute in the epilogue. -* [Row Broadcast](./include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h) can be fused in the epilogue. +* [FMHA Backward Pass](https://github.com/NVIDIA/cutlass/tree/main/examples/41_fused_multi_head_attention/fused_multi_head_attention_backward.cu) from Meta xFormers. +* [Streamk GEMM with Broadcast](https://github.com/NVIDIA/cutlass/tree/main/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk_broadcast.cu) enables epilogue broadcast with StreamK GEMM. +* [Batched B2B GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) now can run multiple Back-to-Back GEMM with the same problem size in parallel. +* [Batched Strided GEMV](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemv.cu) supports both row-major and column-major input matrices. +* [Permute + GEMM fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/39_gemm_permute) can fuse Permute with the following GEMM now. Before, we only supported fusing GEMM with Permute in the epilogue. +* [Row Broadcast](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/threadblock/predicated_tile_iterator_row_broadcast.h) can be fused in the epilogue. * The GitHub branch is renamed from `master` to `main` in this release. * Optimal performance using [**CUDA 12.1**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) @@ -272,28 +304,30 @@ * [A new conceptual operation hierarchy](./media/docs/cpp/cutlass_3x_design.md) that replaces the architecture-centric hierarchy of CUTLASS 2.x and [documentation for CUTLASS 3.0's GEMM API changes](./media/docs/cpp/gemm_api_3x.md). 
* Strict API backwards compatibility that exposes both 2.x and 3.x API kernels through the same [`device::GemmUniversalAdapter`](./include/cutlass/gemm/device/gemm_universal_adapter.h) and [`kernel::GemmUniversal`](./include/cutlass/gemm/kernel/gemm_universal.hpp) types, allowing users to include both APIs in the same translation units. More information can be found in the [3.x backwards compatibility section](./media/docs/cpp/cutlass_3x_backwards_compatibility.md). * Updates to [Functionality](./media/docs/cpp/functionality.md) which directs users on which kernels are supported via CUTLASS-2 and CUTLASS-3. -* Updates to [Compatibility](./README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](./README.md#Target-Architecture). -* New warp-specialized GEMM [kernel schedules](./include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](./include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. +* Updates to [Compatibility](./README.md#compatibility) Section regarding supported compilers, operating systems, CUDA Toolkits, Hardware Architectures and [Target Architecture](./README.md#target-architecture). +* New warp-specialized GEMM [kernel schedules](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp) and [mainloops](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp) targeting Hopper architecture that achieve great performance with TMA, WGMMA, and threadblock clusters. * Extensions to CUTLASS profiler to support threadblock cluster shapes in library and profiler tile configurations. 
-* [CUTLASS library integration](./tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler. -* Support for [Hopper GEMMs](./examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features. -* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](./examples/48_hopper_warp_specialized_gemm), [49](./examples/49_hopper_gemm_schedules_with_collective_builder), and [50](./examples/50_hopper_gemm_with_epilogue_swizzle). +* [CUTLASS library integration](https://github.com/NVIDIA/cutlass/tree/main/tools/library/src/gemm_operation_3x.hpp) for 3.x API kernels built through the new `CollectiveBuilder` API, enabling CUTLASS profiler. +* Support for [Hopper GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm) through the new 3.0 API with CuTe-based exposure of the Hopper [Tensor Memory Accelerator](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor) and [WGMMA Tensor Core](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) features. 
+* Set of examples that demonstrate the usage of the new 3.0 API to easily build GEMM kernels targeting Hopper: examples [48](https://github.com/NVIDIA/cutlass/tree/main/examples/48_hopper_warp_specialized_gemm), [49](https://github.com/NVIDIA/cutlass/tree/main/examples/49_hopper_gemm_schedules_with_collective_builder), and [50](https://github.com/NVIDIA/cutlass/tree/main/examples/50_hopper_gemm_with_epilogue_swizzle). + +# CUTLASS 2.x ## [2.11.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.11.0) (2022-11-19) -* [Stream-K](./examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. -* [Fused multi-head attention Kernel](./examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel. -* [Dual GEMM](./examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. Two GEMMs has no producer-consumer dependency. -* Hopper improves [double precision matrix multiplication](./test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8. -* [BLAS3](./test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hoppers new double precision matrix multiplication instructions. -* [ELL Block Sparse GEMM](./examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of A matrix. B and output matrices are still dense. The block size can be arbitary. 
-* Optimized [Group Conv](./examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N. -* [Optimized DepthWise Conv](./examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added - * [kOptimized](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - use direct conv to compute instead of implicit GEMM. +* [Stream-K](https://github.com/NVIDIA/cutlass/tree/main/examples/47_ampere_gemm_universal_streamk), which is a new general way to do split-K. It can not only improve performance, but can also significantly reduce the number of tile sizes that need to be profiled to find the best one. +* [Fused multi-head attention Kernel](https://github.com/NVIDIA/cutlass/tree/main/examples/41_fused_multi_head_attention). It has two variants: one uses batched GEMM for the fixed sequence length, and the other one uses group GEMM for the variable sequence length. Both versions just need one kernel. +* [Dual GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/45_dual_gemm), which can fuse A x B and A x C into one kernel. The two GEMMs have no producer-consumer dependency. +* Hopper improves [double precision matrix multiplication](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f64n_f64t_f64t_tensor_op_f64_sm90.cu) by 2x compared to Ampere at iso-clocks. It is supported since CUDA 11.8. +* [BLAS3](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu) functions with Hopper's new double precision matrix multiplication instructions. +* [ELL Block Sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/43_ell_block_sparse_gemm), which uses an [ELL matrix](https://developer.nvidia.com/blog/accelerating-matrix-multiplication-with-block-sparse-format-and-nvidia-tensor-cores/) to describe the sparsity of A matrix. 
B and output matrices are still dense. The block size can be arbitrary. +* Optimized [Group Conv](https://github.com/NVIDIA/cutlass/tree/main/examples/42_ampere_tensorop_group_conv) for SingleGroup mode, which requires that the output channel per group is a multiple of Threadblock tile N. +* [Optimized DepthWise Conv](https://github.com/NVIDIA/cutlass/tree/main/examples/46_depthwise_simt_conv2dfprop/depthwise_simt_conv2dfprop.cu). Two new modes are added + * [kOptimized](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - use direct conv to compute instead of implicit GEMM. * The restrictions are: 1) input ,output channel and group number should be multiple of (128 / sizeof(input element)). 2) The input filter size should be the same as the template parameter configuration. - * [kFixedStrideDilation](./test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - which puts stride and dilation into templates to further improve the performance. In this mode, kernel persistents some inputs into register to squeeze more performance, so large filter/stride/dilation is not recommanded. + * [kFixedStrideDilation](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_direct_conv_fixed_stride_dilation_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) - which puts stride and dilation into templates to further improve the performance. In this mode, the kernel persists some inputs in registers to squeeze more performance, so large filter/stride/dilation is not recommended. * The restrictions are: 1) input, output channel and group number should be multiple of (128 / sizeof(input element)). 2) input filter size, stride, dilation should same as the template parameter configuration. -* [Scripts](./examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM. 
Its implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/). -* [FP8 data type definition](./include/cutlass/float8.h) and [conversion routines](./include/cutlass/numeric_conversion.h#L1274-2115). +* [Scripts](https://github.com/NVIDIA/cutlass/tree/main/examples/44_multi_gemm_ir_and_codegen) to fuse multiple back-to-back GEMM. Its implementation was discussed in a GTC'22 Spring [talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41606/). +* [FP8 data type definition](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/float8.h) and [conversion routines](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/numeric_conversion.h#L1274-2115). * Updates and bugfixes from the community (thanks!). Big shout out to Meta's [xFormers](https://github.com/facebookresearch/xformers). * **Deprecation announcement:** CUTLASS plans to deprecate the following: @@ -302,54 +336,54 @@ * CUDA 10.2 ## [2.10.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.10.0) (2022-08-23) -* [CUTLASS Python](./examples/40_cutlass_py) now supports GEMM, CONV, Group GEMM for different data types as well as different epilogue flavours. -* Optimizations for CUTLASS's [Grouped GEMM](./examples/24_gemm_grouped/gemm_grouped.cu) kernel. Threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](./examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too. -* Optimizations for [GEMM+Softmax](./examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine tune the performance. -* [Grouped GEMM for Multihead Attention](./examples/41_multi_head_attention). This general group gemm based MHA does not require the sequence length of all GEMMs to be the same which makes it most useful for natural language processing. 
-* [GEMM + Layer norm fusion for Ampere](./examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts and both of them can be fused into the GEMMs before and after separately. In addition to use square sum to compute variance of layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided if square sum raise numerical issues. -* [GEMM Epilogue Permutation Fusion](./examples/39_gemm_permute) can apply user provided permutation layout mapping in the GEMM epilogue. -* [Grouped convolution targeting implicit GEMM](test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized. The restrictions are: 1) input and output channel number should be multiple of group number. 2) split-K is not supported. The implementation has 2 modes: +* [CUTLASS Python](https://github.com/NVIDIA/cutlass/tree/main/examples/40_cutlass_py) now supports GEMM, CONV, Group GEMM for different data types as well as different epilogue flavours. +* Optimizations for CUTLASS's [Grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped/gemm_grouped.cu) kernel. Threadblock scheduling part is improved. Some computation can be moved to the host side if applicable. [Grouped Syr2k](https://github.com/NVIDIA/cutlass/tree/main/examples/38_syr2k_grouped/syr2k_grouped.cu) kernels are added, too. +* Optimizations for [GEMM+Softmax](https://github.com/NVIDIA/cutlass/tree/main/examples/35_gemm_softmax). All the reduction computation is fused into the previous GEMM. More template arguments are provided to fine tune the performance. +* [Grouped GEMM for Multihead Attention](https://github.com/NVIDIA/cutlass/tree/main/examples/41_multi_head_attention). 
This general group gemm based MHA does not require the sequence length of all GEMMs to be the same which makes it most useful for natural language processing. +* [GEMM + Layer norm fusion for Ampere](https://github.com/NVIDIA/cutlass/tree/main/examples/37_gemm_layernorm_gemm_fusion/) splits the layernorm into two parts and both of them can be fused into the GEMMs before and after separately. In addition to use square sum to compute variance of layernorm, [Shift-K](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data) is provided if square sum raise numerical issues. +* [GEMM Epilogue Permutation Fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/39_gemm_permute) can apply user provided permutation layout mapping in the GEMM epilogue. +* [Grouped convolution targeting implicit GEMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/group_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) introduces the first group convolution implementation to CUTLASS. It is an Analytical implementation, not an Optimized. The restrictions are: 1) input and output channel number should be multiple of group number. 2) split-K is not supported. The implementation has 2 modes: * kSingleGroup: output channel per group is multiple of Threadblock tile N. * kMultipleGroup: Threadblock tile N is multiple of output channel per group. -* [Depthwise separable convolution](test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution which is also Analytical for now. The restrictions are: 1) SIMT only 2) No split-K 3) input channel equals to output channel equals to group number. -* Standalone [Layernorm](./tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](./tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels. 
-* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be the multiple of Threadblock Tile K dimension. +* [Depthwise separable convolution](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/depthwise_conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_simt_f16_sm60.cu) introduces the first depthwise convolution which is also Analytical for now. The restrictions are: 1) SIMT only 2) No split-K 3) input channel equals to output channel equals to group number. +* Standalone [Layernorm](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util/device_layernorm.h) and [Pooling](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util/device_nhwc_pooling.h) kernels. +* [Back-to-back GEMM/CONV](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) relaxes the requirement that the first GEMM K dimension needs to be the multiple of Threadblock Tile K dimension. * Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) 
## [2.9.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.9.0) (2022-04-21) -* [First layer Convolution kernels](./test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment - * [Few channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities - * [Fixed channels](./include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size - * [Unit tests](./test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) - * [Python-based instance emitter](./python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler +* [First layer Convolution kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) specialized for small channel counts and reduced alignment + * [Few channels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_few_channels.h) specialization for reduced alignment capabilities + * [Fixed channels](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv2d_fprop_activation_tile_access_iterator_fixed_channels.h) further specialized when channel count perfectly matches the access vector size + * [Unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.cu) + * [Python-based instance emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/generator.py) in the CUTLASS Library and support in the Profiler * [BLAS3](https://docs.nvidia.com/cuda/cublas/index.html#cublas-level-3-function-reference) operators 
accelerated by Tensor Cores * Supported types: f32, cf32, f64, cf64, tf32x3, complex tf32x3 - * [HERK](./test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) - * [SYRK](./test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](./python/cutlass_library/rank_k_operation.py) - * [SYMM](./test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/symm_operation.py) - * [TRMM](./test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](./python/cutlass_library/trmm_operation.py) - * [Unit tests](./test/unit/gemm/device/testbed_rank_k_universal.h) -* [CUTLASS Python](./examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) - * [Python-based runtime](./tools/library/scripts/rt.py) interoperable with existing emitters -* [GEMM + Softmax example](./examples/35_gemm_softmax) -* [Gather and Scatter Fusion with GEMM](./examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. 
+ * [HERK](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/her2k_cf32h_cf32n_tensor_op_fast_f32_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/rank_k_operation.py) + * [SYRK](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/syrk_f32n_f32t_tensor_op_fast_f32_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/rank_k_operation.py) + * [SYMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/symm_f32n_f32n_tensor_op_fast_f32_ls_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/symm_operation.py) + * [TRMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/trmm_f32n_f32t_f32t_tensor_op_fast_f32_ls_sm80.cu) with [emitter](https://github.com/NVIDIA/cutlass/tree/main/python/cutlass_library/trmm_operation.py) + * [Unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/testbed_rank_k_universal.h) +* [CUTLASS Python](https://github.com/NVIDIA/cutlass/tree/main/examples/40_cutlass_py) demonstrating JIT compilation of CUTLASS kernels and a Python-based runtime using [CUDA Python](https://developer.nvidia.com/cuda-python) + * [Python-based runtime](https://github.com/NVIDIA/cutlass/tree/main/tools/library/scripts/rt.py) interoperable with existing emitters +* [GEMM + Softmax example](https://github.com/NVIDIA/cutlass/tree/main/examples/35_gemm_softmax) +* [Gather and Scatter Fusion with GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/36_gather_scatter_fusion) can gather inputs and scatters outputs based on indices vectors in the same GEMM kernel. * It can select random rows in a row major matrix. * It can select random columns in a column major matrix. -* [Back-to-back GEMM/CONV](./examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. 
It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. +* [Back-to-back GEMM/CONV](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion) fully supports buffering the first GEMM/CONV results in the shared memory for the latter one to use. It can eliminate register spill when the tile size is big. Additionally, bias vector add is supported in the first GEMM/CONV. * Supported kernels: GEMM and CONV. * Supported types: fp16 and int8. * Supported architectures: Turing and Ampere. -* [Transposed Convolution](./examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. -* [Utility functions](./tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. +* [Transposed Convolution](https://github.com/NVIDIA/cutlass/tree/main/examples/34_transposed_conv2d) (a.k.a Deconvolution) support which reuses Dgrad implementation. +* [Utility functions](https://github.com/NVIDIA/cutlass/tree/main/tools/util/include/cutlass/util) that can pad NHWC and convert between NCHW and NHWC. * [Small alignment implicit gemm](https://github.com/NVIDIA/cutlass/issues/242) support for Fprop/Dgrad/Wgrad so that padding is no longer mandated to use tensor cores in these kernels. * Epilogue enhancement: * Eliminate bank conflicts in int8 tensor core kernels. * Half2 usage if epilogue compute type is fp16. * More activation functions: Silu, Hardswish, Leaky Relu. - * New elementwise fusion pattern for [residual block](./include/cutlass/epilogue/thread/linear_combination_residual_block.h). -* [Group GEMM](./examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. + * New elementwise fusion pattern for [residual block](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_residual_block.h). 
+* [Group GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped) thread block number calculation fix which helps to launch the intended number of threadblocks to fully occupy the GPUs. * [Parallel GEMM splitk](https://github.com/NVIDIA/cutlass/pull/277) support in the CUTLASS profiler. * Optimal performance using [**CUDA 11.6u2**](https://developer.nvidia.com/cuda-downloads) * Updates and bugfixes from the community (thanks!) @@ -359,17 +393,17 @@ * **TF32x3:** emulated single-precision using Tensor Cores * 45+ TFLOPs on NVIDIA A100 - * [GEMM SDK example](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real) - * [COMPLEX GEMM SDK example](./examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu) (complex) - * [Implicit GEMM Convolution SDK example](./examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu) + * [GEMM SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm/27_ampere_3xtf32_fast_accurate_tensorop_gemm.cu) (real) + * [COMPLEX GEMM SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/29_ampere_3xtf32_fast_accurate_tensorop_complex_gemm/29_3xtf32_complex_gemm.cu) (complex) + * [Implicit GEMM Convolution SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/28_ampere_3xtf32_fast_accurate_tensorop_fprop/ampere_3xtf32_fast_accurate_tensorop_fprop.cu) * **Mainloop fusion for Convolution:** convolution with fused per-channel scale-bias-relu - * [Conv Fprop SDK example](./examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu) - * [Conv WGrad SDK example](./examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu) - * [cutlass::conv::device::ImplicitGemmConvolutionFusion](./include/cutlass/conv/device/implicit_gemm_convolution_fusion.h) + * [Conv Fprop SDK 
example](https://github.com/NVIDIA/cutlass/tree/main/examples/25_ampere_fprop_mainloop_fusion/ampere_fprop_mainloop_fusion.cu) + * [Conv WGrad SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/26_ampere_wgrad_mainloop_fusion/ampere_wgrad_mainloop_fusion.cu) + * [cutlass::conv::device::ImplicitGemmConvolutionFusion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/device/implicit_gemm_convolution_fusion.h) * **Grouped GEMM:** similar to batched GEMM with distinct problem size per group - * [SDK example](./examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM - * [cutlass::gemm::device::GemmGrouped](./include/cutlass/gemm/device/gemm_grouped.h) -* [Implicit GEMM Convolution fusion](./examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing. This allows more flexible warp tile sizes and less regsiter pressue. + * [SDK example](https://github.com/NVIDIA/cutlass/tree/main/examples/24_gemm_grouped) with performance comparison with Batched Strided GEMM + * [cutlass::gemm::device::GemmGrouped](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/device/gemm_grouped.h) +* [Implicit GEMM Convolution fusion](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/) supports staging 1st convolution's output accumulator in the shared memory on Turing. This allows more flexible warp tile sizes and less register pressure. * Optimal performance using [**CUDA 11.5**](https://developer.nvidia.com/cuda-downloads) * Updates from the community (thanks!) 
@@ -379,13 +413,13 @@ * CUDA 10.2 ## [2.7.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.7.0) (2021-09-24) - * Mainloop fusion for GEMM: [summation over A or B](./examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) - * [Strided DGRAD (optimized iterators)](./include/cutlass/conv/kernel/default_conv2d_dgrad.h) - * [Half-precision GELU_taylor activation functions](./include/cutlass/epilogue/thread/activation.h#L196) + * Mainloop fusion for GEMM: [summation over A or B](https://github.com/NVIDIA/cutlass/tree/main/examples/23_ampere_gemm_operand_reduction_fusion/ampere_gemm_operand_reduction_fusion.cu) + * [Strided DGRAD (optimized iterators)](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/kernel/default_conv2d_dgrad.h) + * [Half-precision GELU_taylor activation functions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/activation.h#L196) * Use these when accumulation and epilogue compute types are all `cutlass::half_t` - * Tuning and bug fixes to [fused GEMM + GEMM example](./examples/13_two_tensor_op_fusion/) - * Support for smaller than 128b aligned Convolutions: [see examples](test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272) - * Caching of results to accelerate Convolution [unit tests](test/unit/conv/device/cache_testbed_output.h) + * Tuning and bug fixes to [fused GEMM + GEMM example](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/) + * Support for smaller than 128b aligned Convolutions: [see examples](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.cu#L272) + * Caching of results to accelerate Convolution [unit tests](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/cache_testbed_output.h) * Can be enabled or disabled by running `cmake .. 
-DCUTLASS_TEST_ENABLE_CACHED_RESULTS=OFF` * Corrections and bug fixes reported by the CUTLASS community * Thank you for filing these issues! @@ -398,24 +432,24 @@ ## [2.6.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.6.0) (2021-07-22) * Optimal performance when compiled with the [CUDA 11.4 Toolkit](https://developer.nvidia.com/cuda-toolkit) - * Adopt the new L2 prefetch feature in [cp.async](./include/cutlass/arch/memory.h) and [global load](./include/cutlass/arch/memory_sm80.h) + * Adopt the new L2 prefetch feature in [cp.async](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/memory.h) and [global load](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/memory_sm80.h) * Fused operators with GEMM and Convolution - * [Fused broadcast in epilogue](test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) - * [Fused partial reduction in epilogue](./test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) + * [Fused broadcast in epilogue](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_with_broadcast_f16n_f16n_f16n_tensorop_f32_sm75.cu) + * [Fused partial reduction in epilogue](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_with_reduction_f16n_f16n_f16n_tensorop_f32_sm75.cu) * 64b tensor strides and leading dimensions support for GEMMs * Affine rank=2 matrix layouts - * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](./include/cutlass/layout/matrix.h) - * Support [FP64 tensor core](./examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. 
- * [Batched GEMV](./test/unit/gemm/device/gemv.cu) preview implementation - * [New strided Dgrad](test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation + * Row stride and column stride for matrices using [cutlass::layout::AffineRank2](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/layout/matrix.h) + * Support [FP64 tensor core](https://github.com/NVIDIA/cutlass/tree/main/examples/18_ampere_fp64_tensorop_affine2_gemm/ampere_fp64_tensorop_affine2_gemm.cu) and SIMT GEMM. + * [Batched GEMV](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemv.cu) preview implementation + * [New strided Dgrad](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.cu) implementation * Accelerates over previous implementation by cutting down redundant math by 4x * Support using new `Dy` and `w` analytic iterators and existing `cutlass::conv::device::ImplicitGemmConvolution` interface * Quaternion-valued GEMM and Convolution in single- and double-precision (targeting CUDA Cores) - * Updates to [quaternion.h](./include/cutlass/quaternion.h) and [functional.h](./include/cutlass/functional.h) - * SDK Example for [GEMM](./examples/21_quaternion_gemm/quaternion_gemm.cu) and [Convolution](./examples/22_quaternion_conv/quaternion_conv.cu) - * [Unit tests for GEMM](./test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](./test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) + * Updates to [quaternion.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/quaternion.h) and [functional.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/functional.h) + * SDK Example for [GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/21_quaternion_gemm/quaternion_gemm.cu) and 
[Convolution](https://github.com/NVIDIA/cutlass/tree/main/examples/22_quaternion_conv/quaternion_conv.cu) + * [Unit tests for GEMM](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/simt_qgemm_nn_sm50.cu) and [Convolution](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv2d_fprop_implicit_gemm_qf32nhwc_qf32nhwc_qf32nhwc_simt_f32_sm50.cu) * Many improvements to the epilogue. - * Provide an [option](./include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations + * Provide an [option](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/threadblock/epilogue.h) to not fully unroll the epilogue to reduce the code size and improve the performance when using complicated elementwise operations * Performance improvement for FP16 tensor core kernels * Bug fixes * Enhanced Clang support and the combination of Clang 13 and CUDA 11.4 can build and run kernels from Pascal and Ampere. 
@@ -427,14 +461,14 @@ ## [2.5.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.5.0) (2021-02-26) * Tensor reductions * _m_-to-_n_ reductions of tensors with affine layout - * [Specializations](./test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension - * [Specializations](./test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension + * [Specializations](https://github.com/NVIDIA/cutlass/tree/main/test/unit/reduction/device/tensor_reduce_contiguous.cu) for reductions including contiguous dimension + * [Specializations](https://github.com/NVIDIA/cutlass/tree/main/test/unit/reduction/device/tensor_reduce_strided.cu) for reductions excluding contiguous dimension * Custom reduction functors such as `cutlass::logical_and` * Large tensor support, up to 2^63 elements (however, each dimension is limited to an extent of 2^31) * Optimizations for 3-D convolution - * [Optimized tile iterators](./include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution - * Full coverage of [forward](test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and [backwards](test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution - * [Fused Convolution+Convolution example](./examples/13_two_tensor_op_fusion/README.md) + * [Optimized tile iterators](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/conv/threadblock/conv3d_fprop_activation_tile_access_iterator_optimized.h) using precomputed delta table for 3-D convolution + * Full coverage of [forward](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv3d_fprop_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) and 
[backwards](https://github.com/NVIDIA/cutlass/tree/main/test/unit/conv/device/conv3d_dgrad_implicit_gemm_f16ndhwc_f16ndhwc_f32ndhwc_tensor_op_f32_sm80.cu) passes for 3D convolution + * [Fused Convolution+Convolution example](https://github.com/NVIDIA/cutlass/tree/main/examples/13_two_tensor_op_fusion/README.md) * Corrections and bug fixes reported by the CUTLASS community * Thank you for filing these issues! @@ -453,16 +487,16 @@ ## [2.3.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.3.0) (2020-09-23) * [NVIDIA Ampere Architecture features](https://devblogs.nvidia.com/nvidia-ampere-architecture-in-depth/) - * [Sparse Tensor Core GEMM kernels](test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu): + * [Sparse Tensor Core GEMM kernels](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/gemm_f16n_f16n_f32t_tensor_op_f32_sparse_sm80.cu): * Direct access to Sparse Tensor Cores and maximum performance via [`mma.sp.sync`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma-and-friends) * Fast SGEMM targeting GeForce RTX 30-series CUDA Cores * Minor Features: - * [Activation functions](./include/cutlass/epilogue/thread/activation.h) such as [GeLU](./include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](./include/cutlass/epilogue/thread/linear_combination_sigmoid.h) - * Small [matrix](./include/cutlass/matrix.h) and [quaternion](./include/cutlass/quaternion.h) template classes in device code - * [Floating-point constants](./include/cutlass/constants.h) + * [Activation functions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/activation.h) such as [GeLU](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_gelu.h) and [Sigmoid](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/thread/linear_combination_sigmoid.h) + * Small 
[matrix](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/matrix.h) and [quaternion](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/quaternion.h) template classes in device code + * [Floating-point constants](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/constants.h) * NVIDIA Ampere GPU Architecture examples and documentation: - * [Tensor Float 32](./examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and - * [Sparse Tensor Cores](./examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) + * [Tensor Float 32](https://github.com/NVIDIA/cutlass/tree/main/examples/14_ampere_tf32_tensorop_gemm/ampere_tf32_tensorop_gemm.cu) and + * [Sparse Tensor Cores](https://github.com/NVIDIA/cutlass/tree/main/examples/15_ampere_sparse_tensorop_gemm/ampere_sparse_tensorop_gemm.cu) * Documentation added on CUTLASS [efficient row-major epilogue](./media/docs/cpp/gemm_api.md#efficient-epilogue) ## [2.2.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.2.0) (2020-06-08) @@ -487,7 +521,7 @@ * API to launch compiled kernel instances for GEMM and planar complex GEMM * Planar Complex GEMM kernels targeting Volta and Turing Tensor Cores * Computes complex matrix products on matrices stored as disjoint real and imaginary parts - * [SDK Examples of Planar Complex GEMMs](./examples/10_planar_complex/planar_complex.cu) + * [SDK Examples of Planar Complex GEMMs](https://github.com/NVIDIA/cutlass/tree/main/examples/10_planar_complex/planar_complex.cu) * Minor enhancements and bug fixes ## [2.0.0](https://github.com/NVIDIA/cutlass/releases/tag/v2.0.0) (2019-11-19) diff --git a/CMakeLists.txt b/CMakeLists.txt index b54b8335..f141fd40 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,6 +178,10 @@ if (CUDA_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100 100a 101 101a 120 120a) endif() +if (CUDA_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 100f 101f 120f) 
+endif() + set(CUTLASS_NVCC_ARCHS ${CUTLASS_NVCC_ARCHS_SUPPORTED} CACHE STRING "The SM architectures requested.") set(CUTLASS_NVCC_ARCHS_ENABLED ${CUTLASS_NVCC_ARCHS} CACHE STRING "The SM architectures to build code for.") @@ -676,25 +680,6 @@ if (NOT CUTLASS_NAMESPACE STREQUAL "cutlass") target_compile_definitions(CUTLASS INTERFACE CUTLASS_NAMESPACE=${CUTLASS_NAMESPACE}) endif() -if (NOT DEFINED CUTLASS_REVISION) - - find_package(Git QUIET) - - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD - RESULT_VARIABLE CUTLASS_REVISION_RESULT - OUTPUT_VARIABLE CUTLASS_REVISION - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - - if (CUTLASS_REVISION_RESULT) - message(STATUS "CUTLASS Revision: Unable to detect, Git returned code ${CUTLASS_REVISION_RESULT}.") - else() - message(STATUS "CUTLASS Revision: ${CUTLASS_REVISION}") - endif() - -endif() - configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/version_extended.h.in ${CMAKE_CURRENT_BINARY_DIR}/include/cutlass/version_extended.h diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 46506007..f6ef0f50 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,7 +2,7 @@ [README](./README.md#documentation) > **Contributors** -# CUTLASS Developers ** +# CUTLASS C++ Developers ** Andrew Kerr
Paul Springer
@@ -70,8 +70,49 @@ Shreya Gaur
** _The list is sorted in order of the author's first contribution to the CUTLASS project._ +# CUTLASS DSL Developers *** -# CUTE Developers +Albert Di
+Albert Xu
+Anakin Zheng
+Arvin Jou
+Brandon Sun
+Chenyang Xu
+Chunyu Wang
+Cris Cecka
+dePaul Miller
+Edward Cao
+Fung Xie
+Guray Ozen
+Hao Hu
+Hong Wang
+Jeremy Furtek
+Jie Fang
+JingZe Cui
+Kihiro Bando
+Linfeng Zheng
+Longsheng Du
+Mina Sun
+Mindy Li
+Pradeep Ramani
+Questa Wang
+Serif Yesil
+Tao Xie
+Tina Li
+Vicki Wang
+Vincent Zhang
+Vijay Thakkar
+Xiao Dong
+Xiaolei Shi
+Xinyu Wang
+Yihan Chen
+Yuhan Li
+Zekun Fan
+ +*** _Sorted in alphabetical order._ + + +# CuTe Developers Cris Cecka
Vijay Thakkar
@@ -100,6 +141,9 @@ David Tanner
Tri Dao
Jay Shah
+Mehdi Amini
+Larry Wu
+Justin Holewinski
Timothy Costa
Julien Demouth
Brian Fahs
@@ -108,14 +152,11 @@ Michael Goldfarb
Mostafa Hagog
Fei Hu
Alan Kaatz
-Tina Li
Wei Liu
Tim Martin
Kevin Siu
Markus Tavenrath
John Tran
-Vicki Wang
-Fung Xie
Yang Xu
Scott Yokim
Girish Bharambe
diff --git a/EULA.txt b/EULA.txt new file mode 100644 index 00000000..e7699599 --- /dev/null +++ b/EULA.txt @@ -0,0 +1,188 @@ +NVIDIA Software License Agreement + +IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE +This software license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity, (“you”) and NVIDIA Corporation (“NVIDIA”) and governs the use of the NVIDIA CUTLASS DSLs software and materials that NVIDIA delivers to you under this Agreement (“Software”). +NVIDIA and you are each a “party” and collectively the “parties.” +This Agreement can be accepted only by an adult of legal age of majority in the country in which the Software is used. +If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the Software. + +1. License Grants + + 1.1. License Grant to You. The Software made available by NVIDIA to you is licensed, not sold. + Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, and non-sublicensable (except as expressly granted in this Agreement), license to: + + a. install and use copies of the Software, + b. configure the Software using configuration files provided (if applicable), + c. modify and create derivative works of any sample or example source code NVIDIA delivers to you as part of the Software (“Derivatives”) (if applicable), and + d. distribute python files in the Software package in source format as incorporated into a software application subject to the following distribution requirements: + + i. Your application must have material additional functionality, beyond the included portions of the Software. + ii. The distributable portions of the Software shall only be accessed by your application. + iii. 
The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + iv. Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + v. The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. + vi. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + + The foregoing (a) through (d) are, collectively, the “Purpose”, and the developed applications are only for use in systems with NVIDIA GPUs. + + 1.2. License Grant to NVIDIA. Subject to the terms of this Agreement, you grant NVIDIA and its affiliates a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit at NVIDIA’s discretion any Derivatives created by or for you. + You may, but are not required to, deliver any Derivatives to NVIDIA. + +2. License Restrictions + + Your license to use the Software and Derivatives is restricted as stated in this Section 2 (“License Restrictions”). + You will cooperate with NVIDIA and, upon NVIDIA’s written request, you will confirm in writing and provide reasonably requested information to verify your compliance with the terms of this Agreement. + You may not: + + 2.1. Use the Software or Derivatives for any purpose other than the Purpose; + + 2.2. 
Sell, rent, sublicense, transfer, distribute or otherwise make available to others (except authorized users as stated in Section 3 (“Authorized Users”)) any portion of the Software or Derivatives, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.3. Reverse engineer, decompile, or disassemble the Software components provided in binary form, nor attempt in any other manner to obtain source code of such Software; + + 2.4. Modify or create derivative works of the Software, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.5. Change or remove copyright or other proprietary notices in the Software; + + 2.6. Bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the Software; + + 2.7. Use the Software or Derivatives in any manner that would cause them to become subject to an open source software license, subject to the terms in Section 6 (“Components Under Other Licenses”); + + 2.8. Use the Software or Derivatives in violation of any applicable law or regulation in relevant jurisdictions + + 2.9. Indicate that a product or service developed with the Software or Derivatives is sponsored or endorsed by NVIDIA; + + 2.10. Replace any NVIDIA software components in the Software that are governed by this Agreement with other software that implements NVIDIA APIs; + + 2.11. Reverse engineer, decompile or disassemble any portion of the output generated using Software elements for the purpose of translating such output artifacts to target a non-NVIDIA platform; or + +3. Authorized Users + + You may allow employees and contractors of your entity or of your subsidiary(ies), and for educational institutions also enrolled students, to internally access and use the Software as authorized by this Agreement from your secure network to perform the work authorized by this Agreement on your behalf. 
+ You are responsible for the compliance with the terms of this Agreement by your authorized users. + Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users. + +4. Pre-Release + + Software versions identified as alpha, beta, preview, early access or otherwise as pre-release (“Pre-Release”) may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability and reliability standards relative to NVIDIA commercial offerings. + You use Pre-Release Software at your own risk. NVIDIA did not design or test the Software for use in production or business-critical systems. + NVIDIA may choose not to make available a commercial version of Pre-Release Software. + NVIDIA may also choose to abandon development and terminate the availability of Pre-Release Software at any time without liability. + +5. Updates + + NVIDIA may at any time and at its option, change, discontinue, or deprecate any part, or all, of the Software, or change or remove features or functionality, or make available patches, workarounds or other updates to the Software. + Unless the updates are provided with their separate governing terms, they are deemed part of the Software licensed to you under this Agreement, and your continued use of the Software is deemed acceptance of such changes. + +6. Components Under Other Licenses + + The Software may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms (“Other Licenses”). 
+ The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; + except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. + Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org). + +7. Ownership + + 7.1. NVIDIA Ownership. The Software, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. + Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Software, and (b) no other license or right is granted to you by implication, estoppel or otherwise. + + 7.2. Your Ownership. Subject to the rights of NVIDIA and its suppliers in the Software, which continue to be licensed as stated in this Agreement, even when incorporated in your products or services, and the extent permitted by applicable law, as between you and NVIDIA, you hold all rights, title and interest in and to your products, services and Derivatives you develop as permitted in this Agreement including their respective intellectual property rights. + +8. Feedback + + You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Software (collectively, “Feedback”). + Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. 
+ If you provide Feedback, you grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. + +9. Termination + + 9.1. Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Software. + Additionally, either party may terminate this Agreement at any time with thirty (30) days’ advance written notice to the other party. + + 9.2. Effect of Termination. Upon any expiration or termination of this Agreement, you will promptly (a) stop using and return, delete or destroy NVIDIA confidential information and all Software received under this Agreement, and (b) delete or destroy Derivatives created under this Agreement, unless an authorized NVIDIA representative provides prior written approval that you may keep a copy of the Derivatives solely for archival purposes. + Upon written request, you will certify in writing that you have complied with your obligations under this Section 9.2 (“Effect of Termination”). + + 9.3. Survival. 
Section 1.2 (“License Grant to NVIDIA”), Section 5 (“Updates”), Section 6 (“Components Under Other Licenses”), Section 7 (“Ownership”), Section 8 (“Feedback), Section 9.2 (“Effect of Termination”), Section 9.3 (“Survival”), Section 10 (“Disclaimer of Warranties”), Section 11 (“Limitation of Liability”), Section 12 (“Use in Mission Critical Applications”), Section 13 (“Governing Law and Jurisdiction”), Section 14 (“Indemnity”) and Section 15 (“General”) will survive any expiration or termination of this Agreement. + +10. Disclaimer of Warranties + + THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER + EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. NVIDIA DOES NOT WARRANT OR ASSUME RESPONSIBILITY FOR THE ACCURACY OR COMPLETENESS OF ANY THIRD-PARTY INFORMATION, TEXT, GRAPHICS, LINKS CONTAINED IN THE SOFTWARE. + WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS, ANY DEFECTS OR ERRORS WILL BE CORRECTED, ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT. + NVIDIA does not warrant or assume responsibility for the accuracy or completeness of any third-party information, text, graphics or links contained in the Software. + +11. Limitations of Liability + + 11.1. EXCLUSIONS. 
TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (ii) DAMAGES FOR (a) THE COST OF PROCURING SUBSTITUTE GOODS, OR (b) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY’S REMEDIES FAIL THEIR ESSENTIAL PURPOSE. + + 11.2. DAMAGES CAP. ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5). + +12. Use in Mission Critical Applications + + You acknowledge that the Software provided under this Agreement is not designed or tested by NVIDIA for use in any system or application where the use or failure of such system or application developed with NVIDIA’s Software could result in injury, death or catastrophic damage (each, a “Mission Critical Application”). + Examples of Mission Critical Applications include use in avionics, navigation, autonomous vehicle applications, AI solutions for automotive products, military, medical, life support or other mission-critical or life-critical applications. + NVIDIA will not be liable to you or any third party, in whole or in part, for any claims or damages arising from these uses. + You are solely responsible for ensuring that systems and applications developed with the Software include sufficient safety and redundancy features and comply with all applicable legal and regulatory standards and requirements. + +13. 
Governing Law and Jurisdiction + + This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. + The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; + except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +14. Indemnity + + By using the Software you agree to defend, indemnify and hold harmless NVIDIA and its affiliates and their respective officers, directors, employees and agents from and against any claims, disputes, demands, liabilities, damages, losses, costs and expenses arising out of or in any way connected with (i) products or services that have been developed or deployed with or use the Software, or claims that they violate laws, or infringe, violate, or misappropriate any third party right; + or (ii) use of the Software in breach of the terms of this Agreement. + +15. General + + 15.1. Independent Contractors. + The parties are independent contractors, and this Agreement does not create a joint venture, partnership, agency, or other form of business association between the parties. + Neither party will have the power to bind the other party or incur any obligation on its behalf without the other party’s prior written consent. + Nothing in this Agreement prevents either party from participating in similar arrangements with third parties. + + 15.2. No Assignment. + NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. 
+ You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. + + 15.3. No Waiver. + No failure or delay by a party to enforce any term or obligation of this Agreement will operate as a waiver by that party, or prevent the enforcement of such term or obligation later. + + 15.4. Trade Compliance. + You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. + You confirm (a) your understanding that export or reexport of certain NVIDIA products or technologies may require a license or other approval from appropriate authorities and (b) that you will not export or reexport any products or technology, directly or indirectly, without first obtaining any required license or other approval from appropriate authorities, (i) to any countries that are subject to any U.S. or local export restrictions (currently including, but not necessarily limited to, Belarus, Cuba, Iran, North Korea, Russia, Syria, the Region of Crimea, Donetsk People’s Republic Region and Luhansk People’s Republic Region); + (ii) to any end-user who you know or have reason to know will utilize them in the design, development or production of nuclear, chemical or biological weapons, missiles, rocket systems, unmanned air vehicles capable of a maximum range of at least 300 kilometers, regardless of payload, or intended for military end-use, or any weapons of mass destruction; + (iii) to any end-user who has been prohibited from participating in the U.S. or local export transactions by any governing authority; + or (iv) to any known military or military-intelligence end-user or for any known military or military-intelligence end-use in accordance with U.S. 
trade compliance laws and regulations. + + 15.5. Government Rights. + The Software, documentation and technology (“Protected Items”) are “Commercial products” as this term is defined at 48 C.F.R. + 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. + 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; + (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of the Agreement; + and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. + In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. + 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing. + + 15.6. Notices. + Please direct your legal notices or other correspondence to legalnotices@nvidia.com with a copy mailed to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + If NVIDIA needs to contact you, you consent to receive the notices by email and agree that such notices will satisfy any legal communication requirements. + + 15.7. Severability. + If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. + + 15.8. Amendment. + Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. + + 15.9. 
Construction. + The headings in the Agreement are included solely for convenience and are not intended to affect the meaning or interpretation of the Agreement. + As required by the context of the Agreement, the singular of a term includes the plural and vice versa. + + 15.10. Force Majeure. + Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts. + + 15.11. Entire Agreement. + Regarding the subject matter of this Agreement, the parties agree that (a) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (b) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding and are null and void. + +(v. May 8, 2025) diff --git a/LICENSE.txt b/LICENSE.txt index 47016fa7..e08eb49c 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -25,3 +25,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Certain files within this repository are subject to separate licensing terms: + +- The files located in the `python/CuTeDSL` directory are licensed under the + NVIDIA End User License Agreement (EULA). Please refer to + https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html + for the full terms. 
diff --git a/README.md b/README.md index 26ec3abd..667eb73d 100644 --- a/README.md +++ b/README.md @@ -1,86 +1,80 @@ ![ALT](./media/images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") +# Overview -# CUTLASS 3.9.2 +# CUTLASS 4.0.0 -_CUTLASS 3.9.2 - May 2025_ +_CUTLASS 4.0.0 - May 2025_ -CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-matrix multiplication (GEMM) and related computations at all levels -and scales within CUDA. It incorporates strategies for hierarchical decomposition and -data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes -these "moving parts" into reusable, modular software components abstracted by C++ template -classes. Primitives for different levels of a conceptual parallelization hierarchy -can be specialized and tuned via custom tiling sizes, data types, -and other algorithmic policy. The resulting flexibility simplifies their use -as building blocks within custom kernels and applications. +CUTLASS is a collection of abstractions for implementing high-performance matrix-matrix multiplication (GEMM) +and related computations at all levels and scales within CUDA. It incorporates strategies for +hierarchical decomposition and data movement. CUTLASS decomposes these "moving parts" into reusable, modular +software components and abstractions. -To support a wide variety of applications, CUTLASS provides extensive support for -mixed-precision computations, providing specialized data-movement and +Primitives for different levels of a conceptual parallelization hierarchy can be specialized and tuned +via custom tiling sizes, data types, and other algorithmic policy. The resulting flexibility simplifies +their use as building blocks within custom kernels and applications. 
+ +CUTLASS has been providing CUDA C++ template abstractions for high-performance linear algebra since 2017 and +these abstractions provide extensive support for a wide range of computations including +mixed-precision computations, specialized data-movement (async copy) and multiply-accumulate abstractions for FP64, FP32, TF32, FP16, BF16, -[FP32 emulation via tensor core instruction](./examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), +[FP32 emulation via tensor core instruction](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), 8b floating point types (e5m2 and e4m3), block scaled data types (NVIDIA NVFP4 and OCP standard MXFP4, MXFP6, MXFP8), narrow integer types (4 and 8b signed and unsigned integers), and binary 1b data types (where architectures allow for the -native support of such data types). -CUTLASS demonstrates optimal matrix multiply operations +native support of such data types) across NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. + +To this rich ecosystem of C++ based kernel programming abstractions, CUTLASS 4 adds CUTLASS DSLs. These are Python native interfaces for writing high-performance CUDA kernels based on core CUTLASS and CuTe concepts without any performance compromises. This allows for a much smoother learning curve, orders of magnitude faster compile times, native integration with DL frameworks without writing glue code, and much more intuitive metaprogramming that does not require deep C++ expertise. + +Overall we envision CUTLASS DSLs as a family of domain-specific languages (DSLs). With the release of 4.0, we are releasing the first of these in CuTe DSL. This is a low level programming model that is fully consistent with CuTe C++ abstractions — exposing core concepts such as layouts, tensors, hardware atoms, and full control over the hardware thread and data hierarchy. 
+ +CuTe DSL demonstrates optimal matrix multiply and other linear algebra operations targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. +NVIDIA's Ampere, Hopper, and Blackwell architectures. -In addition to GEMMs, CUTLASS implements high-performance convolution via -the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution -operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components. +We believe it will become an indispensable tool for students, researchers, and performance +engineers alike — flattening the learning curve of GPU programming, rapidly prototyping kernel +designs, and bringing optimized solutions into production. -See the [Quick Start Guide](./media/docs/cpp/quickstart.md) to get started quickly. +CuTe DSL is currently in public beta and will graduate out of beta by end of summer 2025. -See the [functionality docs](./media/docs/cpp/functionality.md) for a more comprehensive -list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU -architecture. +To get started quickly, please refer to: + - [CUTLASS C++ Quick Start Guide](./media/docs/cpp/quickstart.md). + - [CuTe DSL Quick Start Guide](./media/docs/pythonDSL/quick_start.rst). 
-# What's New in CUTLASS 3.9 +# What's New in CUTLASS 4.0 -* Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](./include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](./include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](./include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](./include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](./include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). -* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](./examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](./examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). - - [Grouped GEMM with nvfp4 datatype](./examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu). - - [Sparse Blockscaled GEMM with mxfp8 input datatype and BF16 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80a_blackwell_geforce_mxfp8_bf16_sparse_gemm.cu). - - [Sparse Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor](./examples/80_blackwell_geforce_sparse_gemm/80b_blackwell_geforce_nvfp4_nvfp4_sparse_gemm.cu). 
-* Set of unit tests that demonstrate the usage of both [sparse](./test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](./test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. -* Support for Blackwell SM100 Sparse kernels: - - Collective mainloop that target for - * [SM100 Sparse GEMM](./include/cutlass/gemm/collective/sm100_sparse_mma_warpspecialized.hpp) -* Set of example that demonstrate the usage of the 3.x API for targeting Blackwell SM100 Sparse GEMM: - - [Sparse GEMM](./examples/83_blackwell_sparse_gemm/83_blackwell_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with NVFP4 input data type](./examples/84_blackwell_narrow_precision_sparse_gemm/84a_blackwell_nvfp4_bf16_sparse_gemm.cu) - - [Blockscaled Sparse GEMM with mixed input data type (MXFP8 and MXFP4)](./examples/84_blackwell_narrow_precision_sparse_gemm/84b_blackwell_mixed_mxfp8_bf16_sparse_gemm.cu) -* Set of unit tests that demonstrate the usage of [sparse](./test/unit/gemm/device/sm100_sparse_tensorop_gemm) and [blockscaled sparse](./test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm) Blackwell SM100 GEMM. -* A new Multi-head Latent Attention (MLA) for SM100 Blackwell architecture in CUTLASS [example](./examples/77_blackwell_fmha/) covers the flashMLA-like weight-absorbed decoding use-case. -* A new FMHA Backward kernel for SM100 Blackwell architecture extends CUTLASS [example](./examples/77_blackwell_fmha/) to show how the five backward pass MMAs can be fused into a single kernel to achieve high performance. -* A new [distributed GEMM example](./examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu) for SM100 Blackwell architecture. 
-* Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](./examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](./examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [grouped-wise GEMM](./tools/profiler/src/blockwise_gemm_operation_profiler.cu) in CUTLASS profiler. - - Support for [blockwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](./examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](./examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. -* Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. - - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration. 
- - More detailed introductions and examples to leverage this feature can be found in [profiler.md](./media/docs/cpp/profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss). -* Support `void` as the D element in sm100 kernel epilogues. +## CuTe DSL +* CuTe DSL, a Python DSL centered around CuTe's abstractions + - [Core DSL implementation files](https://github.com/NVIDIA/cutlass/tree/main/python/CuTeDSL) + - [DSL Quick Start](./media/docs/pythonDSL/quick_start.rst) + - [DSL Overview](./media/docs/pythonDSL/overview.rst) +* [Overhauled documentation with a new dedicated website](https://docs.nvidia.com/cutlass) +* Set of examples demonstrating how to use CuTe DSL to write peak-performance kernels + - [Blackwell persistent dense GEMM with static scheduling](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py) + - [Blackwell grouped GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/grouped_gemm.py) + - [Blackwell fused multi-head attention forward pass](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/blackwell/fmha.py) + - [Ampere GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/tensorop_gemm.py) + - [FlashAttention-2 implementation targeting Ampere and Ada class GPUs (SM80, SM86, SM89)](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/ampere/flash_attention_v2.py) +* [Educational notebooks for getting started with CuTe DSL](https://github.com/NVIDIA/cutlass/tree/main/examples/python/CuTeDSL/notebooks) -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. 
+## CUTLASS C++ +* Support [Family Specific Architecture Features](https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/) which was introduced in CUDA 12.9 + - 100f, 101f, 120f were added to support Family Specific Architecture Features which allows running the same binary on different chips belonging to the same Family (e.g. sm100) without recompiling. +* Instruction shapes and redundant accumulation type have been removed from CUTLASS 3.x-style library kernel names to disambiguate kernels and shorten names. + - For example: + `(old) cutlass3x_sm90_tensorop_s64x128x16gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + `(new) cutlass3x_sm90_tensorop_gemm_bf16_bf16_f32_bf16_bf16_128x256x64_1x1x1_0_tnn_align8_warpspecialized_cooperative_epi_tma` + - If you are using the CUTLASS library kernel names directly (e.g. to compile a subset of the CUTLASS library with `-DCUTLASS_LIBRARY_KERNELS`, filter kernels in the CUTLASS profiler with `--kernels`), please update your uses accordingly; this is a breaking change. +* Further improved [Blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) and [Groupwise](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) GEMMs on Hopper and Blackwell. + - Added non-power-of-two tile sizes. + - Improved performance for K-major scale factors. + - The argument `mma_promotion_interval` has been removed from non-grouped GEMM to align with the grouped and Blackwell versions. +* Various improvements and fixes from the community and CUTLASS team. Thanks to everyone who submitted PRs! +* Optimal code generation with CUDA toolkit version 12.9. 
+ +Note: CUTLASS 4.x builds are known to be down on Windows platforms for all CUDA toolkits. CUTLASS team is working on a fix. **See the [CHANGELOG](CHANGELOG.md) for details of all past releases and updates.** @@ -89,21 +83,21 @@ CUTLASS team is working on a fix. CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, they exhibit nearly optimal utilization of peak theoretical throughput. The figure below -shows CUTLASS 3.8's performance as a % of theoretical peak utilization +shows CUTLASS 3.8's performance as a % of theoretical peak utilization on various input and output data types when run on NVIDIA Blackwell SM100 architecture GPU. -

+![ALT](media/images/cutlass-3.8-blackwell-gemm-peak-performance.svg "") -The two figures below show the continual CUTLASS performance improvements +The two figures below show the continual CUTLASS performance improvements on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture) since CUTLASS 3.1. -CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). -Tensor Core operations are implemented using CUDA's +CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). +Tensor Core operations are implemented using CUDA's [mma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma) and [wgmma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) instructions. -

-

+![ALT](media/images/cutlass-3.5.1-gemm-peak-performance.png "") +![ALT](media/images/cutlass-3.5.1-gemm-peak-performance-fp8.png "") # CuTe @@ -135,7 +129,7 @@ Minimum requirements: - Compiler: Must support at least C++17 - CUDA Toolkit version: 11.4 -CUTLASS requires a C++17 host compiler and +CUTLASS requires a C++17 host compiler and performs best when built with the [**CUDA 12.8 Toolkit**](https://developer.nvidia.com/cuda-downloads). It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, and all other CUDA 12.x versions. @@ -196,17 +190,17 @@ the kernel is expected to fail with a runtime error. ``` cmake .. -DCUTLASS_NVCC_ARCHS="90a" ``` -Or +Or ``` -cmake .. -DCUTLASS_NVCC_ARCHS="100a" +cmake .. -DCUTLASS_NVCC_ARCHS="100a" ``` -Note: The NVIDIA Blackwell SM100 architecture used in the datacenter -products has a different compute capability than the one underpinning -NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels -compiled for Blackwell SM100 architecture with arch conditional features -(using `sm100a`) are not compatible with RTX 50 series GPUs. +Note: The NVIDIA Blackwell SM100 architecture used in the datacenter +products has a different compute capability than the one underpinning +NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels +compiled for Blackwell SM100 architecture with arch conditional features +(using `sm100a`) are not compatible with RTX 50 series GPUs. Please refer to the [functionality documentation](./media/docs/cpp/functionality.md) for details on which kernels require which target architectures. 
@@ -231,7 +225,7 @@ CUTLASS is described in the following documents and the accompanying - [Tile Iterators](./media/docs/cpp/tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory - [CUTLASS Profiler](./media/docs/cpp/profiler.md) - command-line driven profiling application - [CUTLASS Utilities](./media/docs/cpp/utilities.md) - additional templates used to facilitate rapid development -- [Dependent kernel launch](./media/docs/cpp/dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent +- [Dependent kernel launch](./media/docs/cpp/dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent kernels in the same stream, and how it is used in CUTLASS. # Resources @@ -291,11 +285,11 @@ All tests should pass on supported platforms, though the exact number of tests m # Project Structure -CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. -[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, +CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. +[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, and template concepts defined in the CUTLASS project. -A detailed explanation of the source code organization may be found in the +A detailed explanation of the source code organization may be found in the [CUTLASS documentation](./media/docs/cpp/code_organization.md), but several main components are summarized below. 
## CUTLASS Template Library @@ -320,7 +314,7 @@ include/ # client applications should target this directory reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model thread/ # simt code that can be performed within a CUDA thread - + transform/ # code specialized for layout, type, and domain transformations * # core vocabulary types, containers, and basic numeric operations @@ -345,7 +339,7 @@ include/ # client applications should target this directory ### CUTLASS SDK Examples -[CUTLASS SDK examples](./examples) apply CUTLASS templates to implement basic computations. +[CUTLASS SDK examples](https://github.com/NVIDIA/cutlass/tree/main/examples) apply CUTLASS templates to implement basic computations. ### Tools @@ -358,7 +352,7 @@ tools/ profiler/ # CUTLASS Profiler - command-line utility for executing operations in the # CUTLASS Library - + util/ # CUTLASS Utilities - contains numerous helper classes for include/ # manging tensors in device memory, reference cutlass/ # implementations for GEMM, random initialization @@ -384,7 +378,7 @@ $ make cutlass_profiler -j16 By default, only one tile size is instantiated for each data type, math instruction, and layout. To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. -Beware, this results in *tens of thousands* of kernels and long build times. +Beware, this results in *tens of thousands* of kernels and long build times. This would also result in a large binary size and on some platforms linker to fail on building the library. Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below. 
```bash @@ -395,13 +389,13 @@ $ make cutlass_profiler -j16 ## Building a subset of GEMM and Convolution kernels (_reduced_ build times) -To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with +To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with wildcard characters may be used to reduce the set of kernels. The following examples show building exactly one or a subset of kernels for NVIDIA Ampere and Turing architecture: ### Building a subset Tensor Core GEMM kernels -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, +To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 @@ -490,7 +484,7 @@ $ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 ### Building a subset of Tensor Core Convolution kernels -To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation +To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. 
-DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 @@ -538,7 +532,7 @@ reference_device: Passed ### Building one Convolution CUDA kernel -To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation +To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: ```bash $ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc @@ -586,14 +580,14 @@ reference_device: Passed ## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler - Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - - [GEMM CMake Examples](./media/docs/cpp/quickstart.md#gemm-cmake-examples) + - [GEMM CMake Examples](./media/docs/cpp/quickstart.md#gemm-cmake-examples) - [Implicit GEMM convolution CMake Examples](./media/docs/cpp/quickstart.md#convolution-cmake-examples) - [Further details about the CUTLASS Profiler are described here.](./media/docs/cpp/profiler.md) # About -CUTLASS is released by NVIDIA Corporation as Open Source software under the +CUTLASS is released by NVIDIA Corporation as Open Source software under the [3-clause "New" BSD license](LICENSE.txt). 
# Contributors diff --git a/customConfigs.cmake b/customConfigs.cmake index d98fe6c5..a7342044 100644 --- a/customConfigs.cmake +++ b/customConfigs.cmake @@ -36,7 +36,7 @@ set(CUTLASS_PROFILER_REGRESSION_TEST_LEVEL ${CUTLASS_TEST_LEVEL} CACHE STRING " find_package(Python3 3.5 COMPONENTS Interpreter REQUIRED) -function(cutlass_generate_kernel_filter_and_testlists_files) +function(cutlass_generate_kernel_filter_and_testlist_files) set(options) set(oneValueArgs TEST_SET_NAME) @@ -59,30 +59,30 @@ function(cutlass_generate_kernel_filter_and_testlists_files) ) if(NOT cutlass_FILTER_GENERATION_RESULT EQUAL 0) - message(FATAL_ERROR "Error generating kernel filters and testlists files. See ${CMAKE_CURRENT_BINARY_DIR}/library_filter_generation.log") + message(FATAL_ERROR "Error generating kernel filters and testlist files. See ${CMAKE_CURRENT_BINARY_DIR}/library_filter_generation.log") endif() endfunction() if(CUTLASS_BUILD_FOR_PROFILER_REGRESSIONS) - set(PROFILER_ARCH_LIST 100a 101a 120a) + set(PROFILER_ARCH_LIST 100a 100f 101a 101f 120a 120f) foreach(ARCH IN LISTS CUTLASS_NVCC_ARCHS) if(NOT (ARCH IN_LIST PROFILER_ARCH_LIST)) - message(FATAL_ERROR "Only SM100a/101a/120a compute capability is supported with profiler-based unit tests") + message(FATAL_ERROR "Only SM${PROFILER_ARCH_LIST} compute capabilities are supported with profiler-based unit tests") endif() endforeach() if(CUTLASS_PROFILER_REGRESSION_TEST_LEVEL EQUAL 0) message(STATUS "Building for L0 profiler-based functional regressions") - cutlass_generate_kernel_filter_and_testlists_files(TEST_SET_NAME kernel_testlist_l0) + cutlass_generate_kernel_filter_and_testlist_files(TEST_SET_NAME kernel_testlist_l0) set(KERNEL_FILTER_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L0_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm_kernel_filter.list CACHE STRING "Kernel set") set(CUTLASS_PROFILER_REGRESSION_LIST_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L0_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm.csv CACHE STRING 
"Regression set") elseif (CUTLASS_PROFILER_REGRESSION_TEST_LEVEL EQUAL 1) message(STATUS "Building for L1 profiler-based functional regressions") - cutlass_generate_kernel_filter_and_testlists_files(TEST_SET_NAME kernel_testlist_l1) + cutlass_generate_kernel_filter_and_testlist_files(TEST_SET_NAME kernel_testlist_l1) set(KERNEL_FILTER_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L1_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm_kernel_filter.list CACHE STRING "Kernel set") set(CUTLASS_PROFILER_REGRESSION_LIST_FILE ${CMAKE_CURRENT_BINARY_DIR}/FK_functional_L1_testlist_SM${CUTLASS_NVCC_ARCHS}_cutlass3x_gemm.csv CACHE STRING "Regression set") diff --git a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu index 4f77ae03..b12e75ec 100644 --- a/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu +++ b/examples/56_hopper_ptr_array_batched_gemm/56_hopper_ptr_array_batched_gemm.cu @@ -489,7 +489,7 @@ int run(Options &options) std::cout << " Batches : " << options.l << std::endl; std::cout << " Alpha, Beta : " << options.alpha << ',' << options.beta << std::endl; std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu index 6cedb599..eb449e8f 100644 --- a/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu +++ b/examples/57_hopper_grouped_gemm/57_hopper_grouped_gemm.cu @@ -124,7 +124,7 @@ struct CooperativeConfig { using KernelSchedule = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperativeFP8FastAccum; using EpilogueSchedule = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; using TileShape = Shape<_256,_128,_128>; - using 
ClusterShape = Shape<_2,_2,_1>; + using ClusterShape = Shape<_1,_2,_1>; }; struct PingpongConfig { @@ -296,14 +296,14 @@ struct Options { int m = cmd_line_m; int n = cmd_line_n; int k = cmd_line_k; - if (m < 1) { - m = alignment * ((rand() % 64) + 1); + if (m < 0) { + m = alignment * ((rand() % 64)); } - if (n < 1) { - n = alignment * ((rand() % 64) + 1); + if (n < 0) { + n = alignment * ((rand() % 64)); } - if (k < 1) { - k = alignment * ((rand() % 64) + 1); + if (k < 0) { + k = alignment * ((rand() % 64)); } problem_sizes_host.push_back({m, n, k}); } @@ -333,19 +333,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } groups = static_cast(problem_sizes_host.size()); @@ -500,10 +490,27 @@ void initialize(const Options &options) { std::vector ptr_beta_host(options.groups); for (int32_t i = 0; i < options.groups; ++i) { - ptr_A_host.at(i) = block_A.get() + offset_A.at(i); - ptr_B_host.at(i) = block_B.get() + offset_B.at(i); - ptr_C_host.at(i) = block_C.get() + offset_C.at(i); - ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + // If the current group's matrix has size 0, set the pointer to nullptr + if (i < options.groups - 1 && offset_A.at(i) == offset_A.at(i + 1)) { + ptr_A_host.at(i) = nullptr; + } else { + ptr_A_host.at(i) = block_A.get() + offset_A.at(i); + } + if (i < options.groups - 1 && offset_B.at(i) == offset_B.at(i + 1)) { + ptr_B_host.at(i) = nullptr; + } else { + ptr_B_host.at(i) = block_B.get() + offset_B.at(i); + } + if (i < options.groups - 1 && offset_C.at(i) == offset_C.at(i + 1)) { + ptr_C_host.at(i) = 
nullptr; + } else { + ptr_C_host.at(i) = block_C.get() + offset_C.at(i); + } + if (i < options.groups - 1 && offset_D.at(i) == offset_D.at(i + 1)) { + ptr_D_host.at(i) = nullptr; + } else { + ptr_D_host.at(i) = block_D.get() + offset_D.at(i); + } alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast((rand() % 5) + 1) : options.alpha); beta_host.push_back((options.beta == FLT_MAX) ? static_cast(rand() % 5) : options.beta); ptr_alpha_host.at(i) = block_alpha.get() + i; @@ -539,9 +546,10 @@ void initialize(const Options &options) { beta_device.reset(options.groups); beta_device.copy_from_host(ptr_beta_host.data()); - initialize_block(block_A, seed + 2023); + initialize_block(block_A, seed + 2021); initialize_block(block_B, seed + 2022); - initialize_block(block_C, seed + 2021); + initialize_block(block_C, seed + 2023); + initialize_block(block_D, seed + 2024); block_alpha.copy_from_host(alpha_host.data()); block_beta.copy_from_host(beta_host.data()); } @@ -653,6 +661,13 @@ int run(Options &options, bool host_problem_shapes_available = true) allocate(options); initialize(options); + std::cout << " Problem Sizes, Alpha, Beta " << std::endl; + for (int32_t i = 0; i < options.groups; ++i) { + std::cout << " " << options.problem_sizes_host.at(i); + std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; + } + std::cout << " Groups : " << options.groups << std::endl; + // Instantiate CUTLASS kernel depending on templates GemmT gemm; @@ -700,14 +715,8 @@ int run(Options &options, bool host_problem_shapes_available = true) result.avg_runtime_ms = double(elapsed_ms) / double(options.iterations); result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); - std::cout << " Problem Sizes, Alpha, Beta " << std::endl; - for (int32_t i = 0; i < options.groups; ++i) { - std::cout << " " << options.problem_sizes_host.at(i); - std::cout << ", " << alpha_host.at(i) << ", " << beta_host.at(i) << std::endl; - } - std::cout << 
" Groups : " << options.groups << std::endl; std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu index d84934ac..9f60d077 100644 --- a/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu +++ b/examples/58_ada_fp8_gemm/ada_fp8_gemm.cu @@ -770,9 +770,6 @@ int main(int argc, char const** argv) { bool satisfied; if (props.major < 10) { - // Pre-Blackwell - satisfied = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4); - satisfied &= (props.major > 8) || (props.major == 8 && props.minor == 9); } else { satisfied = (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8); @@ -786,7 +783,6 @@ int main(int argc, char const** argv) { std::cout << "CUTLASS's FP8 SM89 example requires an NVIDIA GPU with compute capability 8.9 or greater " << "and CUDA toolkit version 12.4 or later" - << " (12.8 or later needed for SM100+)" << std::endl; return 0; diff --git a/examples/65_distributed_gemm/65_distributed_gemm.cu b/examples/65_distributed_gemm/65_distributed_gemm.cu index 6509609f..06d18cef 100644 --- a/examples/65_distributed_gemm/65_distributed_gemm.cu +++ b/examples/65_distributed_gemm/65_distributed_gemm.cu @@ -132,7 +132,7 @@ using namespace cute; using TP = _8; static constexpr int TP_ = TP{}; -#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) // Distributed GEMM tiling/sharding schedule @@ -252,7 +252,7 @@ HostTensorB tensor_B_arr[TP_]; HostTensorD tensor_C_arr[TP_]; HostTensorD tensor_D_arr[TP_]; -#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) 
+#endif // (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// /// Testbed utility types @@ -344,7 +344,7 @@ struct Result { }; -#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -803,7 +803,7 @@ int run(Options &options) { return 0; } -#endif // (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -857,8 +857,12 @@ int main(int argc, char const **args) { // Evaluate CUTLASS kernels // -#if (defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#if (defined(CUTLASS_ARCH_MMA_SM90A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) run(options); +#else + std::cerr + << "This example must be compiled with `sm90a` and CUDA Toolkit 12.4 or later." 
<< std::endl; + return 0; #endif return 0; diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu index 5d4fe1a1..b3da5583 100644 --- a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu @@ -205,7 +205,6 @@ cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; -uint32_t mma_promotion_interval; cutlass::HostTensor blockscale_tensor_A; cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; @@ -405,12 +404,6 @@ void initialize(const Options &options) { blockscale_tensor_A.sync_device(); blockscale_tensor_B.sync_device(); - // Note : This value has to match the KernelSchedule::ScalePromotionInterval - // Else kernel will fail can_implement() check - // Deprecation Notice : We plan to remove this params member in an upcoming release - // Users can safely delete this line from their code, since the default is already 4 - mma_promotion_interval = 4; - if (options.save_aux) { tensor_aux.resize(c_coord); tensor_aux.sync_device(); @@ -470,7 +463,6 @@ typename Gemm::Arguments args_from_options(const Options &op stride_A, tensor_B.device_data(), stride_B, - mma_promotion_interval, blockscale_tensor_A.device_data(), layout_SFA, blockscale_tensor_B.device_data(), diff --git a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu index 096e56a6..e7e3e4ea 100644 --- 
a/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu +++ b/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu @@ -215,7 +215,6 @@ cutlass::HostTensor tensor_A; cutlass::HostTensor tensor_B; cutlass::HostTensor tensor_C; cutlass::HostTensor tensor_D; -uint32_t mma_promotion_interval; cutlass::HostTensor blockscale_tensor_A; cutlass::HostTensor blockscale_tensor_B; cutlass::HostTensor tensor_ref_D; @@ -413,12 +412,6 @@ void initialize(const Options &options) { blockscale_tensor_A.sync_device(); blockscale_tensor_B.sync_device(); - // Note : This value has to match the KernelSchedule::ScalePromotionInterval - // Else kernel will fail can_implement() check - // Deprecation Notice : We plan to remove this params member in an upcoming release - // Users can safely delete this line from their code, since the default is already 4 - mma_promotion_interval = 4; - if (options.save_aux) { tensor_aux.resize(c_coord); tensor_aux.sync_device(); @@ -479,7 +472,6 @@ GemmArguments args_from_options(const Options &options) stride_A, tensor_B.device_data(), stride_B, - mma_promotion_interval, blockscale_tensor_A.device_data(), layout_SFA, blockscale_tensor_B.device_data(), diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu index ad563a4b..f9d5e842 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm.cu @@ -354,19 +354,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - 
problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } groups = static_cast(problem_sizes_host.size()); @@ -745,7 +735,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu index d5814c0a..f052b5f2 100644 --- a/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu +++ b/examples/75_blackwell_grouped_gemm/75_blackwell_grouped_gemm_block_scaled.cu @@ -124,6 +124,7 @@ constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; // A using ElementAccumulator = float; // Element type for internal accumulation // using ElementD = cutlass::float_e2m1_t; // Enable for SF Output // Element type for D matrix operands + using ElementSFD = cutlass::float_ue4m3_t; // Element type for SF Output operands constexpr int OutputSFVectorSize = 16; using FusionOperation = cutlass::epilogue::fusion::LinCombEltActBlockScaleFactor< @@ -422,19 +423,9 @@ struct Options { cutlass::CommandLine::tokenize(tokens, extent_str, 'x'); for (int i = 0; i < int(tokens.size()); ++i) { - int x = std::atoi(tokens.at(i).c_str()); - - // round up - if (x % alignment) { - x += (alignment - (x % alignment)); - } - - extent.at(i) = x; - } - - if (extent.product()) { - problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); + extent.at(i) = std::atoi(tokens.at(i).c_str()); } + problem_sizes_host.push_back({extent.m(), extent.n(), extent.k()}); } 
groups = static_cast(problem_sizes_host.size()); @@ -885,7 +876,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp index 60b411a3..1eaea0ce 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_fwd_mainloop_tma_warpspecialized.hpp @@ -505,12 +505,12 @@ struct Sm100FmhaFwdMainloopTmaWarpspecialized { // Q1 * K1 , Q2 * K1 , S11 * V1 , Q1 * K2 , S21 * V1 , Q2 * K2 , S12 * V2 , Q1 * K3 , S22 * K2 , ... 
} - template + template CUTLASS_DEVICE auto softmax_step( float& row_max, float& row_sum, Stage stage, bool final_call, - BlkCoord const& blk_coord, CountingTensor const& cS, + BlkCoord const& blk_coord, CoordTensor const& cS, Params const& params, ProblemShape const& problem_shape, PipelineS& pipeline_s, typename PipelineS::PipelineState& pipeline_s_consumer_state, PipelineC& pipeline_c, typename PipelineC::PipelineState& pipeline_c_producer_state, diff --git a/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp b/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp index 655c080e..4df7daf5 100644 --- a/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp +++ b/examples/77_blackwell_fmha/collective/sm100_fmha_gen_mainloop_warpspecialized.hpp @@ -514,12 +514,12 @@ struct Sm100FmhaGenMainloopWarpspecialized { // Q1 * K1 , Q2 * K1 , S11 * V1 , Q1 * K2 , S21 * V1 , Q2 * K2 , S12 * V2 , Q1 * K3 , S22 * K2 , ... 
} - template + template CUTLASS_DEVICE auto softmax_step( float& row_max, float& row_sum, Stage stage, bool final_call, - BlkCoord const& blk_coord, CountingTensor const& cS, + BlkCoord const& blk_coord, CoordTensor const& cS, Params const& params, ProblemShape const& problem_shape, PipelineS& pipeline_s, typename PipelineS::PipelineState& pipeline_s_consumer_state, PipelineC& pipeline_c, typename PipelineC::PipelineState& pipeline_c_producer_state, diff --git a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu index d36bf4dd..c86580db 100644 --- a/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu +++ b/examples/79_blackwell_geforce_gemm/79d_blackwell_geforce_nvfp4_grouped_gemm.cu @@ -861,7 +861,7 @@ int run(Options &options, bool host_problem_shapes_available = true) result.gflops = options.gflops(result.avg_runtime_ms / 1000.0, options.problem_sizes_host); std::cout << " Avg runtime : " << result.avg_runtime_ms << " ms" << std::endl; - std::cout << " GFLOPS : " << result.gflops << std::endl; + std::cout << " TFLOPS : " << result.gflops / 1000.0 << std::endl; } return 0; diff --git a/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu b/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu index f955b8e9..573c25cb 100644 --- a/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu +++ b/examples/82_blackwell_distributed_gemm/82_blackwell_distributed_gemm.cu @@ -132,7 +132,7 @@ using namespace cute; using TP = _8; static constexpr int TP_ = TP{}; -#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) // Distributed GEMM tiling/sharding schedule @@ -254,7 +254,7 @@ HostTensorB tensor_B_arr[TP_]; HostTensorD tensor_C_arr[TP_]; HostTensorD 
tensor_D_arr[TP_]; -#endif // (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// /// Testbed utility types @@ -346,7 +346,7 @@ struct Result { }; -#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && \ +#if defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && \ (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 4)) ///////////////////////////////////////////////////////////////////////////////////////////////// @@ -805,7 +805,7 @@ int run(Options &options) { return 0; } -#endif // (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#endif // (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) /////////////////////////////////////////////////////////////////////////////////////////////////// @@ -861,8 +861,12 @@ int main(int argc, char const **args) { // Evaluate CUTLASS kernels // -#if (defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) +#if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) && (__CUDACC_VER_MAJOR__ >= 12) && (__CUDACC_VER_MINOR__ >= 4)) run(options); +#else + std::cerr + << "This example must be compiled with `sm100a` and CUDA Toolkit 12.4 or later." 
<< std::endl; + return 0; #endif return 0; diff --git a/examples/cute/tutorial/hopper/wgmma_sm90.cu b/examples/cute/tutorial/hopper/wgmma_sm90.cu index 405bb310..e2b33fb4 100644 --- a/examples/cute/tutorial/hopper/wgmma_sm90.cu +++ b/examples/cute/tutorial/hopper/wgmma_sm90.cu @@ -506,13 +506,13 @@ int main(int argc, char** argv) return -1; } - if (props.major < 8) { - std::cout << "This example requires an Ampere GPU or newer (CC >= 80)" << std::endl; + if (props.major != 9) { + std::cout << "This example requires NVIDIA's Hopper Architecture GPU with compute capability 90a" << std::endl; // Return 0 so tests pass if run on unsupported architectures or CUDA Toolkits. return 0; } -#if defined(CUTLASS_ARCH_MMA_SM90A_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) int m = 5120; if (argc >= 2) @@ -604,7 +604,7 @@ int main(int argc, char** argv) printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); #else - std::cout << "CUTLASS_ARCH_MMA_SM90A_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; + std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; #endif return 0; diff --git a/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu b/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu index 77a30890..98df4fa4 100644 --- a/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu +++ b/examples/cute/tutorial/hopper/wgmma_tma_sm90.cu @@ -461,7 +461,7 @@ int main(int argc, char** argv) return 0; } -#if defined(CUTLASS_ARCH_MMA_SM90A_SUPPORTED) +#if defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED) int m = 512; if (argc >= 2) @@ -553,7 +553,7 @@ int main(int argc, char** argv) printf("CUTE_GEMM: [%6.1f]GFlop/s (%6.4f)ms\n", gflops / cute_time, cute_time*1000); #else - std::cout << "CUTLASS_ARCH_MMA_SM90A_SUPPORTED must be enabled, but it is not. Test is waived \n" << std::endl; + std::cout << "CUTLASS_ARCH_MMA_SM90_SUPPORTED must be enabled, but it is not. 
Test is waived \n" << std::endl; #endif return 0; diff --git a/examples/python/CuTeDSL/ampere/elementwise_add.py b/examples/python/CuTeDSL/ampere/elementwise_add.py new file mode 100644 index 00000000..dc70a913 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/elementwise_add.py @@ -0,0 +1,392 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import argparse +import torch +import time +from typing import Type + +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +from cutlass.cute.runtime import from_dlpack +import cutlass.torch as cutlass_torch + +""" +An Elementwise Addition Example using CuTe DSL. + +This example kernel copies data from global memory to register memory (rmem), performs the elementwise +addition operation, and stores the result back to global memory. + +Primary goals of this example are to demonstrate how basic global memory copies can be expressed in +CuTe DSL and illustrate canonical partitioning patterns in CuTe. It also implements canonical +predication for tensors whose shape is not multiple of tile size to guard OOB reads. + +Thread-value (or TV) layouts are central to canonical partitioning patterns in CuTe. They provide a +mapping from thread and a thread's value to the set of coordinates within a tile that we have sliced +out from a data tensor. + +The input tensors are row-major layout, that leading dimension is the right most dimension. In order +to efficiently copy data from global memory, we must map threads contiguously on row dimension. + +Thread ID mapping to 2D coordinates with layout `(4,32):(32,1)`: + + +----+----+----+----+-----+----+ + | | 0 | 1 | 2 | ... | 31 | + +----+----+----+----+-----+----+ + | 0 | T0 | T1 | T2 | ... | T31| + +----+----+----+----+-----+----+ + | 1 |T32 |T33 |T34 | ... |T63 | + +----+----+----+----+-----+----+ + | 2 |T64 |T65 |T66 | ... |T95 | + +----+----+----+----+-----+----+ + | 3 |T96 |T97 |T98 | ... |T127| + +----+----+----+----+-----+----+ + +As Ampere GPU supports a maximum of 128bit per load/store instruction and each element is 32bit, we +can load 4 elements per instruction. Having additional contiguous values allows for vectorization +across threads (coalesced accesses) and is required for saturating the memory bandwidth. + +We use `(4,4):(4,1)` as the val layout in this example. 
Notice that the major mode is the same as +the major mode of the input tensor - without which vectorization would not be possible. + +If you already know the TV layout you want to use for your tiled copy, CuTe DSL provides utility +`cute.make_layout_tv` to build the tiled copy type around it and the atom of your choice. + +.. code-block:: python + + thr_layout = cute.make_layout((4, 32), stride=(32, 1)) + val_layout = cute.make_layout((4, 4), stride=(4, 1)) + tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout) + + # Tile input tensor to thread blocks: ((TileM,TileN),(RestM,RestN)) + gA = cute.zipped_divide(mA, tiler_mn) + +where `tiler_mn` is the tile size per thread block and `tv_layout` is the TV layout which maps +thread index and inter-thread index of data array per thread to logical coordinates of elements in +input and output tensors. + +Then we can build tiled copy for input and output tensors with `cute.make_tiled_copy` utility. + +.. code-block:: python + + blkA = gA[((None, None), bidx)] # (TileM,TileN) + + copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type) + tiled_copy_A = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + + # get slice of tiled_copy_A for current thread + thr_copy_A = tiled_copy_A.get_slice(tidx) + + # partition per thread block tensor as source of tiled copy + thrA = thr_copy_A.partition_S(blkA) + + # allocate fragment for gmem->rmem + frgA = cute.make_fragment_like(thrA) + + # copy data from global memory to register memory + cute.copy(copy_atom_load, thrA, frgA) + + +To run this example: + +.. code-block:: bash + + python examples/ampere/elementwise_add.py --M 3 --N 12 + python examples/ampere/elementwise_add.py --M 1024 --N 512 + python examples/ampere/elementwise_add.py --M 1024 --N 1024 --benchmark --warmup_iterations 2 --iterations 1000 + +To collect performance with NCU profiler: + +.. 
code-block:: bash + + # Don't iterate too many times when profiling with ncu + ncu python examples/ampere/elementwise_add.py --M 2048 --N 2048 --benchmark --iterations 10 --skip_ref_check +""" + + +@cute.kernel +def elementwise_add_kernel( + gA: cute.Tensor, + gB: cute.Tensor, + gC: cute.Tensor, + cC: cute.Tensor, # coordinate tensor + shape: cute.Shape, + tv_layout: cute.Layout, + tiler_mn: cute.Shape, +): + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + + # slice for CTAs + # logical id -> address + blk_coord = ((None, None), bidx) + blkA = gA[blk_coord] # (TileM,TileN) + blkB = gB[blk_coord] # (TileM,TileN) + blkC = gC[blk_coord] # (TileM,TileN) + blkCrd = cC[blk_coord] # (TileM, TileN) + + print(f"[DSL INFO] Sliced Tensors per thread block:") + print(f"[DSL INFO] blkA = {blkA.type}") + print(f"[DSL INFO] blkB = {blkB.type}") + print(f"[DSL INFO] blkC = {blkC.type}") + print(f"[DSL INFO] blkCrd = {blkCrd.type}") + + # # declare the atoms which will be used later for memory copy + copy_atom_load = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gA.element_type) + copy_atom_store = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), gC.element_type) + + tiled_copy_A = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + tiled_copy_B = cute.make_tiled_copy(copy_atom_load, tv_layout, tiler_mn) + tiled_copy_C = cute.make_tiled_copy(copy_atom_store, tv_layout, tiler_mn) + + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + thr_copy_C = tiled_copy_C.get_slice(tidx) + + thrA = thr_copy_A.partition_S(blkA) + thrB = thr_copy_B.partition_S(blkB) + thrC = thr_copy_C.partition_S(blkC) + + # allocate fragments for gmem->rmem + frgA = cute.make_fragment_like(thrA) + frgB = cute.make_fragment_like(thrB) + frgC = cute.make_fragment_like(thrC) + + thrCrd = thr_copy_C.partition_S(blkCrd) + frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean) + + print(f"[DSL INFO] Sliced Tensors per thread:") + 
print(f"[DSL INFO] thrA = {thrA.type}") + print(f"[DSL INFO] thrB = {thrB.type}") + print(f"[DSL INFO] thrC = {thrC.type}") + print(f"[DSL INFO] thrCrd = {thrCrd.type}") + + for i in cutlass.range_dynamic(0, cute.size(frgPred), 1): + val = cute.elem_less(thrCrd[i], shape) + frgPred[i] = val + + # Print per thread predicate mask + # if tidx == 0 and bidx == 0: + # cute.printf("block_dim = {}", cute.arch.grid_dim()) + # cute.printf("shape = {}", shape) + # cute.print_tensor(thrA) + # cute.print_tensor(thrB) + # cute.print_tensor(frgPred) + + ########################################################## + # Move data to reg address space + ########################################################## + + cute.copy(copy_atom_load, thrA, frgA, pred=frgPred) + cute.copy(copy_atom_load, thrB, frgB, pred=frgPred) + + # if tidx == 0 and bidx == 0: + # cute.print_tensor(frgA) + # cute.print_tensor(frgB) + + # Load data before use. The compiler will optimize the copy and load + # operations to convert some memory ld/st into register uses. + result = frgA.load() + frgB.load() + + # Save the results back to registers. Here we reuse b's registers. 
+ frgC.store(result) + + # Copy the results back to c + cute.copy(copy_atom_store, frgC, thrC, pred=frgPred) + + +@cute.jit +def elementwise_add(mA, mB, mC, copy_bits: cutlass.Constexpr = 128): + dtype = mA.element_type + vector_size = copy_bits // dtype.width + + thr_layout = cute.make_ordered_layout((4, 32), order=(1, 0)) + val_layout = cute.make_ordered_layout((4, vector_size), order=(1, 0)) + tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout) + + print(f"[DSL INFO] Input Tensors:") + print(f"[DSL INFO] mA = {mA.type}") + print(f"[DSL INFO] mB = {mB.type}") + + print(f"[DSL INFO] Tiling Parameters:") + print(f"[DSL INFO] tiler_mn = {tiler_mn} per thread block") + print(f"[DSL INFO] tv_layout = {tv_layout}") + + gA = cute.zipped_divide(mA, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + gB = cute.zipped_divide(mB, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + gC = cute.zipped_divide(mC, tiler_mn) # ((TileM,TileN),(RestM,RestN)) + print(f"[DSL INFO] Tiled Tensors:") + print(f"[DSL INFO] gA = {gA.type}") + print(f"[DSL INFO] gB = {gB.type}") + print(f"[DSL INFO] gC = {gC.type}") + + idC = cute.make_identity_tensor(mC.shape) + cC = cute.zipped_divide(idC, tiler=tiler_mn) + print(f"[DSL INFO] coord tensor = {cC.type}") + + elementwise_add_kernel(gA, gB, gC, cC, mC.shape, tv_layout, tiler_mn).launch( + grid=[cute.size(gC, mode=[1]), 1, 1], + block=[cute.size(tv_layout, mode=[0]), 1, 1], + ) + + +def run_elementwise_add( + M, + N, + dtype: Type[cutlass.Numeric], + is_a_dynamic_layout=False, + is_b_dynamic_layout=False, + is_result_dynamic_layout=False, + skip_ref_check=False, + benchmark=True, + warmup_iterations=2, + iterations=200, +): + if not torch.cuda.is_available(): + raise RuntimeError(f"Ampere GPU is required to run this example!") + + print(f"\nRunning Elementwise Add test with:") + print(f"Tensor dimensions: [{M}, {N}]") + print(f"Input and Output Data type: {dtype}") + + torch_dtype = cutlass_torch.dtype(dtype) + if dtype.is_integer: + a = 
torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype) + b = torch.randint(0, 10, (M, N), device=torch.device("cuda"), dtype=torch_dtype) + else: + a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype) + b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype) + + c = torch.zeros_like(a) + + print(f"Input tensor shapes:") + print(f"a: {a.shape}, dtype: {a.dtype}") + print(f"b: {b.shape}, dtype: {b.dtype}") + print(f"c: {c.shape}, dtype: {c.dtype}\n") + + if not is_a_dynamic_layout: + a_tensor = from_dlpack(a).mark_layout_dynamic() + else: + a_tensor = a + + if not is_b_dynamic_layout: + b_tensor = from_dlpack(b).mark_layout_dynamic() + else: + b_tensor = b + + if not is_result_dynamic_layout: + c_tensor = from_dlpack(c).mark_layout_dynamic() + else: + c_tensor = c + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + compiled_func = cute.compile(elementwise_add, a_tensor, b_tensor, c_tensor) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + print("Executing vector add kernel...") + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + if not skip_ref_check: + compiled_func(a_tensor, b_tensor, c_tensor) + print("Verifying results...") + torch.testing.assert_close(a + b, c) + print("Results verified successfully!") + + if not benchmark: + return + + # Create CUDA events for timing + start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + + # Warmup + for _ in range(warmup_iterations): + compiled_func(a_tensor, b_tensor, c_tensor) + + # Use the current stream for CUDA events instead of the default stream + # Record start event + cuda.cuEventRecord(start_event, current_stream) + + # Execute the kernel + 
for _ in range(iterations): + compiled_func(a_tensor, b_tensor, c_tensor) + + # Record end event + cuda.cuEventRecord(end_event, current_stream) + cuda.cuEventSynchronize(end_event) + + # Calculate elapsed time + err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event) + avg_time = elapsed_time / iterations + + # Print execution results + print(f"Kernel execution time: {avg_time:.4f} ms") + print( + f"Achieved memory throughput: {(3 * a.numel() * dtype.width // 8) / (avg_time / 1000) / 1e9:.2f} GB/s" + ) + print(f"First few elements of result: \n{c[:3, :3]}") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="example of elementwise add to demonstrate the numpy/pytorch as input for kernels" + ) + parser.add_argument("--M", default=1024, type=int) + parser.add_argument("--N", default=1024, type=int) + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + parser.add_argument("--benchmark", action="store_true") + + args = parser.parse_args() + run_elementwise_add( + args.M, + args.N, + dtype=cutlass.Float32, + is_a_dynamic_layout=True, + is_b_dynamic_layout=True, + is_result_dynamic_layout=True, + skip_ref_check=args.skip_ref_check, + benchmark=args.benchmark, + warmup_iterations=args.warmup_iterations, + iterations=args.iterations, + ) + print("\nPASS") diff --git a/examples/python/CuTeDSL/ampere/elementwise_apply.py b/examples/python/CuTeDSL/ampere/elementwise_apply.py new file mode 100644 index 00000000..e1e18729 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/elementwise_apply.py @@ -0,0 +1,395 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import argparse +import operator +import torch +from typing import Type +import time + +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack + +""" +An Elementwise Apply Example using CuTe DSL. 
+ +This example kernel demonstrates the meta-programming capability of the CuTe DSL by allowing +customization of elementwise operations through lambda functions. The kernel copies data from +global memory to register memory (rmem), applies a user-defined operation to the elements, +and stores the result back to global memory. + +Primary goals of this example: +1. Demonstrate meta-programming capability by passing lambda functions to customize elementwise operations +2. Show how to apply different operations (add, multiply, etc.) using the same kernel structure +3. Illustrate how to parameterize CUDA kernels with operation types at compile time + +To run this example: + +.. code-block:: bash + + # Run with addition operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op add + + # Run with multiplication operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op mul + + # Run with subtraction operation + python examples/ampere/elementwise_apply.py --M 1024 --N 512 --op sub + + # Benchmark performance + python examples/ampere/elementwise_apply.py --M 2048 --N 2048 --op add --benchmark --warmup_iterations 2 --iterations 10 + +The example demonstrates how to express complex CUDA kernels with customizable operations +while maintaining high performance through efficient memory access patterns. 
+""" + + +@cute.kernel +def elementwise_apply_kernel( + op: cutlass.Constexpr, + gA: cute.Tensor, + gB: cute.Tensor, + gC: cute.Tensor, + cC: cute.Tensor, # coordinate tensor + shape: cute.Shape, + tv_layout: cute.Layout, # (tid, vid) -> logic coord +): + tidx, _, _ = cute.arch.thread_idx() + bidx, _, _ = cute.arch.block_idx() + + # slice for CTAs + cta_coord = ((None, None), bidx) + # logical coord -> address + ctaA = gA[cta_coord] # (TileM, TileN) + ctaB = gB[cta_coord] # (TileM, TileN) + ctaC = gC[cta_coord] # (TileM, TileN) + ctaCrd = cC[cta_coord] # (TileM, TileN) + + print(f"[DSL INFO] Sliced Tensors per thread block:") + print(f"[DSL INFO] ctaA = {ctaA.type}") + print(f"[DSL INFO] ctaB = {ctaB.type}") + print(f"[DSL INFO] ctaC = {ctaC.type}") + print(f"[DSL INFO] ctaCrd = {ctaCrd.type}") + + # compose with CTA TV layout + # (tid, vid) -> address + tidfrgA = cute.composition(ctaA, tv_layout) + tidfrgB = cute.composition(ctaB, tv_layout) + tidfrgC = cute.composition(ctaC, tv_layout) + tidfrgCrd = cute.composition(ctaCrd, tv_layout) + # print(f"{tv_layout = }") + # print(f"{tidfrgA = }") + + thr_coord = (tidx, (None, None)) + + # slice for threads + # vid -> address + thrA = tidfrgA[thr_coord] # (V) + thrB = tidfrgB[thr_coord] # (V) + thrC = tidfrgC[thr_coord] # (V) + thrCrd = tidfrgCrd[thr_coord] + + print(f"[DSL INFO] Sliced Tensors per thread:") + print(f"[DSL INFO] thrA = {thrA.type}") + print(f"[DSL INFO] thrB = {thrB.type}") + print(f"[DSL INFO] thrC = {thrC.type}") + print(f"[DSL INFO] thrCrd = {thrCrd.type}") + + # allocate fragments for gmem->rmem + frgA = cute.make_fragment_like(thrA, gA.element_type) + frgB = cute.make_fragment_like(thrB, gB.element_type) + frgC = cute.make_fragment_like(thrC, gC.element_type) + frgPred = cute.make_fragment(thrCrd.shape, cutlass.Boolean) + + for i in cutlass.range_dynamic(cute.size(frgPred), unroll=1): + frgPred[i] = cute.elem_less(thrCrd[i], shape) + + # if tidx == 0 and bidx == 0: + # cute.print_tensor(frgPred) + + 
##########################################################
+ # Move data to reg address space
+ ##########################################################
+
+ # declare the atoms which will be used later for memory copy
+ copy_atom_load = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gA.element_type,
+ num_bits_per_copy=gA.element_type.width,
+ )
+ copy_atom_store = cute.make_copy_atom(
+ cute.nvgpu.CopyUniversalOp(),
+ gC.element_type,
+ num_bits_per_copy=gC.element_type.width,
+ )
+
+ cute.copy(copy_atom_load, thrA, frgA, pred=frgPred)
+ cute.copy(copy_atom_load, thrB, frgB, pred=frgPred)
+
+ # Load data before use. The compiler will optimize the copy and load
+ # operations to convert some memory ld/st into register uses.
+ result = op(frgA.load(), frgB.load())
+
+ # Save the results back to registers. Here we store them into frgC's registers.
+ frgC.store(result)
+
+ # Copy the results back to c
+ cute.copy(copy_atom_store, frgC, thrC, pred=frgPred)
+
+
+@cute.jit
+def elementwise_apply(
+ op: cutlass.Constexpr,
+ a: cute.Tensor,
+ b: cute.Tensor,
+ result: cute.Tensor,
+):
+ """CUDA kernel applying binary operator on each element of two n-D input tensors in
+ CuTe Python and store to result tensor.
+
+ :param op: Binary operator or lambda function to apply element-wise
+ :type op: cutlass.Constexpr
+ :param a: First input tensor
+ :type a: cute.Tensor
+ :param b: Second input tensor
+ :type b: cute.Tensor
+ :param result: Output tensor to store the results of op(a, b)
+ :type result: cute.Tensor
+ :return: None
+ :rtype: None
+
+ ..
code-block:: python + + # Example 1: Adding two tensors + x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32, device="cuda") + y = torch.tensor([[5, 6], [7, 8]], dtype=torch.float32, device="cuda") + result = torch.empty_like(x) + elementwise_apply(operator.add, from_dlpack(x), from_dlpack(y), from_dlpack(result)) + # result: + # tensor([[6.0, 8.0], + # [10.0, 12.0]], device='cuda:0') + + # Example 2: Using a lambda function + elementwise_apply(lambda a, b: a * a + b * b, from_dlpack(x), from_dlpack(y), from_dlpack(result)) + # result: + # tensor([[ 2., 8.], + # [ 54., 512.]], device='cuda:0') + """ + + # Baseline: naive TV layout + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (512, 4) tile + # * tidx maps to mode-0 but input layout is contiguous on mode-1, performance will be bad + # tv_layout = cute.make_layout((128, (4, 4)), stride=(4, (512, 1))) + # cta_tiler = (512, 4) + + # Opt-1: better TV layout with better 1D thread layout (SOL with 1D thread layout) + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (4, 512) tile + # * tidx maps to mode-1 which is leading mode of input tensor for coalesced load + # tv_layout = cute.make_layout((128, (4, 4)), stride=(16, (4, 1))) + # cta_tiler = (4, 512) + + # Opt-2: 2D tile but worse + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (128, 16) logical tile + # * V layout is bad as contiguous mode is not on right-most + # * `cute.copy` only supports vectorize when stride-1 of v-layout on right-most ) + # tv_layout = cute.make_layout(((32, 4), (4, 4)), stride=((4, 512), (1, 128))) + # cta_tiler = (128, 16) + + # Opt-3: SOL with 2D thread tile + # * mA layout: (4096, 4096):(4096, 1) + # * TV layout map to (16, 128) logical tile + # * tidx maps to mode-1 and input layout is contiguous on mode-1 for coalesced load-store + thr_layout = cute.make_layout((4, 32), stride=(32, 1)) + val_layout = cute.make_layout((4, 4), stride=(4, 1)) + tiler_mn, tv_layout = 
cute.make_layout_tv(thr_layout, val_layout) + + print(f"[DSL INFO] Input Tensors:") + print(f"[DSL INFO] a = {a.type}") + print(f"[DSL INFO] b = {b.type}") + print(f"[DSL INFO] result = {result.type}") + + print(f"[DSL INFO] Tiling Parameters:") + print(f"[DSL INFO] tiler_mn = {tiler_mn} per thread block") + print(f"[DSL INFO] tv_layout = {tv_layout}") + + gA = cute.zipped_divide(a, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + gB = cute.zipped_divide(b, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + gC = cute.zipped_divide(result, tiler_mn) # ((TileM, TileN), (RestM, RestN)) + + print(f"[DSL INFO] Tiled Tensors:") + print(f"[DSL INFO] gA = {gA.type}") + print(f"[DSL INFO] gB = {gB.type}") + print(f"[DSL INFO] gC = {gC.type}") + + idC = cute.make_identity_tensor(result.shape) + cC = cute.zipped_divide(idC, tiler=tiler_mn) + print(f"[DSL INFO] coord tensor = {cC.type}") + + # Launch the kernel asynchronously + # Async token(s) can also be specified as dependencies + elementwise_apply_kernel( + op, + gA, + gB, + gC, + cC, + result.shape, + tv_layout, + ).launch( + grid=[cute.size(gC, mode=[1]), 1, 1], + block=[cute.size(tv_layout, mode=[0]), 1, 1], + ) + + +def run_elementwise_apply_and_verify( + op, + M, + N, + dtype: Type[cutlass.Numeric], + skip_ref_check=False, + benchmark=True, + warmup_iterations=2, + iterations=100, +): + if not torch.cuda.is_available(): + raise RuntimeError(f"Ampere GPU is required to run this example!") + + print(f"\nRunning Elementwise Apply test with:") + print(f"Tensor dimensions: [{M}, {N}]") + print(f"Input and Output Data type: {dtype}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Measurement iterations: {iterations}\n") + + torch_dtype = cutlass_torch.dtype(dtype) + + # Allocate tensors with random values. 
+ a = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
+ b = torch.randn(M, N, device=torch.device("cuda"), dtype=torch_dtype)
+ c = torch.zeros_like(a)
+
+ print(f"Input tensor shapes:")
+ print(f"a: {a.shape}, dtype: {a.dtype}")
+ print(f"b: {b.shape}, dtype: {b.dtype}")
+ print(f"c: {c.shape}, dtype: {c.dtype}\n")
+
+ epsilon = 1.2
+ if op in (operator.truediv, operator.floordiv):
+ b = torch.where(b == 0, torch.tensor(epsilon), b)
+
+ print("Compiling kernel with cute.compile ...")
+ start_time = time.time()
+ compilation_time = time.time() - start_time  # NOTE(review): nothing is compiled between these two timestamps, so the reported "compilation time" is always ~0 — the cute.compile (or first jit-triggering call) belongs here; confirm intent
+ print(f"Compilation time: {compilation_time:.4f} seconds")
+
+ print("Executing elementwise apply kernel...")
+ # Get current CUDA stream from PyTorch
+ torch_stream = torch.cuda.current_stream()
+ # Get the raw stream pointer as a CUstream
+ current_stream = cuda.CUstream(torch_stream.cuda_stream)
+
+ if not skip_ref_check:
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+ print("Verifying results...")
+ torch.testing.assert_close(op(a, b), c)
+ print("Results verified successfully!")
+
+ if not benchmark:
+ return
+
+ # Create CUDA events for timing
+ start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
+ end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1]
+
+ # Warmup
+ for _ in range(warmup_iterations):
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+
+ # Record start event
+ cuda.cuEventRecord(start_event, current_stream)
+
+ # Execute the kernel
+ for _ in range(iterations):
+ elementwise_apply(
+ op, from_dlpack(a), from_dlpack(b), from_dlpack(c).mark_layout_dynamic()
+ )
+
+ # Record end event
+ cuda.cuEventRecord(end_event, current_stream)
+ cuda.cuEventSynchronize(end_event)
+
+ # Calculate elapsed time
+ err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event)
+ avg_time = elapsed_time / iterations
+
+ # Print execution results
+ 
print(f"Kernel execution time: {avg_time:.4f} ms") + print( + f"Achieved memory throughput: {(3 * a.numel() * dtype.width // 8) / (avg_time / 1000) / 1e9:.2f} GB/s" + ) + print(f"First few elements of result: \n{c[:3, :3]}") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="example of elementwise apply to demonstrate building elementwise kernels" + ) + parser.add_argument("--M", default=128, type=int) + parser.add_argument("--N", default=128, type=int) + parser.add_argument("--op", default="add", type=str) + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + parser.add_argument("--benchmark", action="store_true") + args = parser.parse_args() + run_elementwise_apply_and_verify( + getattr(operator, args.op), + args.M, + args.N, + dtype=cutlass.Float32, + warmup_iterations=args.warmup_iterations, + iterations=args.iterations, + skip_ref_check=args.skip_ref_check, + benchmark=args.benchmark, + ) + print("\nPASS") diff --git a/examples/python/CuTeDSL/ampere/flash_attention_v2.py b/examples/python/CuTeDSL/ampere/flash_attention_v2.py new file mode 100644 index 00000000..0f41245e --- /dev/null +++ b/examples/python/CuTeDSL/ampere/flash_attention_v2.py @@ -0,0 +1,1353 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from types import SimpleNamespace +from typing import Type, Union, Callable + +import torch +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu import cpasync, warp +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack +import cutlass.utils.ampere_helpers as sm80_utils + +""" +A flash attention v2 forward pass example for NVIDIA Ampere SM80 architecture using CUTE DSL. 
+ +- Matrix Q is BxSqxNxH, B is batch dimension, Sq is query sequence length, N is number of heads, H is head dimension +- Matrix K is BxSkxNxH, B is batch dimension, Sk is key sequence length, N is number of heads, H is head dimension +- Matrix V is BxSkxNxH, B is batch dimension, Sk is key sequence length, N is number of heads, H is head dimension +- Matrix O is BxSqxNxH, B is batch dimension, Sq is query sequence length, N is number of heads, H is head dimension + +This kernel supports the following features: + - Utilizes CpAsync for efficient memory operations + - Utilizes Ampere's tensor core for matrix multiply-accumulate (MMA) operations + - Utilizes register pipeline to overlap shared memory-to-register transfers with computations. + - Leverages DSL to implement an integrated online softmax fusion pattern. + +This kernel works as follows: +1. Load Q and K matrices from global memory (GMEM) to shared memory (SMEM) using CpAsync operations. +2. Perform matrix multiply-accumulate (MMA) operations using tensor core instructions to compute intermediate result S. +3. Apply padding mask or causal mask to S during initial iterations. +4. Apply online softmax to S and rescale O using results from previous iteration. +5. Load V matrices and perform matrix multiply-accumulate (MMA) operations to compute final result O. +6. Normalize O after all iterations complete and store result back to global memory (GMEM). + +To run this example: + +.. code-block:: bash + + python examples/ampere/flash_attention_v2.py \ + --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128 \ + --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536 \ + --num_head 16 --softmax_scale 1.0 --is_causal + +The above command configures the model to use float16 for inputs and outputs. The problem dimensions +are set to: batch size of 1, query sequence length of 1280, key sequence length of 1536, head dimension +of 128, and 16 attention heads. 
The softmax scale is set to 1.0 and causal masking is enabled. The computation
+uses tiles of size 128x128 for m and n dimensions, and utilizes 128 parallel threads.
+
+To collect performance data with the NCU profiler:
+
+.. code-block:: bash
+
+ ncu python examples/ampere/flash_attention_v2.py \
+ --dtype Float16 --head_dim 128 --m_block_size 128 --n_block_size 128 \
+ --num_threads 128 --batch_size 1 --seqlen_q 1280 --seqlen_k 1536 \
+ --num_head 16 --softmax_scale 1.0 --is_causal --skip_ref_check
+
+There are some constraints for this example:
+* Only fp16 and bf16 data types are supported.
+* The contiguous dimension of each tensor must be at least 16 bytes aligned.
+* The log-sum-exp (for training) is not computed in the kernel.
+* The values of `m_block_size`, `n_block_size`, and `head_dim` must be selected to stay within shared memory capacity limits.
+* `m_block_size * 2` must be divisible by `num_threads`, otherwise the kernel will produce incorrect results.
+"""
+
+
+class FlashAttentionForwardAmpere:
+ def __init__(
+ self,
+ head_dim: int,
+ m_block_size: int = 128,
+ n_block_size: int = 128,
+ num_threads: int = 128,
+ is_causal: bool = False,
+ ):
+ """Initializes the configuration for a flash attention v2 kernel.
+
+ All contiguous dimensions must be at least 16-byte aligned, which means the head dimension
+ must be a multiple of 8.
+ + :param head_dim: head dimension + :type head_dim: int + :param m_block_size: m block size + :type m_block_size: int + :param n_block_size: n block size + :type n_block_size: int + :param num_threads: number of threads + :type num_threads: int + :param is_causal: is causal + """ + self._head_dim = head_dim + self._m_block_size = m_block_size + self._n_block_size = n_block_size + # padding head_dim to a multiple of 32 as k_block_size + self._head_dim_padded = (head_dim + 31) // 32 * 32 + self._num_threads = num_threads + self._is_causal = is_causal + + @staticmethod + def can_implement( + dtype, head_dim, m_block_size, n_block_size, num_threads, is_causal + ) -> bool: + """Check if the kernel can be implemented with the given parameters. + + :param dtype: data type + :type dtype: cutlass.Numeric + :param head_dim: head dimension + :type head_dim: int + :param m_block_size: m block size + :type m_block_size: int + :param n_block_size: n block size + :type n_block_size: int + :param num_threads: number of threads + :type num_threads: int + :param is_causal: is causal + :type is_causal: bool + + :return: True if the kernel can be implemented, False otherwise + :rtype: bool + """ + # Check if data type is fp16 or bf16 + if dtype != cutlass.Float16 and dtype != cutlass.BFloat16: + return False + + # Check if head dimension is a multiple of 8 + if head_dim % 8 != 0: + return False + + # Check if number of threads is a multiple of 32 + if num_threads % 32 != 0: + return False + + # Check if block size setting is out of shared memory capacity + # Shared memory usage: Q tile + (K tile + V tile) where K and V use the same tile size + smem_usage = (m_block_size * head_dim + n_block_size * head_dim * 2) * 2 + smem_capacity = sm80_utils.SMEM_CAPACITY["sm80"] + if smem_usage > smem_capacity: + return False + + # Check if twice the block size is divisible by the number of threads + if (m_block_size * 2) % num_threads != 0: + return False + + return True + + @cute.jit + def 
__call__( + self, + mQ: cute.Tensor, + mK: cute.Tensor, + mV: cute.Tensor, + mO: cute.Tensor, + softmax_scale: cutlass.Float32, + stream: cuda.CUstream, + ): + """Configures and launches the flash attention v2 kernel. + + mQ/mK/mV/mO has same data types(supports fp16 and bf16) and same layout: + (batch_size, seqlen_q, num_head, head_dim):(seqlen_q * num_head * head_dim, num_head * head_dim, head_dim, 1) + + Prepares the shared memory layout, tiled copy atoms, tiled mma and shared memory storage. + Then launches the kernel function with the prepared parameters. + + :param mQ: query tensor + :type mQ: cute.Tensor + :param mK: key tensor + :type mK: cute.Tensor + :param mV: value tensor + :type mV: cute.Tensor + :param mO: output tensor + :type mO: cute.Tensor + :param softmax_scale: softmax scale + :type softmax_scale: cutlass.Float32 + """ + # Get the data type and check if it is fp16 or bf16 + if cutlass.const_expr( + not ( + mQ.element_type == mK.element_type == mV.element_type == mO.element_type + ) + ): + raise TypeError("All tensors must have the same data type") + if cutlass.const_expr( + not ( + mQ.element_type == cutlass.Float16 + or mQ.element_type == cutlass.BFloat16 + ) + ): + raise TypeError("Only Float16 or BFloat16 is supported") + self._dtype: Type[cutlass.Numeric] = mQ.element_type + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory layout: Q/K/V + # /////////////////////////////////////////////////////////////////////////////// + smem_k_block_size = 64 if self._head_dim_padded % 64 == 0 else 32 + swizzle_bits = 3 if smem_k_block_size == 64 else 2 + sQ_layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 3), + 0, + cute.make_layout((8, smem_k_block_size), stride=(smem_k_block_size, 1)), + ) + sQ_layout = cute.tile_to_shape( + sQ_layout_atom, + (self._m_block_size, self._head_dim_padded), + (0, 1), + ) + + sKV_layout_atom = sQ_layout_atom + sKV_layout = cute.tile_to_shape( + 
sKV_layout_atom, + (self._n_block_size, self._head_dim_padded), + (0, 1), + ) + + sO_layout = sQ_layout + + @cute.struct + class SharedStorage: + sQ: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sQ_layout)], 1024 + ] + sK: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sKV_layout)], 1024 + ] + sV: cute.struct.Align[ + cute.struct.MemRange[self._dtype, cute.cosize(sKV_layout)], 1024 + ] + + # /////////////////////////////////////////////////////////////////////////////// + # GMEM Tiled copy: + # /////////////////////////////////////////////////////////////////////////////// + # Thread layouts for copies + universal_copy_bits = 128 + async_copy_elems = universal_copy_bits // self._dtype.width + # atom_async_copy: async copy atom for QKV load + atom_async_copy = cute.make_copy_atom( + cpasync.CopyG2SOp(cache_mode=cpasync.LoadCacheMode.GLOBAL), + self._dtype, + num_bits_per_copy=universal_copy_bits, + ) + # atom_universal_copy: universal copy atom for O store + atom_universal_copy = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + self._dtype, + num_bits_per_copy=universal_copy_bits, + ) + # tQKV_layout: thread layout for QKV load + tQKV_shape_dim_1 = sQ_layout_atom.outer.shape[1] // async_copy_elems + tQKV_layout = cute.make_layout( + (self._num_threads // tQKV_shape_dim_1, tQKV_shape_dim_1), + stride=(tQKV_shape_dim_1, 1), + ) + # tO_layout: thread layout for O store + tO_layout = tQKV_layout + + # Value layouts for copies + vQKV_layout = cute.make_layout((1, async_copy_elems)) + vO_layout = vQKV_layout + + # gmem_tiled_copy_QKV: tiled copy for QKV load + gmem_tiled_copy_QKV = cute.make_tiled_copy_tv( + atom_async_copy, tQKV_layout, vQKV_layout + ) + # gmem_tiled_copy_O: tiled copy for O store + gmem_tiled_copy_O = cute.make_tiled_copy_tv( + atom_universal_copy, tO_layout, vO_layout + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled mma + # 
/////////////////////////////////////////////////////////////////////////////// + tiled_mma = cute.make_tiled_mma( + warp.MmaF16BF16Op(self._dtype, cutlass.Float32, (16, 8, 16)), + (self._num_threads // 32, 1, 1), + permutation_mnk=(self._num_threads // 32 * 16, 16, 16), + ) + + # grid_dim: (m_block, batch_size, num_head) + grid_dim = ( + cute.ceil_div(mQ.shape[1], self._m_block_size), + cute.size(mQ.shape[0]), + cute.size(mQ.shape[2]), + ) + LOG2_E = 1.4426950408889634074 + softmax_scale_log2 = softmax_scale * LOG2_E + self.kernel( + mQ, + mK, + mV, + mO, + softmax_scale_log2, + sQ_layout, + sKV_layout, + sO_layout, + gmem_tiled_copy_QKV, + gmem_tiled_copy_O, + tiled_mma, + SharedStorage, + ).launch( + grid=grid_dim, + block=[self._num_threads, 1, 1], + smem=SharedStorage.size_in_bytes(), + stream=stream, + ) + + @cute.kernel + def kernel( + self, + mQ: cute.Tensor, + mK: cute.Tensor, + mV: cute.Tensor, + mO: cute.Tensor, + softmax_scale_log2: cutlass.Float32, + sQ_layout: cute.ComposedLayout, + sKV_layout: cute.ComposedLayout, + sO_layout: cute.ComposedLayout, + gmem_tiled_copy_QKV: cute.TiledCopy, + gmem_tiled_copy_O: cute.TiledCopy, + tiled_mma: cute.TiledMma, + SharedStorage: cutlass.Constexpr, + ): + """Kernel function for flash attention v2. 
+ + :param mQ: query tensor + :type mQ: cute.Tensor + :param mK: key tensor + :type mK: cute.Tensor + :param mV: value tensor + :type mV: cute.Tensor + :param mO: output tensor + :type mO: cute.Tensor + :param softmax_scale_log2: softmax scale log2 + :type softmax_scale_log2: cutlass.Float32 + :param sQ_layout: query layout + :type sQ_layout: cute.ComposedLayout + :param sKV_layout: key/value layout + :type sKV_layout: cute.ComposedLayout + :param sO_layout: output layout + :type sO_layout: cute.ComposedLayout + :param gmem_tiled_copy_QKV: tiled copy for QKV load + :type gmem_tiled_copy_QKV: cute.TiledCopy + :param gmem_tiled_copy_O: tiled copy for O store + :type gmem_tiled_copy_O: cute.TiledCopy + :param tiled_mma: tiled mma + :type tiled_mma: cute.TiledMma + :param SharedStorage: shared storage + :type SharedStorage: cutlass.Constexpr + """ + # Thread index, block index + tidx, _, _ = cute.arch.thread_idx() + m_block, batch_size, num_head = cute.arch.block_idx() + + n_block_max = cute.ceil_div(mK.shape[1], self._n_block_size) + if self._is_causal: + n_block_max = min( + cute.ceil_div( + (m_block + 1) * self._m_block_size, + self._n_block_size, + ), + n_block_max, + ) + n_block = n_block_max - 1 + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. 
+ # /////////////////////////////////////////////////////////////////////////////// + # (m_block_size, head_dim) + gQ = cute.local_tile( + mQ[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + # (n_block_size, head_dim, n_block) + gK = cute.local_tile( + mK[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (None, 0), + ) + # (n_block_size, head_dim, n_block) + gV = cute.local_tile( + mV[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (None, 0), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Get shared memory buffer + # /////////////////////////////////////////////////////////////////////////////// + smem = cutlass.utils.SmemAllocator() + + storage = smem.allocate(SharedStorage) + sQ = storage.sQ.get_tensor(sQ_layout) + sK = storage.sK.get_tensor(sKV_layout) + sV = storage.sV.get_tensor(sKV_layout) + + # Transpose view of V to tensor with layout (head_dim, n_block_size) for tiled mma + sVt = cute.composition( + sV, + cute.make_layout( + (self._head_dim_padded, self._n_block_size), + stride=(self._n_block_size, 1), + ), + ) + + gmem_thr_copy_QKV = gmem_tiled_copy_QKV.get_slice(tidx) + # (CPY_Atom, CPY_M, CPY_K) + tQgQ = gmem_thr_copy_QKV.partition_S(gQ) + tQsQ = gmem_thr_copy_QKV.partition_D(sQ) + # (CPY_Atom, CPY_N, CPY_K, n_block) + tKgK = gmem_thr_copy_QKV.partition_S(gK) + tKsK = gmem_thr_copy_QKV.partition_D(sK) + # (CPY_Atom, CPY_N, CPY_K, n_block) + tVgV = gmem_thr_copy_QKV.partition_S(gV) + tVsV = gmem_thr_copy_QKV.partition_D(sV) + + # /////////////////////////////////////////////////////////////////////////////// + # Tile MMA compute thread partitions and allocate accumulators + # /////////////////////////////////////////////////////////////////////////////// + thr_mma = tiled_mma.get_slice(tidx) + tSrQ = thr_mma.make_fragment_A(thr_mma.partition_A(sQ)) + tSrK = 
thr_mma.make_fragment_B(thr_mma.partition_B(sK)) + tOrVt = thr_mma.make_fragment_B(thr_mma.partition_B(sVt)) + acc_shape_O = thr_mma.partition_shape_C( + (self._m_block_size, self._head_dim_padded) + ) + acc_O = cute.make_fragment(acc_shape_O, cutlass.Float32) + acc_O.fill(0.0) + + # /////////////////////////////////////////////////////////////////////////////// + # Smem copy atom tiling + # /////////////////////////////////////////////////////////////////////////////// + smem_copy_atom_Q = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=False, num_matrices=4), + self._dtype, + ) + smem_copy_atom_K = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=False, num_matrices=4), + self._dtype, + ) + smem_copy_atom_V = cute.make_copy_atom( + warp.LdMatrix8x8x16bOp(transpose=True, num_matrices=4), + self._dtype, + ) + smem_tiled_copy_Q = cute.make_tiled_copy( + smem_copy_atom_Q, + layout_tv=tiled_mma.tv_layout_A_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(2)), + ) + smem_tiled_copy_K = cute.make_tiled_copy( + smem_copy_atom_K, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + smem_tiled_copy_V = cute.make_tiled_copy( + smem_copy_atom_V, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + + smem_thr_copy_Q = smem_tiled_copy_Q.get_slice(tidx) + smem_thr_copy_K = smem_tiled_copy_K.get_slice(tidx) + smem_thr_copy_V = smem_tiled_copy_V.get_slice(tidx) + + tSsQ = smem_thr_copy_Q.partition_S(sQ) + tSrQ_copy_view = smem_thr_copy_Q.retile(tSrQ) + tSsK = smem_thr_copy_K.partition_S(sK) + tSrK_copy_view = smem_thr_copy_K.retile(tSrK) + tOsVt = smem_thr_copy_V.partition_S(sVt) + tOrVt_copy_view = smem_thr_copy_V.retile(tOrVt) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when problem_shape isn't a multiple + # of tile_shape + # 
/////////////////////////////////////////////////////////////////////////////// + # Construct identity layout for Q and KV + mcQ = cute.make_identity_tensor(mQ.layout.shape) + mcKV = cute.make_identity_tensor(mK.layout.shape) + cQ = cute.local_tile( + mcQ[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + cKV = cute.local_tile( + mcKV[batch_size, None, num_head, None], + (self._n_block_size, self._head_dim_padded), + (n_block, 0), + ) + + # Repeat the partitioning with identity layouts + tQcQ = gmem_thr_copy_QKV.partition_S(cQ) + tKVcKV = gmem_thr_copy_QKV.partition_S(cKV) + # Allocate predicate tensors for m and n, here we only allocate the tile of k, and do special process for mn. + # This is to reduce register pressure and gets 2-3% performance gain compared with allocating the whole tile. + tQpQ = cute.make_fragment( + cute.make_layout( + ( + tQsQ.shape[0][1], + cute.size(tQsQ, mode=[1]), + cute.size(tQsQ, mode=[2]), + ), + stride=(cute.size(tQsQ, mode=[2]), 0, 1), + ), + cutlass.Boolean, + ) + tKVpKV = cute.make_fragment( + cute.make_layout( + ( + tKsK.shape[0][1], + cute.size(tKsK, mode=[1]), + cute.size(tKsK, mode=[2]), + ), + stride=(cute.size(tKsK, mode=[2]), 0, 1), + ), + cutlass.Boolean, + ) + # Set predicates for head_dim bounds, seqlen_q/k bounds is processed at the first tile. 
+ for rest_v in range(tQpQ.shape[0]): + for rest_k in range(tQpQ.shape[2]): + tQpQ[rest_v, 0, rest_k] = cute.elem_less( + tQcQ[(0, rest_v), 0, rest_k][3], mQ.layout.shape[3] + ) + for rest_v in range(tKVpKV.shape[0]): + for rest_k in range(tKVpKV.shape[2]): + tKVpKV[rest_v, 0, rest_k] = cute.elem_less( + tKVcKV[(0, rest_v), 0, rest_k][3], mK.layout.shape[3] + ) + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Start async loads of the last mn-tile, where we take care of the mn residue + for m in range(cute.size(tQsQ.shape[1])): + if cute.elem_less(tQcQ[0, m, 0][1], mQ.layout.shape[1]): + cute.copy( + gmem_tiled_copy_QKV, + tQgQ[None, m, None], + tQsQ[None, m, None], + pred=tQpQ[None, m, None], + ) + else: + # Clear the smem tiles to account for predicated off loads + tQsQ[None, m, None].fill(0) + for n in range(cute.size(tKsK.shape[1])): + if cute.elem_less(tKVcKV[0, n, 0][1], mK.layout.shape[1]): + cute.copy( + gmem_tiled_copy_QKV, + tKgK[None, n, None, n_block], + tKsK[None, n, None], + pred=tKVpKV[None, n, None], + ) + else: + # Clear the smem tiles to account for predicated off loads + tKsK[None, n, None].fill(0) + + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # Softmax intermediate result: row_max and row_sum + # /////////////////////////////////////////////////////////////////////////////// + # shape: (atom_v_m * rest_m) + row_max = cute.make_fragment( + (acc_O.shape[0][0] * acc_O.shape[1]), cutlass.Float32 + ) + # shape: (atom_v_m * rest_m) + row_sum = cute.make_fragment( + (acc_O.shape[0][0] * acc_O.shape[1]), cutlass.Float32 + ) + row_max.fill(-cutlass.Float32.inf) + row_sum.fill(0.0) + + # group parameters for compute_one_n_block + basic_params = SimpleNamespace( + m_block=m_block, + n_block=n_block, + mQ=mQ, + mK=mK, + 
batch_size=batch_size, + num_head=num_head, + ) + mma_params = SimpleNamespace( + thr_mma=thr_mma, + tiled_mma=tiled_mma, + tSrQ=tSrQ, + tSrK=tSrK, + tOrVt=tOrVt, + acc_O=acc_O, + ) + gmem_copy_params = SimpleNamespace( + gmem_tiled_copy_QKV=gmem_tiled_copy_QKV, + tKVcKV=tKVcKV, + tKgK=tKgK, + tKsK=tKsK, + tVgV=tVgV, + tVsV=tVsV, + tKVpKV=tKVpKV, + ) + smem_copy_params = SimpleNamespace( + smem_tiled_copy_Q=smem_tiled_copy_Q, + smem_tiled_copy_K=smem_tiled_copy_K, + smem_tiled_copy_V=smem_tiled_copy_V, + tSsQ=tSsQ, + tSrQ_copy_view=tSrQ_copy_view, + tSsK=tSsK, + tSrK_copy_view=tSrK_copy_view, + tOsVt=tOsVt, + tOrVt_copy_view=tOrVt_copy_view, + ) + softmax_params = SimpleNamespace( + row_max=row_max, + row_sum=row_sum, + softmax_scale_log2=softmax_scale_log2, + ) + + # Start processing of the first n-block. + # For performance reason, we separate out two kinds of iterations: + # those that need masking on S, and those that don't. + # We need masking on S for the very last block when K and V has length not multiple of n_block_size. + # We also need masking on S if it's causal, for the last ceil_div(m_block_size, n_block_size) blocks. + # We will have at least 1 "masking" iteration. 
+ mask_steps = 1 + if self._is_causal: + mask_steps = cute.ceil_div(self._m_block_size, self._n_block_size) + + for n_tile in range(mask_steps): + n_block = n_block_max - n_tile - 1 + basic_params.n_block = n_block + if self._is_causal: + if n_block >= 0: + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=(n_tile == 0), + in_mask_steps=True, + ) + else: + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=True, + in_mask_steps=True, + ) + + # Start async loads of rest k-tiles in reverse order, no k-residue handling needed + for n_tile in cutlass.range_dynamic(mask_steps, n_block_max, 1): + n_block = n_block_max - n_tile - 1 + basic_params.n_block = n_block + self.compute_one_n_block( + basic_params, + mma_params, + gmem_copy_params, + smem_copy_params, + softmax_params, + is_first_n_block=False, + in_mask_steps=False, + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # /////////////////////////////////////////////////////////////////////////////// + # normalize acc_O by row_sum and calculate the lse + self.normalize_softmax(acc_O, row_sum) + # store acc_O + rO = cute.make_fragment_like(acc_O, self._dtype) + rO.store(acc_O.load().to(self._dtype)) + # reuse sQ's data iterator + sO = cute.make_tensor(sQ.iterator, sO_layout) + + # smem copy atom for O + smem_copy_atom_O = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), self._dtype + ) + # tiled copy atom for O + smem_tiled_copy_O = cute.make_tiled_copy( + smem_copy_atom_O, + layout_tv=tiled_mma.tv_layout_C_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(1)), + ) + smem_thr_copy_O = smem_tiled_copy_O.get_slice(tidx) + taccOrO = smem_thr_copy_O.retile(rO) + taccOsO = smem_thr_copy_O.partition_D(sO) + # copy acc O from rmem to smem with the smem copy atom + cute.copy( + 
smem_copy_atom_O, + taccOrO, + taccOsO, + ) + gO = cute.local_tile( + mO[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + + gmem_thr_copy_O = gmem_tiled_copy_O.get_slice(tidx) + tOsO = gmem_thr_copy_O.partition_S(sO) + tOgO = gmem_thr_copy_O.partition_D(gO) + tOrO = cute.make_fragment_like(tOgO, self._dtype) + # sync before all smem stores are done. + cute.arch.barrier() + # load acc O from smem to rmem for wider vectorization + cute.copy( + gmem_tiled_copy_O, + tOsO, + tOrO, + ) + mcO = cute.make_identity_tensor(mO.layout.shape) + cO = cute.local_tile( + mcO[batch_size, None, num_head, None], + (self._m_block_size, self._head_dim_padded), + (m_block, 0), + ) + tOcO = gmem_thr_copy_O.partition_D(cO) + tOpO = cute.make_fragment( + cute.make_layout( + (tOgO.shape[0][1], tOgO.shape[1], tOgO.shape[2]), + stride=(tOgO.shape[2], 0, 1), + ), + cutlass.Boolean, + ) + for rest_v in range(tOpO.shape[0]): + for rest_n in range(cute.size(tOpO.shape[2])): + tOpO[rest_v, 0, rest_n] = cute.elem_less( + tOcO[(0, rest_v), 0, rest_n][3], mO.layout.shape[3] + ) + # copy acc O from rmem to gmem + for rest_m in range(cute.size(tOpO.shape[1])): + if cute.elem_less(tOcO[0, rest_m, 0][1], mO.layout.shape[1]): + cute.copy( + gmem_tiled_copy_O, + tOrO[None, rest_m, None], + tOgO[None, rest_m, None], + pred=tOpO[None, rest_m, None], + ) + + @cute.jit + def compute_one_n_block( + self, + basic_params: SimpleNamespace, + mma_params: SimpleNamespace, + gmem_copy_params: SimpleNamespace, + smem_copy_params: SimpleNamespace, + softmax_params: SimpleNamespace, + is_first_n_block: cutlass.Constexpr, + in_mask_steps: cutlass.Constexpr, + ): + """Compute one n_block of S/O. + + This function provides different variants for processing the first n block versus subsequent blocks, + as well as variants for handling masked and unmasked steps. 
+ + :param basic_params: basic parameters + :type basic_params: SimpleNamespace + :param mma_params: mma parameters + :type mma_params: SimpleNamespace + :param gmem_copy_params: gmem copy parameters + :type gmem_copy_params: SimpleNamespace + :param smem_copy_params: smem copy parameters + :type smem_copy_params: SimpleNamespace + :param softmax_params: softmax parameters + :type softmax_params: SimpleNamespace + :param is_first_n_block: is first n block + :type is_first_n_block: cutlass.Constexpr + """ + acc_shape_S = mma_params.thr_mma.partition_shape_C( + (self._m_block_size, self._n_block_size) + ) + acc_S = cute.make_fragment(acc_shape_S, cutlass.Float32) + acc_S.fill(0.0) + + # wait for smem tile QK before mma calculation for S + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + # load smem tile V for O, special process for the first tile to avoid loading nan. + # The `if` here is a constexpr, won't be generated in the IR. + if is_first_n_block: + for n in range(cute.size(gmem_copy_params.tVsV.shape[1])): + if cute.elem_less( + gmem_copy_params.tKVcKV[0, n, 0][1], + basic_params.mK.layout.shape[1], + ): + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tVgV[None, n, None, basic_params.n_block], + gmem_copy_params.tVsV[None, n, None], + pred=gmem_copy_params.tKVpKV[None, n, None], + ) + else: + gmem_copy_params.tVsV[None, n, None].fill(0.0) + else: + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tVgV[None, None, None, basic_params.n_block], + gmem_copy_params.tVsV, + pred=gmem_copy_params.tKVpKV, + ) + + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # S gemm calculation + # /////////////////////////////////////////////////////////////////////////////// + # load first QK k-block from smem to rmem for mma + cute.copy( + smem_copy_params.smem_tiled_copy_Q, + smem_copy_params.tSsQ[None, None, 0], + smem_copy_params.tSrQ_copy_view[None, 
None, 0], + ) + cute.copy( + smem_copy_params.smem_tiled_copy_K, + smem_copy_params.tSsK[None, None, 0], + smem_copy_params.tSrK_copy_view[None, None, 0], + ) + # mma for S + for k in range(cute.size(smem_copy_params.tSsQ.shape[2])): + # load next QK k-block from smem to rmem for mma + k_next = (k + 1) % cute.size(smem_copy_params.tSsQ.shape[2]) + cute.copy( + smem_copy_params.smem_tiled_copy_Q, + smem_copy_params.tSsQ[None, None, k_next], + smem_copy_params.tSrQ_copy_view[None, None, k_next], + ) + cute.copy( + smem_copy_params.smem_tiled_copy_K, + smem_copy_params.tSsK[None, None, k_next], + smem_copy_params.tSrK_copy_view[None, None, k_next], + ) + cute.gemm( + mma_params.tiled_mma, + acc_S, + mma_params.tSrQ[None, None, k], + mma_params.tSrK[None, None, k], + acc_S, + ) + + # wait for smem tile V for O + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + + if basic_params.n_block > 0: + cute.copy( + gmem_copy_params.gmem_tiled_copy_QKV, + gmem_copy_params.tKgK[None, None, None, basic_params.n_block - 1], + gmem_copy_params.tKsK, + pred=gmem_copy_params.tKVpKV, + ) + cute.arch.cp_async_commit_group() + # /////////////////////////////////////////////////////////////////////////////// + # online softmax + # /////////////////////////////////////////////////////////////////////////////// + self.softmax_rescale_O( + basic_params, + mma_params, + softmax_params, + acc_S, + is_first_n_block, + in_mask_steps, + ) + + rP = cute.make_fragment_like(acc_S, self._dtype) + rP.store(acc_S.load().to(self._dtype)) + # /////////////////////////////////////////////////////////////////////////////// + # O gemm calculation + # /////////////////////////////////////////////////////////////////////////////// + # Convert layout of acc_S to gemm O accept layout. 
+ # Due to the mma instruction shape is 16x8x16, we need to convert from (4, MMA_M, MMA_N) to ((4, 2), MMA_M, MMA_N / 2) + # (4, MMA_M, MMA_N) -> (4, MMA_M, (2, MMA_N / 2)) + rP_layout_divided = cute.logical_divide(rP.layout, (None, None, 2)) + rP_mma_view = cute.make_layout( + ( + (rP_layout_divided.shape[0], rP_layout_divided.shape[2][0]), + rP_layout_divided.shape[1], + rP_layout_divided.shape[2][1], + ), + stride=( + (rP_layout_divided.stride[0], rP_layout_divided.stride[2][0]), + rP_layout_divided.stride[1], + rP_layout_divided.stride[2][1], + ), + ) + tOrS = cute.make_tensor(rP.iterator, rP_mma_view) + + # load first V k-block from smem to rmem for mma + cute.copy( + smem_copy_params.smem_tiled_copy_V, + smem_copy_params.tOsVt[None, None, 0], + smem_copy_params.tOrVt_copy_view[None, None, 0], + ) + # mma for O + for k in range(cute.size(tOrS.shape[2])): + # load next V k-block from smem to rmem for mma + k_next = (k + 1) % cute.size(tOrS.shape[2]) + cute.copy( + smem_copy_params.smem_tiled_copy_V, + smem_copy_params.tOsVt[None, None, k_next], + smem_copy_params.tOrVt_copy_view[None, None, k_next], + ) + cute.gemm( + mma_params.tiled_mma, + mma_params.acc_O, + tOrS[None, None, k], + mma_params.tOrVt[None, None, k], + mma_params.acc_O, + ) + + @cute.jit + def softmax_rescale_O( + self, + basic_params: SimpleNamespace, + mma_params: SimpleNamespace, + softmax_params: SimpleNamespace, + acc_S: cute.Tensor, + is_first_n_block: cutlass.Constexpr, + in_mask_steps: cutlass.Constexpr, + ): + """Apply online softmax and rescale acc_O. + + This function provides different variants for processing the first n block versus subsequent blocks, + as well as variants for handling masked and unmasked steps. 
+ + :param basic_params: basic parameters + :type basic_params: SimpleNamespace + :param mma_params: mma parameters + :type mma_params: SimpleNamespace + :param softmax_params: softmax parameters + :type softmax_params: SimpleNamespace + :param acc_S: acc_S tensor + :type acc_S: cute.Tensor + :param is_first_n_block: is first n_block + :type is_first_n_block: cutlass.Constexpr + :param in_mask_steps: in mask steps + :type in_mask_steps: cutlass.Constexpr + """ + # Change acc_S to M,N layout view. + acc_S_mn = self._make_acc_tensor_mn_view(acc_S) + acc_O_mn = self._make_acc_tensor_mn_view(mma_params.acc_O) + row_max_prev = None + # if it is not the first tile, load the row r of previous row_max and compare with row_max_cur_row. + if not is_first_n_block: + row_max_prev = cute.make_fragment_like( + softmax_params.row_max, cutlass.Float32 + ) + cute.basic_copy(softmax_params.row_max, row_max_prev) + # if it is the first tile, create a mask for residual of S to -inf for softmax. + tScS_mn = None + if in_mask_steps: + mcS = cute.make_identity_tensor( + ( + basic_params.mQ.shape[0], + basic_params.mQ.shape[1], + basic_params.mQ.shape[2], + basic_params.mK.shape[1], + ) + ) + cS = cute.local_tile( + mcS[basic_params.batch_size, None, basic_params.num_head, None], + (self._m_block_size, self._n_block_size), + (basic_params.m_block, basic_params.n_block), + ) + tScS = mma_params.thr_mma.partition_C(cS) + tScS_mn = self._make_acc_tensor_mn_view(tScS) + + # Each iteration processes one row of acc_S + for r in range(cute.size(softmax_params.row_max)): + # mask residual of S with -inf + if in_mask_steps: + if not self._is_causal: + # traverse column index. + for c in range(cute.size(tScS_mn.shape[1])): + if cute.elem_less( + basic_params.mK.shape[1], tScS_mn[0, c][3] + 1 + ): + acc_S_mn[r, c] = -cutlass.Float32.inf + else: + # get the column index limit based on current row. Only consider the row index, so the column index sets to 0. 
+ col_idx_limit = cutlass.min( + tScS_mn[r, 0][1] + 1, basic_params.mK.shape[1] + ) + # traverse column index. + for c in range(cute.size(tScS_mn.shape[1])): + # only consider the column index, so the row index sets to 0. + if cute.elem_less(col_idx_limit, tScS_mn[0, c][3] + 1): + acc_S_mn[r, c] = -cutlass.Float32.inf + + # (n_block_size) + acc_S_row = acc_S_mn[r, None].load() + # row_max_cur_row => f32 + row_max_cur_row = acc_S_row.reduce( + cute.ReductionOp.MAX, -cutlass.Float32.inf, 0 + ) + # quad reduction for row_max + row_max_cur_row = self._threadquad_reduce_max(row_max_cur_row) + row_max_prev_row = None + # if it is not the first tile, load the row r of previous row_max and compare with row_max_cur_row. + if not is_first_n_block: + row_max_prev_row = row_max_prev[r] + row_max_cur_row = cute.arch.fmax(row_max_prev_row, row_max_cur_row) + if self._is_causal: + row_max_cur_row = ( + 0.0 if row_max_cur_row == -cutlass.Float32.inf else row_max_cur_row + ) + + # compute exp(x - max) using exp2(x * log_2(e) - max * log_2(e)) + acc_S_row_exp = cute.TensorSSA( + self._exp2f( + acc_S_row * softmax_params.softmax_scale_log2 + - row_max_cur_row * softmax_params.softmax_scale_log2 + ), + tuple(acc_S_row.shape), + cutlass.Float32, + ) + # acc_S_row_sum => f32 + acc_S_row_sum = acc_S_row_exp.reduce( + cute.ReductionOp.ADD, cutlass.Float32.zero, 0 + ) + # if it is not the first tile, load the row r of previous row_max and minus row_max_cur_row to update row_sum. 
            # Not the first n-block: rescale the running statistics by
            # exp2((prev_max - cur_max) * scale) so that row_sum and acc_O stay
            # normalized to the *current* row maximum (online-softmax update).
            if not is_first_n_block:
                prev_minus_cur_exp = self._exp2f(
                    row_max_prev_row * softmax_params.softmax_scale_log2
                    - row_max_cur_row * softmax_params.softmax_scale_log2
                )
                # Fold the previous partial sum (rescaled) into this tile's sum.
                acc_S_row_sum = (
                    acc_S_row_sum + softmax_params.row_sum[r] * prev_minus_cur_exp
                )
                # Rescale the accumulated output row by the same factor.
                acc_O_mn[r, None] = acc_O_mn[r, None].load() * prev_minus_cur_exp
            # update row_max, row_sum and acc_S
            softmax_params.row_max[r] = row_max_cur_row
            softmax_params.row_sum[r] = acc_S_row_sum
            acc_S_mn[r, None] = acc_S_row_exp

    @cute.jit
    def normalize_softmax(
        self,
        acc_O: cute.Tensor,
        row_sum: cute.Tensor,
    ):
        """Normalize acc_O by row_sum (final softmax denominator division).

        Each row of ``acc_O`` is multiplied by ``1 / row_sum[r]`` after a
        thread-quad reduction completes the per-row sum. Rows whose sum is
        zero or NaN (fully masked rows) are left unscaled (scale = 1.0) to
        avoid producing inf/NaN in the output.

        :param acc_O: output accumulator tensor, scaled in place
        :type acc_O: cute.Tensor
        :param row_sum: per-row softmax denominators, reduced in place
        :type row_sum: cute.Tensor
        """
        # do quad reduction for row_sum.
        acc_O_mn = self._make_acc_tensor_mn_view(acc_O)
        for r in range(cute.size(row_sum)):
            row_sum[r] = self._threadquad_reduce_sum(row_sum[r])
            # if row_sum is zero or nan, set acc_O_mn_row to 1.0
            # (x != x is the standard register-level NaN test)
            acc_O_mn_row_is_zero_or_nan = row_sum[r] == 0.0 or row_sum[r] != row_sum[r]

            scale = (
                1.0 if acc_O_mn_row_is_zero_or_nan else cute.arch.rcp_approx(row_sum[r])
            )

            acc_O_mn[r, None] = acc_O_mn[r, None].load() * scale

    def _make_acc_tensor_mn_view(self, acc: cute.Tensor) -> cute.Tensor:
        """make acc tensor as mn layout view

        Regroups the MMA accumulator layout ((2, 2), MMA_M, MMA_N) into a
        2-D ((rows), (cols)) view so softmax can iterate row by row.

        :param acc: input tensor
        :type acc: cute.Tensor
        :return: acc tensor mn layout view
        :rtype: cute.Tensor
        """
        acc_layout_col_major = cute.make_layout(acc.layout.shape)
        acc_layout_mn = cute.make_layout(
            (
                (
                    acc_layout_col_major.shape[0][1],
                    acc_layout_col_major.shape[1],
                ),  # MMA_M
                (
                    acc_layout_col_major.shape[0][0],
                    acc_layout_col_major.shape[2],
                ),  # MMA_N
            ),
            stride=(
                (
                    acc_layout_col_major.stride[0][1],
                    acc_layout_col_major.stride[1],
                ),  # MMA_M
                (
                    acc_layout_col_major.stride[0][0],
                    acc_layout_col_major.stride[2],
                ),  # MMA_N
            ),
        )
        acc_layout_mn = cute.composition(acc.layout, acc_layout_mn)
        return
cute.make_tensor(acc.iterator, acc_layout_mn) + + def _threadquad_reduce(self, val: cutlass.Float32, op: Callable) -> cutlass.Float32: + """thread quad reduction + + :param val: register value + :type val: cutlass.Float32 + :param op: binary operator + :type op: Callable + :return: reduced value + :rtype: cutlass.Float32 + """ + val = op( + val, + cute.arch.shuffle_sync_bfly(val, offset=2, mask=-1, mask_and_clamp=31), + ) + val = op( + val, + cute.arch.shuffle_sync_bfly(val, offset=1, mask=-1, mask_and_clamp=31), + ) + return val + + def _threadquad_reduce_max(self, val: cutlass.Float32) -> cutlass.Float32: + """thread quad reduction max + + :param val: register value + :type val: cutlass.Float32 + :return: max value + :rtype: cutlass.Float32 + """ + return self._threadquad_reduce(val, lambda x, y: cute.arch.fmax(x, y)) + + def _threadquad_reduce_sum(self, val: cutlass.Float32) -> cutlass.Float32: + """thread quad reduction sum + + :param val: register value + :type val: cutlass.Float32 + :return: sum value + :rtype: cutlass.Float32 + """ + return self._threadquad_reduce(val, lambda x, y: x + y) + + def _exp2f( + self, x: Union[cute.TensorSSA, cutlass.Float32] + ) -> Union[cute.TensorSSA, cutlass.Float32]: + """exp2f calculation for both vector and scalar. 
+ + :param x: input value + :type x: cute.TensorSSA or cutlass.Float32 + :return: exp2 value + :rtype: cute.TensorSSA or cutlass.Float32 + """ + if isinstance(x, cute.TensorSSA): + res = cute.make_fragment(x.shape, cutlass.Float32) + res.store(x) + + for i in range(cute.size(x.shape)): + res[i] = self._exp2f(res[i]) + + return res.load() + return cute.arch.exp2(x) + + +def run_flash_attention_fwd( + dtype: Type[cutlass.Numeric], + batch_size: int, + seqlen_q: int, + seqlen_k: int, + num_head: int, + head_dim: int, + softmax_scale: float = 1.0, + m_block_size: int = 128, + n_block_size: int = 128, + num_threads: int = 128, + is_causal: bool = False, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, +): + # Skip unsupported testcase + if not FlashAttentionForwardAmpere.can_implement( + dtype, + head_dim, + m_block_size, + n_block_size, + num_threads, + is_causal, + ): + raise TypeError( + f"Unsupported testcase {dtype}, {head_dim}, {m_block_size}, {n_block_size}, {num_threads}, {is_causal}" + ) + + # Create tensor Q/K/V/O + def create_tensor( + batch_size: int, + seqlen: int, + num_head: int, + head_dim: int, + dtype: Type[cutlass.Numeric], + ) -> cute.Tensor: + # (batch_size, seqlen, num_head, head_dim) + shape = (batch_size, seqlen, num_head, head_dim) + return ( + torch.empty(*shape, dtype=torch.int32).random_(-2, 2).to(dtype=dtype).cuda() + ) + + q = create_tensor( + batch_size, seqlen_q, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + k = create_tensor( + batch_size, seqlen_k, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + v = create_tensor( + batch_size, seqlen_k, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + o = create_tensor( + batch_size, seqlen_q, num_head, head_dim, cutlass_torch.dtype(dtype) + ) + + fa2_fwd = FlashAttentionForwardAmpere( + head_dim, + m_block_size, + n_block_size, + num_threads, + is_causal, + ) + # assume input is 16B align. 
+ q_tensor = ( + from_dlpack(q, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=q.dim_order(), divisibility=(128 // dtype.width) + ) + ) + k_tensor = ( + from_dlpack(k, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=k.dim_order(), divisibility=(128 // dtype.width) + ) + ) + v_tensor = ( + from_dlpack(v, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=v.dim_order(), divisibility=(128 // dtype.width) + ) + ) + o_tensor = ( + from_dlpack(o, assumed_align=16) + .mark_layout_dynamic(leading_dim=3) + .mark_compact_shape_dynamic( + mode=3, stride_order=o.dim_order(), divisibility=(128 // dtype.width) + ) + ) + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # compile the fa2 forward pass + compiled_fa2_fwd = cute.compile( + fa2_fwd, q_tensor, k_tensor, v_tensor, o_tensor, softmax_scale, current_stream + ) + # warmup + for _ in range(warmup_iterations): + compiled_fa2_fwd( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + softmax_scale, + current_stream, + ) + # run the compiled fa2 forward pass + for _ in range(iterations): + compiled_fa2_fwd( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + softmax_scale, + current_stream, + ) + torch.cuda.synchronize() + + if skip_ref_check: + return + # reference implementation + q_ref = q.permute(0, 2, 1, 3) + k_ref = k.permute(0, 2, 1, 3) + v_ref = v.permute(0, 2, 1, 3) + torch.backends.cuda.enable_flash_sdp(enabled=True) + ref_o = torch.nn.functional.scaled_dot_product_attention( + q_ref, k_ref, v_ref, scale=softmax_scale, is_causal=is_causal + ).permute(0, 2, 1, 3) + + torch.testing.assert_close(o.cpu(), ref_o.cpu(), atol=1e-02, rtol=1e-04) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( 
+ description="example of flash attention v2 with CuTe on GPU" + ) + parser.add_argument("--dtype", type=cutlass.dtype, default=cutlass.BFloat16) + parser.add_argument("--batch_size", type=int, default=4) + parser.add_argument("--seqlen_q", type=int, default=8192) + parser.add_argument("--seqlen_k", type=int, default=8192) + parser.add_argument("--num_head", type=int, default=16) + parser.add_argument("--head_dim", type=int, default=128) + parser.add_argument("--softmax_scale", type=float, default=0.5) + parser.add_argument("--m_block_size", type=int, default=128) + parser.add_argument("--n_block_size", type=int, default=64) + parser.add_argument("--num_threads", type=int, default=128) + parser.add_argument("--is_causal", action="store_true", help="Enable causal mask") + parser.add_argument("--warmup_iterations", type=int, default=3) + parser.add_argument("--iterations", type=int, default=10) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference check" + ) + + args = parser.parse_args() + run_flash_attention_fwd( + args.dtype, + args.batch_size, + args.seqlen_q, + args.seqlen_k, + args.num_head, + args.head_dim, + args.softmax_scale, + args.m_block_size, + args.n_block_size, + args.num_threads, + args.is_causal, + ) + + print("PASS") diff --git a/examples/python/CuTeDSL/ampere/sgemm.py b/examples/python/CuTeDSL/ampere/sgemm.py new file mode 100644 index 00000000..a4a032b4 --- /dev/null +++ b/examples/python/CuTeDSL/ampere/sgemm.py @@ -0,0 +1,780 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import time +from typing import Tuple + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.runtime import from_dlpack + +""" +A dense FP32 SIMT GEMM (C = A * B) example using CUTE DSL. +- Matrix A is MxK, A can be row-major("K") or column-major("M") +- Matrix B is NxK, B can be row-major("N") or column-major("K") +- Matrix C is MxN, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes FPU for matrix multiply-accumulate (MMA) operations + - Use multistage pipeline to overlap computation and memory access + * Shared memory pipeline: hides gmem-to-smem latency. 
+ * Register pipeline: overlaps shared memory-to-register transfers with + computations and eliminates false data dependencies for + better parallelism. + - Use vectorized copies + - Add padding to reduce bank conflicts in global -> shared memory copies + - Use predication to avoid unnecessary copies or copies of stale data + +This GEMM works as follows: +1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies. +2. Perform matrix multiply-accumulate (MMA) operations using simple fused multiply-add atomics. +3. Store results from registers (RMEM) to global memory (GMEM). + +To run this example: + +.. code-block:: bash + + python examples/ampere/sgemm.py \ + --mnk 8192,8192,8192 \ + --a_major m --b_major n --c_major n + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/ampere/sgemm.py \ + --mnk 8192,8192,8192 \ + --a_major m --b_major n --c_major n \ + --skip_ref_check --iterations 2 + +Constraints: +* Supported input, output, and accumulator data types: fp32 +* Default tile shape is set to be 128x128x8 +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned +""" + + +class SGemm: + def __init__( + self, + cta_tiler: Tuple[int, int, int] = (128, 128, 8), + num_stages: int = 3, + num_threads: int = 256, + ): + self._cta_tiler = cta_tiler + self._num_stages = num_stages + self._num_threads = num_threads + assert num_threads > 0, "needs at least one thread" + assert num_threads % 16 == 0, "multiples of 16 required for MMA thread layout" + + self._bM, self._bN, self._bK = self._cta_tiler + assert self._bM % 16 == 0, "multiple of 16 required for tile dimension M" + assert self._bN % 16 == 0, "multiple of 16 required for tile dimension N" + assert self._num_stages >= 3, "num_stages must be greater than or equal to 3" + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + 
self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for shared memory for A and B: + # - sA/sB is m/n-major to vectorized copies from shared + # memory to registers. This is because the MMA layouts + # for sA/sB are also m/n-major + # - When gA/gB is k-major, pad 4 elements to reduce bank conflicts + # /////////////////////////////////////////////////////////////////////////////// + + padding_a = 4 if self.a_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + padding_b = 4 if self.b_major_mode == utils.LayoutEnum.ROW_MAJOR else 0 + sA_layout = cute.make_layout( + (self._bM, self._bK, self._num_stages), + stride=(1, (self._bM + padding_a), self._bK * (self._bM + padding_a)), + ) + sB_layout = cute.make_layout( + (self._bN, self._bK, self._num_stages), + stride=(1, (self._bN + padding_b), self._bK * (self._bN + padding_b)), + ) + + smem_size = cute.size_in_bytes(mA.element_type, sA_layout) + cute.size_in_bytes( + mB.element_type, sB_layout + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Create copy layouts that will be used for asynchronous + # global memory -> shared memory copies: + # - The majorness of tA/tB follows the majorness of gA/gB + # - For k-major, these layouts will copy values one-by-one from + # from global memory, without vectorizing + # - For m/n-major, it will vectorize to a 128bit copy for faster + # data transfer between global and shared memory, as long + # as the alignment of the tensor allows it. 
Otherwise, it + # defaults to a non-vectorized copy + # /////////////////////////////////////////////////////////////////////////////// + + tA = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + tB = cute.make_layout( + (self._num_threads // self._bK, self._bK), stride=(self._bK, 1) + ) + vA = cute.make_layout((1, 1)) + vB = cute.make_layout((1, 1)) + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width, + ) + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width, + ) + + if self.a_major_mode == utils.LayoutEnum.COL_MAJOR: + num_vectorized = 4 if (mA.layout.max_alignment % 16 == 0) else 1 + atom_async_copy_A = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mA.element_type.width * num_vectorized, + ) + major_mode_size = self._bM // num_vectorized + tA = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vA = cute.make_layout((num_vectorized, 1)) + + if self.b_major_mode == utils.LayoutEnum.COL_MAJOR: + num_vectorized = 4 if (mB.layout.max_alignment % 16 == 0) else 1 + atom_async_copy_B = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp(), + mA.element_type, + num_bits_per_copy=mB.element_type.width * num_vectorized, + ) + major_mode_size = self._bN // num_vectorized + tB = cute.make_layout( + (major_mode_size, self._num_threads // major_mode_size), + stride=(1, major_mode_size), + ) + vB = cute.make_layout((num_vectorized, 1)) + + tiled_copy_A = cute.make_tiled_copy_tv(atom_async_copy_A, tA, vA) + tiled_copy_B = cute.make_tiled_copy_tv(atom_async_copy_B, tB, vB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create layouts for GEMM: + # We tile an MMA atom across a tensor. 
`atoms_layout` is the layout + # of atoms in the tiled MMA. (Because we use an `MmaUniversalOp`, + # which has a trivial 1x1x1 MMA trait, `atoms_layout` is also + # simply the thread layout for C.) `permutation_tiler` reorders the + # elements of the tensor that the tiled MMA is applied to. + # Different combinations of `atoms_layout` and `permutation_tiler` + # values can create different MMA thread-value patterns. + # + # Here, the MMA layout is set so that each thread copies four + # consecutive elements from shared memory to registers. + # `permutation_tiler_M/N` maps the elements handled by each thread + # to the permuted element in the tensor. + # For increasing indices in the tensor, the thread ID that reads it is: + # - (without permutation) ==> + # 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 0 1 2 ... 15 ...... + # - (with permutation) ==> + # 0 0 0 0 1 1 1 1 2 2 2 2 ... 15 15 15 15 0 0 0 0 1 1 1 1 ...... + # /////////////////////////////////////////////////////////////////////////////// + atoms_layout = cute.make_layout( + (self._num_threads // 16, 16, 1), stride=(16, 1, 0) + ) + if self.c_major_mode == utils.LayoutEnum.COL_MAJOR: + atoms_layout = cute.make_layout( + (16, self._num_threads // 16, 1), stride=(1, 16, 0) + ) + op = cute.nvgpu.MmaUniversalOp(cutlass.Float32) + permutation_tiler_M = cute.make_layout( + (atoms_layout.shape[0], 4), stride=(4, 1) + ) + permutation_tiler_N = cute.make_layout( + (atoms_layout.shape[1], 4), stride=(4, 1) + ) + tiled_mma = cute.make_tiled_mma( + op, + atoms_layout, + permutation_mnk=(permutation_tiler_M, permutation_tiler_N, None), + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, 1) + grid_dim = *cute.ceil_div(mC.shape, (self._bM, self._bN)), 1 + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + tiled_copy_A, + tiled_copy_B, + tiled_mma, + epilogue_op, + ).launch( + grid=grid_dim, + block=[cute.size(atoms_layout), 1, 1], + smem=smem_size, + ) + + @cute.kernel + def kernel( + self, + mA: 
cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.Layout, + sB_layout: cute.Layout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_mma: cute.TiledMma, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread and block indices + tidx, tidy, tidz = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + tiler_coord = (bidx, bidy, None) + thr_mma = tiled_mma.get_slice(tidx) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_K, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + gB = cute.local_tile( + mB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + gC = cute.local_tile( + mC, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, 1, None) + ) + + # Move the pointer of gA/gB in the `-k`` direction, making the first + # tile (instead of the last one) irregular in shape when k is irregular. + # We first handle the irregular tile to avoid checking for this + # condition within the mainloop. + residue_k = mA.shape[1] - cutlass.Int32(self._bK) * gA.shape[2] + gA = cute.domain_offset((0, residue_k, 0), gA) + gB = cute.domain_offset((0, residue_k, 0), gB) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread. 
+ # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffer + smem = cutlass.utils.SmemAllocator() + sA = smem.allocate_tensor(mA.element_type, sA_layout, 16) + sB = smem.allocate_tensor(mB.element_type, sB_layout, 16) + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when the problem shape + # isn't a multiple of the tile shape. If tApA/B[i] is 0, then do not + # do the copy atom associated with index i. + # cA: (BLK_M, BLK_K) => (blk_m, blk_k) + # cB: (BLK_N, BLK_K) => (blk_n, blk_k) + # tAcA: (CPY, CPY_M, CPY_K) => (blk_m, blk_k) + # tBcB: (CPY, CPY_N, CPY_K) => (blk_n, blk_k) + # tApA: (rest_v, CPY_M, CPY_K), stride=(..., ..., 0) + # tBpB: (rest_v, CPY_N, CPY_K), stride=(..., ..., 0) + # CPY = (atom_v, rest_v) + # /////////////////////////////////////////////////////////////////////////////// + # Construct identity layout for sA and sB, used for predication + mcA = cute.make_identity_tensor(mA.shape) + mcB = cute.make_identity_tensor(mB.shape) + cA = cute.local_tile( + mcA, tiler=self._cta_tiler, coord=tiler_coord, proj=(1, None, 1) + ) + cB = cute.local_tile( + mcB, tiler=self._cta_tiler, coord=tiler_coord, proj=(None, 1, 1) + ) + cA = cute.domain_offset((0, residue_k, 0), cA) + cB = cute.domain_offset((0, residue_k, 0), cB) + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + # Allocate predicate tensors for m and n + tApA = 
cute.make_fragment( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=(cute.size(tAsA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Allocate predicate tensors for m, n and k for residue k-tile + tApA_residue_k = cute.make_fragment( + cute.make_layout( + ( + tAsA.shape[0][1], + cute.size(tAsA, mode=[1]), + cute.size(tAsA, mode=[2]), + ), + stride=( + cute.size(tAsA, mode=[1]) * cute.size(tAsA, mode=[2]), + cute.size(tAsA, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + tBpB_residue_k = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=( + cute.size(tBsB, mode=[1]) * cute.size(tBsB, mode=[2]), + cute.size(tBsB, mode=[2]), + 1, + ), + ), + cutlass.Boolean, + ) + # Set predicates for m/n bounds for mainloop + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # Set predicates for m/n/k bounds for residue k tile + for rest_v in range(tApA_residue_k.shape[0]): + for m in range(tApA_residue_k.shape[1]): + for k in range(tApA_residue_k.shape[2]): + coord_A = tAcA[(0, rest_v), m, k, 0] + tApA_residue_k[rest_v, m, k] = cute.elem_less( + (coord_A[0], cutlass.Int32(-1)), (mA.shape[0], coord_A[1]) + ) + for rest_v in range(tBpB_residue_k.shape[0]): + for n in range(tBpB_residue_k.shape[1]): + for k in range(tBpB_residue_k.shape[2]): + coord_B = tBcB[(0, rest_v), n, k, 0] + tBpB_residue_k[rest_v, n, k] = cute.elem_less( + (coord_B[0], cutlass.Int32(-1)), 
(mB.shape[0], coord_B[1]) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Start async loads for 0th k-tile, where we take care of the k-residue + k_pipe_max = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + gmem_pipe_read = cutlass.Int32(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, 0], + pred=tApA_residue_k, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, 0], + pred=tBpB_residue_k, + ) + cute.arch.cp_async_commit_group() + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + # Start async loads for 1st k-tile onwards, no k-residue handling needed + for k_tile in range(1, k_pipe_max - 1): + if k_tile < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(0) + ) + cute.arch.cp_async_commit_group() + + # all tiles have been copied from global memory, so clear the + # predicate tensor + if k_tile_count < k_pipe_max: + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cutlass.Boolean(0) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cutlass.Boolean(0) + + # /////////////////////////////////////////////////////////////////////////////// + # Define A/B partitioning and C accumulators. 
+ # /////////////////////////////////////////////////////////////////////////////// + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # Current pipe index in smem to read from / write to + smem_pipe_read = cutlass.Int32(0) + smem_pipe_write = cutlass.Int32(k_pipe_max - 1) + + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + k_block_max = cute.size(tCrA, mode=[2]) + + if k_block_max > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(k_pipe_max - 2) + cute.arch.barrier() + # Prefetch the first rmem from the first k-tile + cute.autovec_copy(tCsA_p[None, None, 0], tCrA[None, None, 0]) + cute.autovec_copy(tCsB_p[None, None, 0], tCrB[None, None, 0]) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. 
The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. + # /////////////////////////////////////////////////////////////////////////////// + + for _ in cutlass.range_dynamic(k_tile_count, unroll=1): + for k_block in range(k_block_max): + if k_block == k_block_max - 1: + tCsA_p = tCsA[None, None, None, smem_pipe_read] + tCsB_p = tCsB[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(k_pipe_max - 2) + cute.arch.barrier() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % k_block_max # static + cute.autovec_copy( + tCsA_p[None, None, k_block_next], + tCrA[None, None, k_block_next], + ) + cute.autovec_copy( + tCsB_p[None, None, k_block_next], + tCrB[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and + # compute instructions, we intentionally use the sequence: + # copy A, perform GEMM, then copy B. 
+ if k_block == 0: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, gmem_pipe_read], + tAsA[None, None, None, smem_pipe_write], + # Use predicates because the m-mode may be irregular + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, gmem_pipe_read], + tBsB[None, None, None, smem_pipe_write], + # Use predicates because the n-mode may be irregular + pred=tBpB, + ) + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == k_pipe_max: + smem_pipe_read = cutlass.Int32(0) + # After copying all tiles, we avoid clearing the predicate + # tensor in the `mainloop` to prevent increasing its + # instruction count. Instead, we continue copying the + # first tile, though it won't be used. The 0-th tile is not + # copied due to its irregular shape, which could lead to + # illegal memory accesses. + gmem_pipe_read = ( + gmem_pipe_read + 1 + if gmem_pipe_read + 1 < k_tile_count + else cutlass.Int32(1) + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # Applies the epilogue operation to the accumulated results and copies + # them without vectorization. 
+ # /////////////////////////////////////////////////////////////////////////////// + cute.arch.cp_async_wait_group(0) + cute.arch.barrier() + tCrC.store(epilogue_op(tCrC.load())) + + # predicate + cC = cute.make_identity_tensor(gC.shape) + tCpC = thr_mma.partition_C(cC) + predC = cute.make_fragment(tCrC.layout, cutlass.Boolean) + residue_m = mC.shape[0] - cutlass.Int32(self._bM) * bidx + residue_n = mC.shape[1] - cutlass.Int32(self._bN) * bidy + for i in range(cute.size(tCrC.shape)): + predC[i] = cute.elem_less(tCpC[i], (residue_m, residue_n)) + numIterM = cute.size(tCrC, mode=[1]) + numIterN = cute.size(tCrC, mode=[2]) + atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), mC.element_type) + cute.copy(atom, tCrC, tCgC, pred=predC) + return + + +def main( + a_major: str, + b_major: str, + c_major: str, + problem_shape: Tuple[int, int, int], + warmup_iterations: int = 2, + iterations: int = 100, + skip_ref_check: bool = False, +): + torch.manual_seed(1024) + M, N, K = problem_shape + + # Create and permute tensor A/B/C + def create_and_permute_tensor(mode0, mode1, is_mode0_major, dtype): + # is_mode0_major: (mode1, mode0) -> (mode0, mode1) + # else: (mode0, mode1) -> (mode0, mode1) + shape = (mode1, mode0) if is_mode0_major else (mode0, mode1) + permute_order = (1, 0) if is_mode0_major else (0, 1) + + return ( + torch.empty(*shape, dtype=torch.int32) + .random_(-5, 5) + .to(dtype=dtype) + .permute(permute_order) + .cuda() + ) + + a = create_and_permute_tensor(M, K, a_major == "m", torch.float32) + b = create_and_permute_tensor(N, K, b_major == "n", torch.float32) + c = create_and_permute_tensor(M, N, c_major == "m", torch.float32) + + divisibility_a = a.shape[1] if a_major == "k" else a.shape[0] + divisibility_b = b.shape[1] if b_major == "k" else b.shape[0] + divisibility_c = c.shape[1] if c_major == "n" else c.shape[0] + + a_tensor = ( + from_dlpack(a, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if a_major == "k" else 0)) + 
.mark_compact_shape_dynamic( + mode=(1 if a_major == "k" else 0), + divisibility=divisibility_a, + ) + ) + + b_tensor = ( + from_dlpack(b, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if b_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if b_major == "k" else 0), + divisibility=divisibility_b, + ) + ) + + c_tensor = ( + from_dlpack(c, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if c_major == "n" else 0), + divisibility=divisibility_c, + ) + ) + + sgemm = SGemm() + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + gemm = cute.compile(sgemm, a_tensor, b_tensor, c_tensor) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + print("Executing GEMM kernel...") + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + # Create CUDA events for timing + start_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + end_event = cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT)[1] + + # Warmup + for _ in range(warmup_iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Use the current stream for CUDA events instead of the default stream + # Record start event + cuda.cuEventRecord(start_event, current_stream) + + # Execute the kernel + for _ in range(iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Record end event + cuda.cuEventRecord(end_event, current_stream) + cuda.cuEventSynchronize(end_event) + + # Calculate elapsed time + err, elapsed_time = cuda.cuEventElapsedTime(start_event, end_event) + + # Print execution results + print(f"Kernel execution time: {elapsed_time / iterations:.4f} ms") + + # Destroy events + cuda.cuEventDestroy(start_event) + cuda.cuEventDestroy(end_event) + + if not skip_ref_check: + 
print("Verifying results...") + ref = torch.einsum("mk,nk->mn", a, b) + torch.testing.assert_close(c.cpu(), ref.cpu(), atol=1e-03, rtol=1e-05) + print("Results verified successfully!") + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser() + parser.add_argument( + "--mnk", type=parse_comma_separated_ints, default=(256, 256, 64) + ) + parser.add_argument("--a_major", choices=["k", "m"], default="k") + parser.add_argument("--b_major", choices=["k", "n"], default="k") + parser.add_argument("--c_major", choices=["n", "m"], default="n") + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + + args = parser.parse_args() + print("Running SIMT GEMM example:") + main( + args.a_major, + args.b_major, + args.c_major, + args.mnk, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/ampere/tensorop_gemm.py b/examples/python/CuTeDSL/ampere/tensorop_gemm.py new file mode 100644 index 00000000..cc93f93d --- /dev/null +++ b/examples/python/CuTeDSL/ampere/tensorop_gemm.py @@ -0,0 +1,968 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import math +import time +from typing import Tuple, Type + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.torch as cutlass_torch +import cutlass.utils as utils +from cutlass.cute.runtime import from_dlpack + +""" +A dense GEMM (C = A * B) example for the NVIDIA Ampere architecture using CUTE DSL. 
+- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M")
+- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K")
+- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M")
+
+This GEMM kernel supports the following features:
+    - Utilizes Ampere's tensor cores for matrix multiply-accumulate (MMA) operations
+    - Supports multi-stage pipeline to overlap computation and memory access
+    - Implements shared memory buffering for epilogue to increase coalesced global memory access
+
+This GEMM works as follows:
+1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using asynchronous copies.
+2. Perform matrix multiply-accumulate (MMA) operations.
+3. Store results from registers (RMEM) to shared memory (SMEM), then to global memory (GMEM).
+
+The Ampere tensor core instruction used operates as follows:
+- Read matrix A from SMEM
+- Read matrix B from SMEM
+- Perform MMA operation and store the result in Accumulator(register)
+
+To run this example:
+
+.. code-block:: bash
+
+    python examples/ampere/tensorop_gemm.py \
+        --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \
+        --ab_dtype Float16 \
+        --c_dtype Float16 --acc_dtype Float32 \
+        --a_major m --b_major n --c_major n
+
+The above example command computes with M=8192, N=8192, K=8192,
+batch_count=1. The atom layout's shape is 2x2x1 and the input, mma
+accumulator, and output data type are set as fp16, fp32 and fp16,
+respectively.
+
+To collect performance with NCU profiler:
+
+..
code-block:: bash + + ncu python examples/ampere/tensorop_gemm.py \ + --mnkl 8192,8192,8192,1 --atom_layout_mnk 2,2,1 \ + --ab_dtype Float16 \ + --c_dtype Float16 --acc_dtype Float32 \ + --a_major m --b_major n --c_major n \ + --skip_ref_check --iterations 2 + +Constraints: +* Supported input and output data types: fp16 +* Support accumulator data types: f32 +* Default tile shape is set to be 128x128x32 +* Atom layout's MNK shape is set so that tile shape can be divided by MMA + instruction shape +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 8 +""" + + +class TensorOpGemm: + def __init__( + self, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + atom_layout_mnk: Tuple[int, int, int], + ): + self.ab_dtype = ab_dtype + self.c_dtype = c_dtype + self.acc_dtype = acc_dtype + self.cta_tiler = (128, 128, 32) + self.num_stages = 3 + self.atom_layout_mnk = atom_layout_mnk + atom_lay_M, atom_lay_N, atom_lay_K = self.atom_layout_mnk + self.num_threads = atom_lay_M * atom_lay_N * atom_lay_K * 32 + + self.bM, self.bN, self.bK = self.cta_tiler + self.mma_inst_shape = (16, 8, 16) + mmaM, mmaN, mmaK = self.mma_inst_shape + + assert ( + self.bM % (atom_lay_M * mmaM) == 0 + ), "bM must be divisible by MMA instruction" + assert ( + self.bN % (atom_lay_N * mmaN) == 0 + ), "bN must be divisible by MMA instruction" + assert atom_lay_K == 1, "this example does not support atom layout K > 1" + assert self.bK % mmaK == 0, "bK must be divisible by MMA instruction" + assert self.num_stages >= 3, "num_stages must be greater than or equal to 3" + + @cute.jit + def __call__( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # The grid divides the problems's M, N, and L dimensions by the + # respective modes of the tile shape (bM, bN, 1). 
The K dimension is + # handled within a block via a multistage process. + + self.a_major_mode = utils.LayoutEnum.from_tensor(mA) + self.b_major_mode = utils.LayoutEnum.from_tensor(mB) + self.c_major_mode = utils.LayoutEnum.from_tensor(mC) + + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory layout: + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a layout with the size required for the provided tile + # size and num stages (stages are used for K dimension) that is also + # sectioned into 64x8 or 8x32 layout atoms. The swizzle is set so that + # the atom for the shared memory -> register copy does not encounter + # bank conflicts + + # assume the input is 16B align + ab_copy_bits = 128 + sA_layout = self._make_smem_layout_AB( + mA.element_type, + self.a_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[2], self.num_stages), + ) + sB_layout = self._make_smem_layout_AB( + mB.element_type, + self.b_major_mode, + ab_copy_bits, + (self.cta_tiler[1], self.cta_tiler[2], self.num_stages), + ) + + # Creates a similar layout but without num_stages or layout atoms + sC_layout = self._make_smem_layout_C( + mC.element_type, + self.c_major_mode, + ab_copy_bits, + (self.cta_tiler[0], self.cta_tiler[1]), + ) + + # Shared memory allocated for operations with A, B will be + # overwritten for operations on C. This is to improve performance + # by reducing the size of shared memory requested by each block + smem_size = max( + cute.size_in_bytes(mC.element_type, sC_layout), + cute.size_in_bytes(mA.element_type, sA_layout) + + cute.size_in_bytes(mB.element_type, sB_layout), + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled copy: + # The majorness of tA/tB/tC follows the majorness of gA/gB/gC, + # enabling merged accesses to global memory for faster data + # transfer between global and shared memory. 
+ # /////////////////////////////////////////////////////////////////////////////// + + # Create a copy atom for a global to shared memory asynchronous copy + atom_async_copy = cute.make_copy_atom( + cute.nvgpu.cpasync.CopyG2SOp( + cache_mode=cute.nvgpu.cpasync.LoadCacheMode.GLOBAL + ), + mA.element_type, + num_bits_per_copy=ab_copy_bits, + ) + + # Create thread layouts for tiled copy from the copy atom where the + # thread layout simply follows the leading dimension of the tensor + tiled_copy_A = self._make_gmem_tiled_copy_AB( + atom_async_copy, mA.element_type, self.a_major_mode, ab_copy_bits + ) + tiled_copy_B = self._make_gmem_tiled_copy_AB( + atom_async_copy, mB.element_type, self.b_major_mode, ab_copy_bits + ) + + # Creates a synchonous copy atom and thread layouts for the epilogue + c_copy_bits = 128 + atom_sync_copy = cute.make_copy_atom( + cute.nvgpu.CopyUniversalOp(), + mC.element_type, + num_bits_per_copy=c_copy_bits, + ) + tiled_copy_C = self._make_gmem_tiled_copy_C( + atom_sync_copy, mC.element_type, self.c_major_mode, c_copy_bits + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Tiled MMA + # /////////////////////////////////////////////////////////////////////////////// + + # Creates a mma atom with 16x8x16 shape for MNK + op = cute.nvgpu.warp.MmaF16BF16Op( + self.ab_dtype, self.acc_dtype, self.mma_inst_shape + ) + + permutation_mnk = ( + self.atom_layout_mnk[0] * self.mma_inst_shape[0], + # if atom layout's N-mode is 1, to leverage the largest coalesced + # shared memory -> register copy, set the tiled mma's N mode to 16 + self.atom_layout_mnk[1] * self.mma_inst_shape[1] * 2, + self.atom_layout_mnk[2] * self.mma_inst_shape[2], + ) + + # Created a tiled mma that tiles the atom according to specified layout. 
+ # For a 2x2x1 atom layout, the mma atom is duplicated 4 times, twice + # across M and twice across N + tC = cute.make_layout(self.atom_layout_mnk) + tiled_mma = cute.make_tiled_mma( + op, + tC, + permutation_mnk=permutation_mnk, + ) + + # grid_dim: ((m + BLK_M - 1) // BLK_M, (n + BLK_N - 1) // BLK_N, l) + grid_dim = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + + self.kernel( + mA, + mB, + mC, + sA_layout, + sB_layout, + sC_layout, + tiled_copy_A, + tiled_copy_B, + tiled_copy_C, + tiled_mma, + epilogue_op, + ).launch( + grid=grid_dim, + block=[self.num_threads, 1, 1], + smem=smem_size, + ) + + @cute.kernel + def kernel( + self, + mA: cute.Tensor, + mB: cute.Tensor, + mC: cute.Tensor, + sA_layout: cute.ComposedLayout, + sB_layout: cute.ComposedLayout, + sC_layout: cute.ComposedLayout, + tiled_copy_A: cute.TiledCopy, + tiled_copy_B: cute.TiledCopy, + tiled_copy_C: cute.TiledCopy, + tiled_mma: cute.TiledMma, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + # Thread index, block index + tidx, _, _ = cute.arch.thread_idx() + bidx, bidy, bidz = cute.arch.block_idx() + tiler_coord = (bidx, bidy, None) + + # /////////////////////////////////////////////////////////////////////////////// + # Get the appropriate tiles for this thread block. + # gA: (BLK_M, BLK_N, k), gB: (BLK_N, BLK_K, k), gC: (BLK_M, BLK_N) + # /////////////////////////////////////////////////////////////////////////////// + gA = cute.local_tile( + mA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + gB = cute.local_tile( + mB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + gC = cute.local_tile( + mC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + + # By default, if the tensor k mode does not divide into the tile k + # size, then last tiles in the k dimension are irregular. + # Instead, make the first tiles irregular when k is irregular. 
+ # This allows us to handle the irregular tile first to avoid + # checking for this condition within the mainloop. + + # residual_k is a negative number indicating the amount needed to + # shift the pointer by in dimension k + residual_k = cute.size(mA, mode=[1]) - cutlass.Int32(self.bK) * cute.size( + gA, mode=[2] + ) + + # move the pointer of gA/gB in the `-k` direction + gA = cute.domain_offset((0, residual_k, 0), gA) + gB = cute.domain_offset((0, residual_k, 0), gB) + # input is 16B aligned + gA = cute.make_tensor(gA.iterator.align(16), gA.layout) + gB = cute.make_tensor(gB.iterator.align(16), gB.layout) + + # Construct identity layout for sA and sB (mirrors global tensors, + # used for predication only) + mcA = cute.make_identity_tensor(mA.layout.shape) + mcB = cute.make_identity_tensor(mB.layout.shape) + cA = cute.local_tile( + mcA[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, None, 1), + ) + cB = cute.local_tile( + mcB[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(None, 1, 1), + ) + + cA = cute.domain_offset((0, residual_k, 0), cA) + cB = cute.domain_offset((0, residual_k, 0), cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Create shared memory buffers and get the appropriate fragments for this thread. 
+ # sA: (BLK_M, BLK_K, PIPE) , sB: (BLK_N, BLK_K, PIPE) + # tAgA: (CPY, CPY_M, CPY_K, k) , tBgB: (CPY, CPY_N, CPY_K, k) + # tAsA: (CPY, CPY_M, CPY_K, PIPE) , tBsB: (CPY, CPY_N, CPY_K, PIPE) + # /////////////////////////////////////////////////////////////////////////////// + # Shared memory buffer + smem = cutlass.utils.SmemAllocator() + + sA = smem.allocate_tensor(mA.element_type, sA_layout, 16) + sB = smem.allocate_tensor(mB.element_type, sB_layout, 16) + sC = cute.make_tensor( + cute.recast_ptr(sA.iterator, dtype=self.c_dtype), sC_layout + ) + + thr_copy_A = tiled_copy_A.get_slice(tidx) + thr_copy_B = tiled_copy_B.get_slice(tidx) + thr_copy_C = tiled_copy_C.get_slice(tidx) + tAgA = thr_copy_A.partition_S(gA) + tAsA = thr_copy_A.partition_D(sA) + tBgB = thr_copy_B.partition_S(gB) + tBsB = thr_copy_B.partition_D(sB) + tCsC_epilogue = thr_copy_C.partition_S(sC) + tCgC_epilogue = thr_copy_C.partition_D(gC) + + # Repeat the partitioning with identity layouts + tAcA = thr_copy_A.partition_S(cA) + tBcB = thr_copy_B.partition_S(cB) + + # /////////////////////////////////////////////////////////////////////////////// + # Predicate: Mark indices that need to copy when problem_shape isn't a multiple + # of tile_shape + # /////////////////////////////////////////////////////////////////////////////// + + # For predication over the tensors A (M/K), B (N/K), and (in the + # epilogue) C (M/N), we will compute it in a fashion similar to an + # outer product. The predication along one of the dimensions is + # evaluated and stored in a predication tensor. Then, the + # predication for the remaining dimension is handled later via an + # if/else branch at the copy. + # For A and B, predication booleans along M/N are stored in a + # predication tensor and along K is handled via a if/else branch. + + # Allocate predicate tensors for M and N. 
Predication is checked + # at the granularity of a copy atom, so the predicate tensor does not + # need separate booleans for individual elements within a copy + # atom (for example, the elements of tAgA.shape[0][0].) + tApA = cute.make_fragment( + cute.make_layout( + ( + tAgA.shape[0][1], + cute.size(tAgA, mode=[1]), + cute.size(tAgA, mode=[2]), + ), + stride=(cute.size(tAgA, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + tBpB = cute.make_fragment( + cute.make_layout( + ( + tBsB.shape[0][1], + cute.size(tBsB, mode=[1]), + cute.size(tBsB, mode=[2]), + ), + stride=(cute.size(tBsB, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + # Set predicates for M/N bounds + for rest_v in range(tApA.shape[0]): + for m in range(tApA.shape[1]): + tApA[rest_v, m, 0] = cute.elem_less( + tAcA[(0, rest_v), m, 0, 0][0], mA.shape[0] + ) + for rest_v in range(tBpB.shape[0]): + for n in range(tBpB.shape[1]): + tBpB[rest_v, n, 0] = cute.elem_less( + tBcB[(0, rest_v), n, 0, 0][0], mB.shape[0] + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Prefetch Prologue + # /////////////////////////////////////////////////////////////////////////////// + # Clear the smem tiles to account for predicated off loads + tAsA.fill(0) + tBsB.fill(0) + cute.arch.sync_threads() + # Start async loads for the first k-tile. Here we take care of the k residue + # via if/else check along the k dimension. 
Because we shifted the identity tensor + # by the residue_k and because the identity tensor is a counting tensor, the + # values of any identity tensor element that is poison is less than -1 + num_smem_stages = cute.size(tAsA, mode=[3]) + k_tile_count = cute.size(tAgA, mode=[3]) + k_tile_index = cutlass.Int32(0) + + for k in range(tApA.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tAcA[0, 0, k, 0][1]): + cute.copy( + tiled_copy_A, + tAgA[None, None, k, k_tile_index], + tAsA[None, None, k, 0], + pred=tApA[None, None, k], + ) + for k in range(tBpB.shape[2]): + if cute.elem_less(cutlass.Int32(-1), tBcB[0, 0, k, 0][1]): + cute.copy( + tiled_copy_B, + tBgB[None, None, k, k_tile_index], + tBsB[None, None, k, 0], + pred=tBpB[None, None, k], + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # Start async loads for rest of the k-tiles + for k_tile in range(1, num_smem_stages - 1): + if k_tile == k_tile_count: + tApA.fill(0) + tBpB.fill(0) + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, k_tile], + pred=tApA, + ) + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, k_tile], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + + # /////////////////////////////////////////////////////////////////////////////// + # Tile MMA compute thread partitions and allocate accumulators + # /////////////////////////////////////////////////////////////////////////////// + thr_mma = tiled_mma.get_slice(tidx) + tCsA = thr_mma.partition_A(sA) + tCsB = thr_mma.partition_B(sB) + tCsC = thr_mma.partition_C(sC) + tCgC = thr_mma.partition_C(gC) + tCrA = tiled_mma.make_fragment_A(tCsA[None, None, None, 0]) + tCrB = tiled_mma.make_fragment_B(tCsB[None, None, None, 0]) + tCrC = tiled_mma.make_fragment_C(tCgC) + # Clear the accumulator + tCrC.fill(0.0) + + # /////////////////////////////////////////////////////////////////////////////// + # Copy Atom 
A/B retiling + # /////////////////////////////////////////////////////////////////////////////// + + # Create the copy atoms for the copy from shared memory to register + atom_copy_s2r_A = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.a_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mA.element_type, + ) + atom_copy_s2r_B = cute.make_copy_atom( + cute.nvgpu.warp.LdMatrix8x8x16bOp( + self.b_major_mode != utils.LayoutEnum.ROW_MAJOR, 4 + ), + mB.element_type, + ) + + # Creates the tiled copy so that it matches the thread-value layout + # expected by the tiled mma + tiled_copy_s2r_A = cute.make_tiled_copy( + atom_copy_s2r_A, + layout_tv=tiled_mma.tv_layout_A_tiled, + tiler_mn=(tiled_mma.get_tile_size(0), tiled_mma.get_tile_size(2)), + ) + tiled_copy_s2r_B = cute.make_tiled_copy( + atom_copy_s2r_B, + layout_tv=tiled_mma.tv_layout_B_tiled, + tiler_mn=(tiled_mma.get_tile_size(1), tiled_mma.get_tile_size(2)), + ) + + thr_copy_ldmatrix_A = tiled_copy_s2r_A.get_slice(tidx) + thr_copy_ldmatrix_B = tiled_copy_s2r_B.get_slice(tidx) + tCsA_copy_view = thr_copy_ldmatrix_A.partition_S(sA) + tCrA_copy_view = thr_copy_ldmatrix_A.retile(tCrA) + tCsB_copy_view = thr_copy_ldmatrix_B.partition_S(sB) + tCrB_copy_view = thr_copy_ldmatrix_B.retile(tCrB) + + # Current pipe index in smem to read from / write to + smem_pipe_read = 0 + smem_pipe_write = num_smem_stages - 1 + + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + + # /////////////////////////////////////////////////////////////////////////////// + # PREFETCH register pipeline + # /////////////////////////////////////////////////////////////////////////////// + num_k_block = cute.size(tCrA, mode=[2]) + if num_k_block > 1: + # Wait until our first prefetched tile is loaded in + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + # Prefetch the first k-block rmem from the first k-tile + cute.copy( + 
tiled_copy_s2r_A, + tCsA_p[None, None, 0], + tCrA_copy_view[None, None, 0], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, 0], + tCrB_copy_view[None, None, 0], + ) + + # /////////////////////////////////////////////////////////////////////////////// + # Mainloop + # 1. Shared memory pipeline (gmem -> smem): + # The default smem pipeline depth is 3, meaning that for shared + # memory buffers, we allocate three times the size described by the + # CTA tiler. We prefetch 2 of these buffers before entering the main + # loop. Considering only the transfer from global memory to shared + # memory, the general structure of the mainloop is: + # (1) copy k-tile from gmem to smem; + # (2) perform gemm computation on k-tile; + # (3) wait for the next copy to finish. + # The `cute.arch.cp_async_wait_group(num_smem_stages - 2)` command + # waits for the number of unfinished 'copy' to be <= 1. The advantage + # of this approach is that it allows for simultaneous production + # (i.e., step (1)) and consumption (i.e., step (2)) of smem. + # A common misconception is to prefetch N buffers and rewrite + # the pipeline logic to wait on N-1 pending copies. The disadvantage + # of this approach is that it requires fully consuming a buffer in + # order to open an empty buffer for the next copy. + # 2. Register pipeline (smem -> register): + # Similarly, the register pipeline produces i+1, consumes i, and + # produces i+2... Notably, i and i+1 do not use the same register, + # eliminating dependencies on the same register for better parallelism. + # 3. Combining the smem and register pipelines results in the mainloop. 
+ # /////////////////////////////////////////////////////////////////////////////// + for k_tile in cutlass.range_dynamic(k_tile_count, unroll=1): + for k_block in range(num_k_block): + if k_block == num_k_block - 1: + tCsA_p = tCsA_copy_view[None, None, None, smem_pipe_read] + tCsB_p = tCsB_copy_view[None, None, None, smem_pipe_read] + cute.arch.cp_async_wait_group(num_smem_stages - 2) + cute.arch.sync_threads() + + # Load A, B from shared memory to registers for k_block + 1 + k_block_next = (k_block + 1) % num_k_block # static + cute.copy( + tiled_copy_s2r_A, + tCsA_p[None, None, k_block_next], + tCrA_copy_view[None, None, k_block_next], + ) + cute.copy( + tiled_copy_s2r_B, + tCsB_p[None, None, k_block_next], + tCrB_copy_view[None, None, k_block_next], + ) + + # Fetch next A: To better interleave global memory access and compute + # instructions, we intentionally use the sequence: copy A, perform GEMM, + # then copy B. + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_A, + tAgA[None, None, None, k_tile_index], + tAsA[None, None, None, smem_pipe_write], + pred=tApA, + ) + + # Thread-level register gemm for k_block + cute.gemm( + tiled_mma, + tCrC, + tCrA[None, None, k_block], + tCrB[None, None, k_block], + tCrC, + ) + + # Fetch next B and update smem pipeline read/write + if k_block == 0: + if k_tile + num_smem_stages - 1 < k_tile_count: + cute.copy( + tiled_copy_B, + tBgB[None, None, None, k_tile_index], + tBsB[None, None, None, smem_pipe_write], + pred=tBpB, + ) + k_tile_index = k_tile_index + 1 + cute.arch.cp_async_commit_group() + smem_pipe_write = smem_pipe_read + smem_pipe_read = smem_pipe_read + 1 + if smem_pipe_read == num_smem_stages: + smem_pipe_read = 0 + + # Sync before epilogue + cute.arch.cp_async_wait_group(0) + cute.arch.sync_threads() + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue with fusion + # 
/////////////////////////////////////////////////////////////////////////////// + tCrD = cute.make_fragment_like(tCrC, self.c_dtype) + tCrD[None] = epilogue_op(tCrC.load()).to(self.c_dtype) + + # Copy results of D back to shared memory + cute.autovec_copy(tCrD, tCsC) + + # Create counting tensor for C + ceilM, ceilN, _ = cute.ceil_div(mC.shape, (self.bM, self.bN, 1)) + mcC = cute.make_identity_tensor( + ( + cute.size(ceilM) * self.cta_tiler[0], + cute.size(ceilN) * self.cta_tiler[1], + 1, + ) + ) + cC = cute.local_tile( + mcC[None, None, bidz], + tiler=self.cta_tiler, + coord=tiler_coord, + proj=(1, 1, None), + ) + tCcC = thr_copy_C.partition_S(cC) + + tCrC_epilogue = cute.make_fragment_like(tCsC_epilogue) + # Wait for all writes to shared memory to finish before starting copies + # using the new layouts + cute.arch.sync_threads() + cute.autovec_copy(tCsC_epilogue, tCrC_epilogue) + + # Create predication tensor for m + tCpC = cute.make_fragment( + cute.make_layout( + ( + tCgC_epilogue.shape[0][1], + cute.size(tCgC_epilogue, mode=[1]), + cute.size(tCgC_epilogue, mode=[2]), + ), + stride=(cute.size(tCgC_epilogue, mode=[1]), 1, 0), + ), + cutlass.Boolean, + ) + for rest_v in range(tCpC.shape[0]): + for m in range(tCpC.shape[1]): + tCpC[rest_v, m, 0] = cute.elem_less( + tCcC[(0, rest_v), m, 0][0], mC.shape[0] + ) + + # Copy to global memory using better vectorization + for rest_v in range(tCpC.shape[0]): + for n in range(tCpC.shape[2]): + if cute.elem_less(tCcC[(0, rest_v), 0, n][1], mC.shape[1]): + cute.copy( + tiled_copy_C, + tCrC_epilogue[None, None, n], + tCgC_epilogue[None, None, n], + pred=tCpC[None, None, n], + ) + return + + def _make_smem_layout_AB(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + major_mode_size = 64 if major_mode_size >= 64 else major_mode_size + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = 
min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 3), + 0, + layout_atom_outer, + ) + layout = cute.tile_to_shape(layout_atom, smem_tiler, (0, 1, 2)) + return layout + + def _make_smem_layout_C(self, dtype, major_mode, copy_bits, smem_tiler): + major_mode_size = ( + smem_tiler[1] if major_mode == utils.LayoutEnum.ROW_MAJOR else smem_tiler[0] + ) + + swizzle_bits = int(math.log2(major_mode_size * dtype.width // copy_bits)) + swizzle_bits = min(swizzle_bits, 3) + + layout_atom_outer = ( + cute.make_layout((8, major_mode_size), stride=(major_mode_size, 1)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((major_mode_size, 8), stride=(1, major_mode_size)) + ) + layout_atom = cute.make_composed_layout( + cute.make_swizzle(swizzle_bits, 3, 4), + 0, + layout_atom_outer, + ) + + # Due to the thread layout of the mma, remove swizzle in C to + # prevent shared memory fragments owned by an single thread from + # holding swizzles + if major_mode == utils.LayoutEnum.COL_MAJOR: + layout_atom = cute.make_composed_layout( + cute.make_swizzle(0, 3, 4), 0, layout_atom_outer + ) + layout = cute.tile_to_shape( + layout_atom, + smem_tiler, + (0, 1), + ) + return layout + + def _make_gmem_tiled_copy_AB(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bK) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + # Value layout 
for copy + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + return cute.make_tiled_copy_tv(atom_copy, thread_layout, value_layout) + + def _make_gmem_tiled_copy_C(self, atom_copy, dtype, major_mode, copy_bits): + copy_elems = copy_bits // dtype.width + shape_dim_1 = cute.size(self.bN) // copy_elems + # thread layout for copy + thread_layout = cute.make_layout( + (self.num_threads // shape_dim_1, shape_dim_1), stride=(shape_dim_1, 1) + ) + if major_mode != utils.LayoutEnum.ROW_MAJOR: + shape_dim_0 = cute.size(self.bM) // copy_elems + thread_layout = cute.make_layout( + (shape_dim_0, self.num_threads // shape_dim_0), stride=(1, shape_dim_0) + ) + value_layout = ( + cute.make_layout((1, copy_elems)) + if major_mode == utils.LayoutEnum.ROW_MAJOR + else cute.make_layout((copy_elems, 1)) + ) + tiler_mn, layout_tv = cute.make_layout_tv(thread_layout, value_layout) + return cute.make_tiled_copy(atom_copy, layout_tv, tiler_mn) + + +def run_tensor_op_gemm( + a_major: str, + b_major: str, + c_major: str, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + problem_shape: Tuple[int, int, int, int], + atom_layout_mnk: Tuple[int, int, int], + warmup_iterations: int = 2, + iterations: int = 100, + skip_ref_check: bool = False, +): + M, N, K, L = problem_shape + + # Create and permute tensor A/B/C + def create_and_permute_tensor(l, mode0, mode1, is_mode0_major, dtype): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + + return ( + torch.empty(*shape, dtype=torch.int32) + .random_(-2, 2) + .to(dtype=dtype) + .permute(permute_order) + .cuda() + ) + + a = create_and_permute_tensor( + L, M, K, a_major == "m", cutlass_torch.dtype(ab_dtype) + ) + b = 
create_and_permute_tensor( + L, N, K, b_major == "n", cutlass_torch.dtype(ab_dtype) + ) + c = create_and_permute_tensor(L, M, N, c_major == "m", cutlass_torch.dtype(c_dtype)) + ref = torch.einsum("mkl,nkl->mnl", a, b).to(cutlass_torch.dtype(c_dtype)) + + tensor_op_gemm = TensorOpGemm( + ab_dtype, + c_dtype, + acc_dtype, + atom_layout_mnk, + ) + + # assume input is 16B aligned + a_tensor = ( + from_dlpack(a, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if a_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if a_major == "k" else 0), + stride_order=(2, 0, 1) if a_major == "k" else (2, 1, 0), + divisibility=(128 // ab_dtype.width), + ) + ) + b_tensor = ( + from_dlpack(b, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if b_major == "k" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if b_major == "k" else 0), + stride_order=(2, 0, 1) if b_major == "k" else (2, 1, 0), + divisibility=(128 // ab_dtype.width), + ) + ) + c_tensor = ( + from_dlpack(c, assumed_align=16) + .mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + .mark_compact_shape_dynamic( + mode=(1 if c_major == "n" else 0), + stride_order=(2, 0, 1) if c_major == "n" else (2, 1, 0), + divisibility=(128 // c_dtype.width), + ) + ) + + print("Compiling kernel with cute.compile ...") + gemm = cute.compile(tensor_op_gemm, a_tensor, b_tensor, c_tensor) + + print("Executing GEMM kernel...") + + # Warmup + for _ in range(warmup_iterations): + gemm(a_tensor, b_tensor, c_tensor) + + # Execute the kernel + for _ in range(iterations): + gemm(a_tensor, b_tensor, c_tensor) + + if not skip_ref_check: + print("Verifying results...") + torch.testing.assert_close(c.cpu(), ref.cpu(), atol=1e-03, rtol=1e-05) + print("Results verified successfully!") + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. 
Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="example of multistage block matmul with CuTe on GPU" + ) + parser.add_argument( + "--mnkl", type=parse_comma_separated_ints, default=(112, 136, 40, 1) + ) + parser.add_argument( + "--atom_layout_mnk", type=parse_comma_separated_ints, default=(2, 2, 1) + ) + parser.add_argument( + "--ab_dtype", + type=cutlass.dtype, + choices=[cutlass.Float16], + default=cutlass.Float16, + ) + parser.add_argument( + "--acc_dtype", + type=cutlass.dtype, + choices=[cutlass.Float32], + default=cutlass.Float32, + ) + parser.add_argument( + "--c_dtype", + type=cutlass.dtype, + choices=[cutlass.Float16], + default=cutlass.Float16, + ) + parser.add_argument("--a_major", choices=["k", "m"], default="m") + parser.add_argument("--b_major", choices=["k", "n"], default="n") + parser.add_argument("--c_major", choices=["n", "m"], default="n") + parser.add_argument("--warmup_iterations", default=2, type=int) + parser.add_argument("--iterations", default=100, type=int) + parser.add_argument("--skip_ref_check", action="store_true") + + args = parser.parse_args() + print("Running Ampere tensor core GEMM example:") + run_tensor_op_gemm( + args.a_major, + args.b_major, + args.c_major, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.mnkl, + args.atom_layout_mnk, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm.py b/examples/python/CuTeDSL/blackwell/dense_gemm.py new file mode 100644 index 00000000..89696c8a --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/dense_gemm.py @@ -0,0 +1,1922 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. 
Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from typing import Optional, Type, Tuple, Union +import cuda.bindings.driver as cuda + +import torch + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.torch as cutlass_torch +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + +""" +A high-performance batched dense GEMM (C = A * B) example for the NVIDIA Blackwell SM100 architecture +using CUTE DSL. 
+- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M") +- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K") +- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes Tensor Memory Access (TMA) for efficient memory operations + - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations (including 2cta mma instructions) + - Implements TMA multicast with cluster to reduce L2 memory traffic + - Supports multi-stage pipeline to overlap computation and memory access + +This GEMM works as follows: +1. Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations. +2. Perform matrix multiply-accumulate (MMA) operations using tcgen05.mma instruction. +3. Load completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld. +4. Type convert C matrix to output type. +5. Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations, + or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations. +6. Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor: + e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0)) + +SM100 tcgen05.mma instructions operate as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Write accumulator to TMEM +The accumulator in TMEM must then be loaded to registers before writing back to GMEM. + +To run this example: + +.. 
code-block:: bash + + python examples/blackwell/dense_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +The above example command compute batched gemm with M=8192, N=8192, K=8192, +batch_count=1. The Blackwell tcgen05 MMA tile shape used 2 cta with 256x128 +MMA tile and the cluster shape is (2,1). The input, mma accumulator and output +data type are set as fp16, fp32 and fp16, respectively. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/dense_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +Constraints: +* Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2), + see detailed valid dtype combinations in below DenseGemmKernel class documentation +* A/B tensor must have the same data type +* Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) +* Mma tiler N must be 32-256, step 32 +* Cluster shape M/N must be positive and power of 2, total cluster size <= 16 +* Cluster shape M must be multiple of 2 if use_2cta_instrs=True +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 4, 8, and 16 for TFloat32, + Float16/BFloat16, and Int8/Uint8/Float8, respectively. +* OOB tiles are not allowed when TMA store is disabled +""" + + +class DenseGemmKernel: + """ + This class implements batched matrix multiplication (C = A x B) with support for various data types + and architectural features specific to Blackwell GPUs. 
+ + :param acc_dtype: Data type for accumulation during computation + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation + :type use_2cta_instrs: bool + :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tiler (M,N) + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results + :type use_tma_store: bool + + :note: In current version, A and B tensor must have the same data type + - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported + + :note: Supported A/B data types: + - TFloat32 + - Float16/BFloat16 + - Int8/Uint8 + - Float8E4M3FN/Float8E5M2 + + :note: Supported accumulator data types: + - Float32 (for all floating point A/B data types) + - Float16 (only for fp16 and fp8 A/B data types) + - Int32 (only for uint8/int8 A/B data types) + + :note: Supported C data types: + - Float32 (for float32 and int32 accumulator data types) + - Int32 (for float32 and int32 accumulator data types) + - Float16/BFloat16 (for fp16 and fp8 accumulator data types) + - Int8/Uint8 (for uint8/int8 accumulator data types) + - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types) + + :note: Constraints: + - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) + - MMA tiler N must be 32-256, step 32 + - Cluster shape M must be multiple of 2 if use_2cta_instrs=True + - Cluster shape M/N must be positive and power of 2, total cluster size <= 16 + + Example: + >>> gemm = DenseGemmKernel( + ... acc_dtype=cutlass.Float32, + ... use_2cta_instrs=True, + ... mma_tiler_mn=(128, 128), + ... cluster_shape_mn=(2, 2) + ... 
) + >>> gemm(a_tensor, b_tensor, c_tensor, stream) + """ + + def __init__( + self, + acc_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + ): + """Initializes the configuration for a Blackwell dense GEMM kernel. + + This configuration includes several key aspects: + + 1. MMA Instruction Settings (tcgen05): + - acc_dtype: Data types for MMA accumulator. + - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler. + - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant + with cta_group=2 should be used. + + 2. Cluster Shape: + - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster. + + 3. Output C tensor store mode: + - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: Tuple[int, int] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor. 
+ :type use_tma_store: bool + """ + + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.use_tma_store = use_tma_store + + self.cta_group = ( + tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.occupancy = 1 + self.threads_per_cta = 128 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + This method configures various attributes based on the input tensor properties + (data types, leading dimensions) and kernel settings: + - Configuring tiled MMA + - Computing MMA/cluster/tile shapes + - Computing cluster layout + - Computing multicast CTAs for A/B + - Computing epilogue subtile + - Setting up A/B/C stage counts in shared memory + - Computing A/B/C shared memory layout + - Computing tensor memory allocation columns + """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + 
self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + if cutlass.const_expr(self.use_tma_store): + self.epi_tile = sm100_utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + else: + self.epi_tile = self.cta_tile_shape_mnk[:2] + + # Setup A/B/C stage count in shared memory + self.num_acc_stage, self.num_ab_stage, self.num_c_stage = self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + self.use_tma_store, + ) + + # Compute A/B/C shared memory layout + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.c_smem_layout_staged = ( + sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_c_stage, + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler + ) + + @cute.jit + def __call__( + self, + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + stream: cuda.CUstream, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + """Execute the GEMM operation in steps: + - Setup static attributes + - Setup TMA load/store atoms and tensors + - Compute grid size + - Define shared storage for kernel + - Launch the kernel synchronously + + :param a: Input tensor A + :type a: cute.Tensor + :param b: Input tensor B + :type b: cute.Tensor + :param c: Output tensor C + :type c: cute.Tensor + :param stream: CUDA stream for asynchronous execution + :type stream: cuda.CUstream + :param epilogue_op: Optional elementwise lambda function to apply to the output 
tensor + :type epilogue_op: cutlass.Constexpr + :raises TypeError: If input data types are incompatible with the MMA instruction. + :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled. + """ + # Setup static attributes before smem/grid/tma computation + self.a_dtype: Type[cutlass.Numeric] = a.element_type + self.b_dtype: Type[cutlass.Numeric] = b.element_type + self.c_dtype: Type[cutlass.Numeric] = c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(c) + + # Check if input data types are compatible with MMA instruction + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tma_tile_atom_A( + a_op, + a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if a.element_type is cutlass.Float32 else None + ), + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if b.element_type is cutlass.Float32 else None + ), + ) + + 
a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup store for C + tma_atom_c = None + tma_tensor_c = None + if cutlass.const_expr(self.use_tma_store): + c_cta_v_layout = cute.composition( + cute.make_identity_layout(c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + c, + epi_smem_layout, + c_cta_v_layout, + ) + + # Compute grid size + grid = self._compute_grid(c, self.cta_tile_shape_mnk, self.cluster_shape_mn) + + self.buffer_align_bytes = 1024 + + c_smem_size = ( + cute.cosize(self.c_smem_layout_staged.outer) + if cutlass.const_expr(self.use_tma_store) + else 0 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + c_smem_size, + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c if 
cutlass.const_expr(self.use_tma_store) else c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.c_smem_layout_staged, + self.epi_tile, + epilogue_op, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: Optional[cute.CopyAtom], + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None], + epi_tile: cute.Tile, + epilogue_op: cutlass.Constexpr, + ): + """ + GPU device kernel performing the batched GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma descriptor + # + if warp_idx == 0: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + if cutlass.const_expr(self.use_tma_store): + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coords inside cluster + bidx, bidy, bidz = cute.arch.block_idx() + mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coords outside cluster + cta_coord = (bidx, bidy, bidz) + mma_tile_coord_mnl = ( + cta_coord[0] // cute.size(tiled_mma.thr_id.shape), + cta_coord[1], + cta_coord[2], + ) + # Coords inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: a+b 
full/empty, accumulator full, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # Initialize mainloop ab_pipeline (barrier) and states + ab_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + ab_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, num_tma_producer + ) + ab_pipeline = utils.PipelineTmaUmma.create( + barrier_storage=storage.ab_full_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=ab_pipeline_producer_group, + consumer_group=ab_pipeline_consumer_group, + tx_count=self.num_tma_load_bytes, + cta_layout_vmnk=cluster_layout_vmnk, + ) + ab_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_ab_stage + ) + ab_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_ab_stage + ) + + # Initialize acc_pipeline (barrier) and states + acc_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + acc_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, self.threads_per_cta, self.threads_per_cta + ) + acc_pipeline = utils.PipelineUmmaAsync.create( + barrier_storage=storage.acc_full_mbar_ptr.data_ptr(), + num_stages=self.num_acc_stage, + producer_group=acc_pipeline_producer_group, + consumer_group=acc_pipeline_consumer_group, + cta_layout_vmnk=cluster_layout_vmnk, + ) + acc_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_acc_stage + ) + acc_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_acc_stage + ) + + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == 0: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + 
tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = ( + storage.sC.get_tensor( + c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full + # + a_full_mcast_mask = None + b_full_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) + ) + k_block_cnt = cute.size(gA_mkl, mode=[3]) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + 
# + # Partition global/shared tensor for TMA load A/B + # + # TMA load A partition_S/D + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopN, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N) + tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + + # + # Alloc tensor memory buffer + # + if warp_idx == 0: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, tmem_holding_buf, is_two_cta=use_2cta_instrs + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + cute.arch.barrier() + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf + ) + # (MMA, MMA_M, MMA_N) + tCtAcc = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Partition for epilogue + # + tiled_copy_t2r, tTR_tAcc, tTR_rAcc = self.epilog_tmem_copy_and_partition( + tidx, tCtAcc, tCgC, epi_tile, use_2cta_instrs + ) + + tTR_rC 
= None + tiled_copy_r2s = None + simt_atom = None + tRS_rC = None + tRS_sC = None + bSG_sC = None + bSG_gC = None + tTR_gC = None + if cutlass.const_expr(self.use_tma_store): + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC = self.epilog_gmem_copy_and_partition( + tidx, tma_atom_c, tCgC, epi_tile, sC + ) + else: + simt_atom, tTR_rC, tTR_gC = self.epilog_gmem_copy_and_partition( + tidx, tiled_copy_t2r, tCgC, epi_tile, sC + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA = tAgA[(None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])] + # ((atom_v, rest_v), loopK) + tBgB = tBgB[(None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])] + if cutlass.const_expr(self.use_tma_store): + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC[(None, None, None, *mma_tile_coord_mnl)] + else: + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N) + tTR_gC = tTR_gC[(None, None, None, None, None, *mma_tile_coord_mnl)] + + # + # Pipelining TMA load A/B and MMA mainloop + # + prefetch_k_block_cnt = cutlass.min(self.num_ab_stage - 2, k_block_cnt) + + if warp_idx == 0: + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + # + # Prefetch TMA load A/B + # + for prefetch_idx in cutlass.range_dynamic(prefetch_k_block_cnt, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status) + + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB[(None, 
ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # Peek (try_wait) AB buffer full for k_block = 0 + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt and is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait(ab_consumer_state) + + # + # MMA mainloop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire(ab_producer_state, peek_ab_empty_status) + + if ab_producer_state.count < k_block_cnt: + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB[(None, ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + if is_leader_cta: + # Conditionally wait for AB buffer full + ab_pipeline.consumer_wait(ab_consumer_state, peek_ab_full_status) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx, ab_consumer_state.index) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + ab_pipeline.consumer_release(ab_consumer_state) + + # Peek (try_wait) 
AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + ab_consumer_state.advance() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt: + if is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # Async arrive accumulator buffer full + if is_leader_cta: + acc_pipeline.producer_commit(acc_producer_state) + + # + # Epilogue + # + + # Release tensor memory allocation lock + if warp_idx == 0: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + + # Wait for accumulator buffer full + acc_pipeline.consumer_wait(acc_consumer_state) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + if cutlass.const_expr(self.use_tma_store): + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + else: + tTR_gC = cute.group_modes(tTR_gC, 3, cute.rank(tTR_gC)) + + c_pipeline = None + if cutlass.const_expr(self.use_tma_store): + # Initialize tma store c_pipeline + c_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, self.threads_per_cta, self.threads_per_cta + ) + c_pipeline = utils.PipelineTmaStore.create( + num_stages=self.num_c_stage, + producer_group=c_producer_group, + ) + + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + if cutlass.const_expr(self.use_tma_store): + # + # Perform epilogue op on accumulator and convert to C type + # + acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load() + acc_vec = 
epilogue_op(acc_vec.to(self.c_dtype)) + tRS_rC.store(acc_vec) + + # + # Store C to shared memory + # + c_buffer = subtile_idx % self.num_c_stage + cute.copy(tiled_copy_r2s, tRS_rC, tRS_sC[(None, None, None, c_buffer)]) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + cute.arch.barrier() + + # + # TMA store C to global memory + # + if warp_idx == 0: + cute.copy( + tma_atom_c, + bSG_sC[(None, c_buffer)], + bSG_gC[(None, subtile_idx)], + ) + # Fence and barrier to make sure TMA store is completed to recollect C buffer + c_pipeline.producer_commit() + c_pipeline.producer_acquire() + cute.arch.barrier() + else: + # + # Perform epilogue op on accumulator and convert to C type + # + acc_vec = tTR_rAcc.load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tTR_rC.store(acc_vec) + + # + # Store C to global memory + # + cute.copy(simt_atom, tTR_rC, tTR_gC[(None, None, None, subtile_idx)]) + + # + # Dealloc the tensor memory buffer + # + cute.arch.barrier() + if warp_idx == 0: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + + # + # Wait for C store complete + # + if cutlass.const_expr(self.use_tma_store): + c_pipeline.producer_tail() + + # + # Wait A/B buffer empty + # + if warp_idx == 0: + # Reverse prefetch_k_block_cnt times to next available buffer + for i in cutlass.range_dynamic(prefetch_k_block_cnt): + ab_producer_state.reverse() + ab_pipeline.producer_tail(ab_producer_state) + return + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ 
+ Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: 
cutlass.Int32, + sC: cute.Tensor, + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tidx: cutlass.Int32, + atom: Union[cute.CopyAtom, cute.TiledCopy], + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make tiledCopy for global memory store, then use it to: + - partition register array (source) and global memory (destination) for none TMA store version; + - partition shared memory (source) and global memory (destination) for TMA store version. 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version + :type atom: cute.CopyAtom or cute.TiledCopy + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing either: + - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where: + - tma_atom_c: The TMA copy atom + - bSG_sC: The partitioned shared memory tensor C + - bSG_gC: The partitioned global tensor C + - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where: + - simt_atom: The SIMT copy atom + - tTR_rC: The register tensor C + - tTR_gC: The partitioned global tensor C + :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + if cutlass.const_expr(self.use_tma_store): + tma_atom_c = atom + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + else: + tiled_copy_t2r = atom + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTR_gC = thr_copy_t2r.partition_D(gC_epi) + # (T2R, T2R_M, T2R_N) + tTR_rC = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.c_dtype + ) + simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), self.c_dtype) + return simt_atom, tTR_rC, tTR_gC + + @staticmethod + def _compute_stages( + tiled_mma: 
cute.TiledMma, + mma_tiler_mnk: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: Type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + use_tma_store: bool, + ) -> Tuple[int, int, int]: + """Computes the number of stages for A/B/C operands based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tile. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C in global memory. + :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + :param use_tma_store: Whether TMA store is enabled. 
+ :type use_tma_store: bool + + :return: A tuple containing the computed number of stages for: + (ACC stages, A/B operand stages, epilogue stages) + :rtype: tuple[int, int, int] + """ + # Default ACC stages + num_acc_stage = 1 + # Default C stages + num_c_stage = 2 if use_tma_store else 0 + + # Calculate smem layout and size for one stage of A, B, and C + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # a tmp 1 stage is provided + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # a tmp 1 stage is provided + ) + c_smem_layout_staged_one = ( + sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, + ) + if use_tma_store + else None + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + mbar_helpers_bytes = 1024 + c_bytes_per_stage = ( + cute.size_in_bytes(c_dtype, c_smem_layout_staged_one) + if use_tma_store + else 0 + ) + c_bytes = c_bytes_per_stage * num_c_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial C stages bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity - (occupancy + 1) * (mbar_helpers_bytes + c_bytes) + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + if use_tma_store: + num_c_stage += ( + num_smem_capacity + - ab_bytes_per_stage * num_ab_stage + - (occupancy + 1) * (mbar_helpers_bytes + c_bytes) + ) // ((occupancy + 1) * c_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_c_stage + + @staticmethod + def _compute_grid( + c: cute.Tensor, + cta_tile_shape_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + ) -> Tuple[int, int, int]: + """Compute grid shape for 
the output tensor C. + + :param c: The output tensor C + :type c: cute.Tensor + :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile. + :type cta_tile_shape_mnk: tuple[int, int, int] + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + + :return: Grid shape for kernel launch. + :rtype: tuple[int, int, int] + """ + + cluster_shape_mnl = (*cluster_shape_mn, 1) + + grid = cute.round_up( + ( + cute.ceil_div(c.layout.shape[0], cta_tile_shape_mnk[0]), + cute.ceil_div(c.layout.shape[1], cta_tile_shape_mnk[1]), + c.layout.shape[2], + ), + cluster_shape_mnl, + ) + + return grid + + @staticmethod + def _get_tma_atom_kind( + atom_sm_cnt: cutlass.Int32, mcast: cutlass.Boolean + ) -> Union[ + cpasync.CopyBulkTensorTileG2SMulticastOp, cpasync.CopyBulkTensorTileG2SOp + ]: + """ + Select the appropriate TMA copy atom based on the number of SMs and the multicast flag. + + :param atom_sm_cnt: The number of SMs + :type atom_sm_cnt: cutlass.Int32 + :param mcast: The multicast flag + :type mcast: cutlass.Boolean + + :return: The appropriate TMA copy atom kind + :rtype: cpasync.CopyBulkTensorTileG2SMulticastOp or cpasync.CopyBulkTensorTileG2SOp + + :raise ValueError: If the atom_sm_cnt is invalid + """ + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, mma_tiler: Tuple[int, int, int] + ) -> int: + """ + Compute the number of tensor memory allocation columns. 
+ + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + + :return: The number of tensor memory allocation columns. + :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(acc_shape) + return sm100_utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + @staticmethod + def is_valid_dtypes( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + ) -> bool: + """ + Check if the dtypes are valid + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + + :return: True if the dtypes are valid, False otherwise + :rtype: bool + """ + is_valid = True + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + cutlass.TFloat32, + cutlass.Uint8, + cutlass.Int8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + is_valid = False + if ( + acc_dtype not in {cutlass.Float32, cutlass.Float16, cutlass.Int32} + or acc_dtype == cutlass.Float16 + and ab_dtype + not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2} + or acc_dtype == cutlass.Int32 + and ab_dtype not in {cutlass.Uint8, cutlass.Int8} + ): + is_valid = False + if ( + acc_dtype == cutlass.Float32 + and c_dtype + not in { + cutlass.Float32, + cutlass.Float16, + cutlass.BFloat16, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + or acc_dtype == cutlass.Float16 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + } + or acc_dtype == cutlass.Int32 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + cutlass.Float32, + cutlass.Int32, + cutlass.Int8, + 
cutlass.Uint8, + } + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + ) -> bool: + """ + Check if the mma tiler and cluster shape are valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + + :return: True if the mma tiler and cluster shape are valid, False otherwise + :rtype: bool + """ + is_valid = True + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + is_valid = False + if mma_tiler_mn[1] not in range(32, 257, 32): + is_valid = False + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + is_valid = False + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_tensor_alignment( + m: int, + n: int, + k: int, + l: int, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the tensor alignment is valid + + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The batch dimension (number of batched GEMMs) + :type l: int + :param ab_dtype: The data type of the A and B operands
+ :type ab_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the problem shape is valid, False otherwise + :rtype: bool + """ + is_valid = True + + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_epilog_store_option( + use_2cta_instrs: bool, + use_tma_store: bool, + m: int, + n: int, + mma_tiler_mn: Tuple[int, int], + ) -> bool: + """ + Check if the epilogue store option is valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + + :return: True if the epilogue store option is valid, False otherwise + :rtype: bool + """ + + is_valid = True + # None TMA store version does not have predication, can not support OOB tiles + cta_tile_shape_mn = ( + mma_tiler_mn[0] // (2 if use_2cta_instrs else 1), + mma_tiler_mn[1], + ) + if not use_tma_store: + if not (m % cta_tile_shape_mn[0] == 0 and n % 
cta_tile_shape_mn[1] == 0): + is_valid = False + return is_valid + + @staticmethod + def can_implement( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + m: int, + n: int, + k: int, + l: int, + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the gemm can be implemented + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The batch dimension (number of batched GEMMs) + :type l: int + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the gemm can be implemented, False otherwise + :rtype: bool + """ + can_implement = True + # Skip unsupported types + if not DenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, c_dtype): + can_implement = False + # Skip invalid mma tile shape and cluster shape + if not DenseGemmKernel.is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs, mma_tiler_mn,
cluster_shape_mn + ): + can_implement = False + # Skip illegal problem shape for load/store alignment + if not DenseGemmKernel.is_valid_tensor_alignment( + m, n, k, l, ab_dtype, c_dtype, a_major, b_major, c_major + ): + can_implement = False + # Skip invalid epilogue store option + if not DenseGemmKernel.is_valid_epilog_store_option( + use_2cta_instrs, use_tma_store, m, n, mma_tiler_mn + ): + can_implement = False + return can_implement + + +def run_dense_gemm( + mnkl: Tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_2cta_instrs: bool, + use_tma_store: bool, + tolerance: float, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, + measure_launch_overhead=False, +): + """ + Prepare A/B/C tensors, launch GPU kernel, and reference checking. + """ + print(f"Running B100 Dense GEMM test with:") + print(f"mnkl: {mnkl}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Use TMA Store: {'True' if use_tma_store else 'False'}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Unpack parameters + m, n, k, l = mnkl + + # Skip unsupported testcase + if not DenseGemmKernel.can_implement( + ab_dtype, + acc_dtype, + c_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + m, + n, + k, + l, + a_major, + b_major, + c_major, + ): + raise TypeError( + f"Unsupported testcase {ab_dtype}, {acc_dtype}, {c_dtype}, {use_2cta_instrs}, 
{mma_tiler_mn}, {cluster_shape_mn}, {use_tma_store}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {c_major}" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(1111) + + # Create and permute tensor A/B/C + def create_and_permute_tensor( + l, mode0, mode1, is_mode0_major, dtype, is_dynamic_layout=True + ): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + is_unsigned = dtype in {cutlass.Uint8} + # Temporarily use uint8 as torch does not support fp8 type + torch_dtype = ( + cutlass_torch.dtype(dtype) + if dtype not in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + else torch.uint8 + ) + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor + + a_ref, a_tensor, a_torch = create_and_permute_tensor( + l, m, k, a_major == "m", ab_dtype, is_dynamic_layout=True + ) + b_ref, b_tensor, b_torch = create_and_permute_tensor( + l, n, k, b_major 
== "n", ab_dtype, is_dynamic_layout=True + ) + c_ref, c_tensor, c_torch = create_and_permute_tensor( + l, m, n, c_major == "m", c_dtype, is_dynamic_layout=True + ) + + # Configure gemm kernel + gemm = DenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + + torch_stream = torch.cuda.Stream() + stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile gemm kernel + compiled_gemm = cute.compile(gemm, a_tensor, b_tensor, c_tensor, stream) + + # Launch GPU kernel + # Warm up + for i in range(warmup_iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + # Execution + for i in range(iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + + # Compute reference result + if not skip_ref_check: + if ab_dtype in { + cutlass.Int8, + cutlass.Uint8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + ref = torch.einsum("mkl,nkl->mnl", a_ref.cpu(), b_ref.cpu()) + else: + ref = (torch.einsum("mkl,nkl->mnl", a_ref, b_ref)).cpu() + + # Copy gpu result back + gpu_c = c_torch.cpu() + + # Convert ref to c_type + if c_dtype == cutlass.Float32: + ref_c = ref + elif c_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}: + # m major: (l, n, m) -> (m, n, l) + # k major: (l, m, n) -> (m, n, l) + permute_order = (1, 2, 0) if c_major == "n" else (2, 1, 0) + shape = (l, m, n) if c_major == "n" else (l, n, m) + f8_torch_tensor = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.SKIP, + ).cuda() + # Create dtype cute tensor (gpu) + ref_c_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + ref_c_tensor.element_type = c_dtype + ref_c_tensor = cutlass_torch.convert_cute_tensor( + ref, + ref_c_tensor, + c_dtype, + is_dynamic_layout=True, + ) + + ref_c = f8_torch_tensor.cpu() + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + + # Reference checking 
ref_c and gpu_c + torch.testing.assert_close( + gpu_c, + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + # or: return tuple([int(x.strip()) for x in s.split(",")]) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="Example of MxNxKxL GEMM on Blackwell." + ) + + parser.add_argument( + "--mnkl", + type=parse_comma_separated_ints, + default=(256, 256, 512, 1), + help="mnkl dimensions (comma-separated)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tiler (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.TFloat32) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--use_tma_store", action="store_true", help="Use tma store or not" + ) + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, help="Warmup iterations" + ) + parser.add_argument("--iterations", type=int, default=1, help="Iterations") + parser.add_argument( + "--skip_ref_check", 
action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if len(args.mnkl) != 4: + parser.error("--mnkl must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + run_dense_gemm( + args.mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + args.use_tma_store, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py new file mode 100644 index 00000000..abc2597d --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/dense_gemm_persistent.py @@ -0,0 +1,2144 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +from typing import Optional, Type, Tuple, Union + +import cuda.bindings.driver as cuda +import torch + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.torch as cutlass_torch +import cutlass.utils as utils +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + + +""" +A high-performance persistent batched dense GEMM example for the NVIDIA Blackwell SM100 architecture +using CUTE DSL. 
+ +- Matrix A is MxKxL, L is batch dimension, A can be row-major("K") or column-major("M") +- Matrix B is NxKxL, L is batch dimension, B can be row-major("N") or column-major("K") +- Matrix C is MxNxL, L is batch dimension, C can be row-major("N") or column-major("M") + +This GEMM kernel supports the following features: + - Utilizes Tensor Memory Access (TMA) for efficient memory operations + - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations (including 2cta mma instructions) + - Implements TMA multicast with cluster to reduce L2 memory traffic + - Supports persistent tile scheduling to better overlap memory load/store with mma between tiles + - Supports warp specialization to avoid explicit pipelining between mainloop load and mma + +This GEMM works as follows: +1. DMA warp: Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations. +2. MMA warp: Perform matrix multiply-accumulate (MMA) operations using tcgen05.mma instruction. +3. EPILOGUE warp: + - Load completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld. + - Type convert C matrix to output type. + - Optionally store C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations, + or directly store C matrix from registers (RMEM) to global memory (GMEM) without TMA operations. + - Optionally accept an elementwise lambda function epilogue_op to apply to the output tensor: + e.g., relu can set epilogue_op = lambda x: cute.where(x > 0, x, cute.full_like(x, 0)) + +SM100 tcgen05.mma instructions operate as follows: +- Read matrix A from SMEM +- Read matrix B from SMEM +- Write accumulator to TMEM +The accumulator in TMEM must then be loaded to registers before writing back to GMEM. + +Input arguments to this example are the same as those of dense_gemm.py. + +..
code-block:: bash + + python examples/blackwell/dense_gemm_persistent.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/dense_gemm_persistent.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \ + --mnkl 8192,8192,8192,1 \ + --use_tma_store --use_2cta_instrs \ + --warmup_iterations 1 --iterations 10 --skip_ref_check + + +Constraints are same as dense_gemm.py: +* Supported input data types: fp16, bf16, tf32, int8, uint8, fp8 (e4m3fn, e5m2), + see detailed valid dtype combinations in below PersistentDenseGemmKernel class documentation +* A/B tensor must have the same data type +* Mma tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) +* Mma tiler N must be 32-256, step 32 +* Cluster shape M/N must be positive and power of 2, total cluster size <= 16 +* Cluster shape M must be multiple of 2 if use_2cta_instrs=True +* The contiguous dimension of A/B/C tensors must be at least 16 bytes aligned, + i.e, number of elements is a multiple of 4, 8, and 16 for TFloat32, + Float16/BFloat16, and Int8/Uint8/Float8, respectively. +* OOB tiles are not allowed when TMA store is disabled +""" + + +class PersistentDenseGemmKernel: + """This class implements batched matrix multiplication (C = A x B) with support for various data types + and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization. 
+ + :param acc_dtype: Data type for accumulation during computation + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use CTA group 2 for advanced thread cooperation + :type use_2cta_instrs: bool + :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N) + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use Tensor Memory Access (TMA) for storing results + :type use_tma_store: bool + + :note: In current version, A and B tensor must have the same data type + - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported + + :note: Supported A/B data types: + - TFloat32 + - Float16/BFloat16 + - Int8/Uint8 + - Float8E4M3FN/Float8E5M2 + + :note: Supported accumulator data types: + - Float32 (for all floating point A/B data types) + - Float16 (only for fp16 and fp8 A/B data types) + - Int32 (only for uint8/int8 A/B data types) + + :note: Supported C data types: + - Float32 (for float32 and int32 accumulator data types) + - Int32 (for float32 and int32 accumulator data types) + - Float16/BFloat16 (for fp16 and fp8 accumulator data types) + - Int8/Uint8 (for uint8/int8 accumulator data types) + - Float8E4M3FN/Float8E5M2 (for float32 accumulator data types) + + :note: Constraints: + - MMA tiler M must be 64/128 (use_2cta_instrs=False) or 128/256 (use_2cta_instrs=True) + - MMA tiler N must be 32-256, step 32 + - Cluster shape M must be multiple of 2 if use_2cta_instrs=True + - Cluster shape M/N must be positive and power of 2, total cluster size <= 16 + + Example: + >>> gemm = PersistentDenseGemmKernel( + ... acc_dtype=cutlass.Float32, + ... use_2cta_instrs=True, + ... mma_tiler_mn=(128, 128), + ... cluster_shape_mn=(2, 2) + ... 
) + >>> gemm(a_tensor, b_tensor, c_tensor, max_active_clusters, stream) + """ + + def __init__( + self, + acc_dtype: Type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + ): + """Initializes the configuration for a Blackwell dense GEMM kernel. + + This configuration includes several key aspects: + + 1. MMA Instruction Settings (tcgen05): + - acc_dtype: Data types for MMA accumulator. + - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler. + - use_2cta_instrs: Boolean indicating if the tcgen05 MMA variant + with cta_group=2 should be used. + + 2. Cluster Shape: + - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster. + + 3. Output C tensor store mode: + - use_tma_store: Boolean indicating whether to use Tensor Memory Access (TMA) for storing results. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: Tuple[int, int] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Use Tensor Memory Access (TMA) or normal store for output C tensor. 
+ :type use_tma_store: bool + """ + + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.use_tma_store = use_tma_store + + self.cta_group = ( + tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.occupancy = 1 + # Set specialized warp ids + self.epilog_warp_id = ( + 0, + 1, + 2, + 3, + ) + self.mma_warp_id = 4 + self.tma_warp_id = 5 + self.threads_per_cta = 32 * len( + (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id) + ) + # Set barrier id for cta sync, epilogue sync and tmem ptr sync + self.cta_sync_bar_id = 0 + self.epilog_sync_bar_id = 1 + self.tmem_ptr_sync_bar_id = 2 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + This method configures various attributes based on the input tensor properties + (data types, leading dimensions) and kernel settings: + - Configuring tiled MMA + - Computing MMA/cluster/tile shapes + - Computing cluster layout + - Computing multicast CTAs for A/B + - Computing epilogue subtile + - Setting up A/B/C stage counts in shared memory + - Computing A/B/C shared memory layout + - Computing tensor memory allocation columns + """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + + # Compute cluster layout + 
self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + if cutlass.const_expr(self.use_tma_store): + self.epi_tile = sm100_utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + else: + self.epi_tile = self.cta_tile_shape_mnk[:2] + + # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory + self.num_acc_stage, self.num_ab_stage, self.num_c_stage = self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + self.use_tma_store, + ) + + # Compute A/B/C shared memory layout + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.c_smem_layout_staged = ( + sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, + self.epi_tile, + self.num_c_stage, + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler, self.num_acc_stage + ) + + @cute.jit + def __call__( + self, + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + max_active_clusters: cutlass.Constexpr, + stream: cuda.CUstream, + epilogue_op: cutlass.Constexpr = lambda x: x, + ): + """Execute the GEMM operation in steps: + - Setup static 
attributes before smem/grid/tma computation + - Setup TMA load/store atoms and tensors + - Compute grid size with regard to hardware constraints + - Define shared storage for kernel + - Launch the kernel synchronously + + :param a: Input tensor A + :type a: cute.Tensor + :param b: Input tensor B + :type b: cute.Tensor + :param c: Output tensor C + :type c: cute.Tensor + :param max_active_clusters: Maximum number of active clusters + :type max_active_clusters: cutlass.Constexpr + :param stream: CUDA stream for asynchronous execution + :type stream: cuda.CUstream + :param epilogue_op: Optional elementwise lambda function to apply to the output tensor + :type epilogue_op: cutlass.Constexpr + :raises TypeError: If input data types are incompatible with the MMA instruction. + :raises AssertionError: If OOB (Out-Of-Bounds) tiles are present when TMA store is disabled. + """ + # Setup static attributes before smem/grid/tma computation + self.a_dtype: Type[cutlass.Numeric] = a.element_type + self.b_dtype: Type[cutlass.Numeric] = b.element_type + self.c_dtype: Type[cutlass.Numeric] = c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(c) + + # Check if input data types are compatible with MMA instruction + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that depend on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a =
cute.nvgpu.make_tma_tile_atom_A( + a_op, + a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if a.element_type is cutlass.Float32 else None + ), + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + internal_type=( + cutlass.TFloat32 if b.element_type is cutlass.Float32 else None + ), + ) + + a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + if cutlass.const_expr(self.use_tma_store): + c_cta_v_layout = cute.composition( + cute.make_identity_layout(c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + c, + epi_smem_layout, + c_cta_v_layout, + ) + + # Compute grid size + self.tile_sched_params, grid = self._compute_grid( + c, self.cta_tile_shape_mnk, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + + c_smem_size = ( + cute.cosize(self.c_smem_layout_staged.outer) + if cutlass.const_expr(self.use_tma_store) + else 0 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: 
cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + c_smem_size, + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c if cutlass.const_expr(self.use_tma_store) else c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.c_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + epilogue_op, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: Optional[cute.CopyAtom], + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + epilogue_op: cutlass.Constexpr, + ): + """ + GPU device kernel performing the Persistent batched GEMM computation. 
+ """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_warp_id: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + if cutlass.const_expr(self.use_tma_store): + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coords inside cluster + bidx, bidy, bidz = cute.arch.block_idx() + mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # Initialize mainloop ab_pipeline (barrier) and states + ab_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + ab_pipeline_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, num_tma_producer + ) + ab_pipeline = utils.PipelineTmaUmma.create( + barrier_storage=storage.ab_full_mbar_ptr.data_ptr(), + num_stages=self.num_ab_stage, + producer_group=ab_pipeline_producer_group, + consumer_group=ab_pipeline_consumer_group, + tx_count=self.num_tma_load_bytes, + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Initialize acc_pipeline (barrier) and states + acc_pipeline_producer_group = utils.CooperativeGroup(utils.Agent.Thread) + num_acc_consumer_threads = len(self.epilog_warp_id) * ( + 2 if use_2cta_instrs else 1 + ) + acc_pipeline_consumer_group = 
utils.CooperativeGroup( + utils.Agent.Thread, num_acc_consumer_threads + ) + acc_pipeline = utils.PipelineUmmaAsync.create( + barrier_storage=storage.acc_full_mbar_ptr.data_ptr(), + num_stages=self.num_acc_stage, + producer_group=acc_pipeline_producer_group, + consumer_group=acc_pipeline_consumer_group, + cta_layout_vmnk=cluster_layout_vmnk, + ) + + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == self.tma_warp_id: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = ( + storage.sC.get_tensor( + c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner + ) + if cutlass.const_expr(self.use_tma_store) + else None + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full + # + a_full_mcast_mask = None + b_full_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, 
None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None) + ) + k_block_cnt = cute.size(gA_mkl, mode=[3]) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for TMA load A/B + # + # TMA load A partition_S/D + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C( + cute.append(acc_shape, self.num_acc_stage) + ) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta + ) + 
+ # + # Specialized TMA load warp + # + + if warp_idx == self.tma_warp_id: + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + ab_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_ab_stage + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA_slice = tAgA[ + (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]) + ] + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[ + (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]) + ] + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + ab_producer_state.reset_count() + peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + # + # Tma load loop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + # Conditionally wait for AB buffer empty + ab_pipeline.producer_acquire( + ab_producer_state, peek_ab_empty_status + ) + + # TMA load A/B + cute.copy( + tma_atom_a, + tAgA_slice[(None, ab_producer_state.count)], + tAsA[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=a_full_mcast_mask, + ) + cute.copy( + tma_atom_b, + tBgB_slice[(None, ab_producer_state.count)], + tBsB[(None, ab_producer_state.index)], + tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state), + mcast_mask=b_full_mcast_mask, + ) + + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + ab_producer_state.advance() + 
peek_ab_empty_status = cutlass.Boolean(1) + if ab_producer_state.count < k_block_cnt: + peek_ab_empty_status = ab_pipeline.producer_try_acquire( + ab_producer_state + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Wait A/B buffer empty + # + ab_pipeline.producer_tail(ab_producer_state) + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # + # Bar sync for retrieve tensor memory ptr from shared mem + # + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + ab_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_ab_stage + ) + acc_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.num_acc_stage + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # Set tensor memory buffer for current tile + # (MMA, MMA_M, MMA_N) + tCtAcc = tCtAcc_base[(None, None, None, acc_producer_state.index)] + + # Peek (try_wait) AB buffer full for k_block = 0 + ab_consumer_state.reset_count() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt and is_leader_cta: + 
peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # + # Wait for accumulator buffer empty + # + if is_leader_cta: + acc_pipeline.producer_acquire(acc_producer_state) + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + # + # Mma mainloop + # + for k_block in cutlass.range_dynamic(0, k_block_cnt, 1, unroll=1): + if is_leader_cta: + # Conditionally wait for AB buffer full + ab_pipeline.consumer_wait( + ab_consumer_state, peek_ab_full_status + ) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = ( + None, + None, + kphase_idx, + ab_consumer_state.index, + ) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + ab_pipeline.consumer_release(ab_consumer_state) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + ab_consumer_state.advance() + peek_ab_full_status = cutlass.Boolean(1) + if ab_consumer_state.count < k_block_cnt: + if is_leader_cta: + peek_ab_full_status = ab_pipeline.consumer_try_wait( + ab_consumer_state + ) + + # + # Async arrive accumulator buffer full + # + if is_leader_cta: + acc_pipeline.producer_commit(acc_producer_state) + acc_producer_state.advance() + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Wait for accumulator buffer empty + # + acc_pipeline.producer_tail(acc_producer_state) + # + # Specialized epilogue warps + # + if warp_idx < self.mma_warp_id: + # + # Alloc tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, + tmem_holding_buf, + is_two_cta=use_2cta_instrs, + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + 
tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Partition for epilogue + # + epi_tidx = tidx + tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = ( + self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + ) + + tTR_rC = None + tiled_copy_r2s = None + simt_atom = None + tRS_rC = None + tRS_sC = None + bSG_sC = None + bSG_gC_partitioned = None + tTR_gC_partitioned = None + if cutlass.const_expr(self.use_tma_store): + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, epi_tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC_partitioned = ( + self.epilog_gmem_copy_and_partition( + epi_tidx, tma_atom_c, tCgC, epi_tile, sC + ) + ) + else: + simt_atom, tTR_rC, tTR_gC_partitioned = ( + self.epilog_gmem_copy_and_partition( + epi_tidx, tiled_copy_t2r, tCgC, epi_tile, sC + ) + ) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + acc_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.num_acc_stage + ) + + c_pipeline = None + if cutlass.const_expr(self.use_tma_store): + # Threads/warps participating in tma store pipeline + c_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + 32 * len(self.epilog_warp_id), + 32 * len(self.epilog_warp_id), + ) + c_pipeline = utils.PipelineTmaStore.create( + 
num_stages=self.num_c_stage, + producer_group=c_producer_group, + ) + + while work_tile.is_valid_tile: + + # Get tile coord from tile scheduler + cur_tile_coord = work_tile.tile_idx + mma_tile_coord_mnl = ( + cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape), + cur_tile_coord[1], + cur_tile_coord[2], + ) + + # + # Slice to per mma tile index + # + bSG_gC = None + tTR_gC = None + if cutlass.const_expr(self.use_tma_store): + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + else: + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N) + tTR_gC = tTR_gC_partitioned[ + ( + None, + None, + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + + # Set tensor memory buffer for current tile + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[ + (None, None, None, None, None, acc_consumer_state.index) + ] + + # + # Wait for accumulator buffer full + # + acc_pipeline.consumer_wait(acc_consumer_state) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + if cutlass.const_expr(self.use_tma_store): + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + else: + tTR_gC = cute.group_modes(tTR_gC, 3, cute.rank(tTR_gC)) + + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + if cutlass.const_expr(self.use_tma_store): + # + # Convert to C type + # + acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tRS_rC.store(acc_vec) + + # + # Store C to shared memory + # + c_buffer = (num_prev_subtiles + subtile_idx) % self.num_c_stage + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, 
None, c_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + + # + # TMA store C to global memory + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, c_buffer)], + bSG_gC[(None, subtile_idx)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + c_pipeline.producer_commit() + c_pipeline.producer_acquire() + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + else: + # + # Convert to C type + # + acc_vec = tTR_rAcc.load() + acc_vec = epilogue_op(acc_vec.to(self.c_dtype)) + tTR_rC.store(acc_vec) + + # + # Store C to global memory + # + cute.copy( + simt_atom, tTR_rC, tTR_gC[(None, None, None, subtile_idx)] + ) + + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + acc_pipeline.consumer_release(acc_consumer_state) + acc_consumer_state.advance() + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Dealloc the tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads + ) + if warp_idx == self.epilog_warp_id[0]: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + # + # Wait for C store complete + # + if cutlass.const_expr(self.use_tma_store): + 
c_pipeline.producer_tail() + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = 
thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: cutlass.Int32, + sC: cute.Tensor, + ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + :type sepi: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tidx: cutlass.Int32, + atom: Union[cute.CopyAtom, cute.TiledCopy], + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make 
tiledCopy for global memory store, then use it to: + - partition register array (source) and global memory (destination) for none TMA store version; + - partition shared memory (source) and global memory (destination) for TMA store version. + + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param atom: The copy_atom_c to be used for TMA store version, or tiled_copy_t2r for none TMA store version + :type atom: cute.CopyAtom or cute.TiledCopy + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing either: + - For TMA store: (tma_atom_c, bSG_sC, bSG_gC) where: + - tma_atom_c: The TMA copy atom + - bSG_sC: The partitioned shared memory tensor C + - bSG_gC: The partitioned global tensor C + - For non-TMA store: (simt_atom, tTR_rC, tTR_gC) where: + - simt_atom: The SIMT copy atom + - tTR_rC: The register tensor C + - tTR_gC: The partitioned global tensor C + :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + if cutlass.const_expr(self.use_tma_store): + tma_atom_c = atom + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + else: + tiled_copy_t2r = atom + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + tTR_gC = thr_copy_t2r.partition_D(gC_epi) + # (T2R, T2R_M, T2R_N) + tTR_rC = cute.make_fragment( + 
tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.c_dtype + ) + simt_atom = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), self.c_dtype) + return simt_atom, tTR_rC, tTR_gC + + @staticmethod + def _compute_stages( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: Tuple[int, int, int], + a_dtype: Type[cutlass.Numeric], + b_dtype: Type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: Type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + use_tma_store: bool, + ) -> Tuple[int, int, int]: + """Computes the number of stages for A/B/C operands based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C. + :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + :param use_tma_store: Whether TMA store is enabled. 
+ :type use_tma_store: bool + + :return: A tuple containing the computed number of stages for: + (ACC stages, A/B operand stages, C stages) + :rtype: tuple[int, int, int] + """ + # Default ACC stages + num_acc_stage = 2 + + # Default C stages + num_c_stage = 2 if use_tma_store else 0 + + # Calculate smem layout and size for one stage of A, B, and C + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # a tmp 1 stage is provided + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # a tmp 1 stage is provided + ) + c_smem_layout_staged_one = ( + sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, + ) + if use_tma_store + else None + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + mbar_helpers_bytes = 1024 + c_bytes_per_stage = ( + cute.size_in_bytes(c_dtype, c_smem_layout_staged_one) + if use_tma_store + else 0 + ) + c_bytes = c_bytes_per_stage * num_c_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial C stages bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity // occupancy - (mbar_helpers_bytes + c_bytes) + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + if use_tma_store: + num_c_stage += ( + num_smem_capacity + - occupancy * ab_bytes_per_stage * num_ab_stage + - occupancy * (mbar_helpers_bytes + c_bytes) + ) // (occupancy * c_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_c_stage + + @staticmethod + def _compute_grid( + c: cute.Tensor, + cta_tile_shape_mnk: Tuple[int, int, int], + cluster_shape_mn: Tuple[int, int], + max_active_clusters: cutlass.Constexpr, + ) -> 
Tuple[utils.PersistentTileSchedulerParams, Tuple[int, int, int]]: + """Use persistent tile scheduler to compute the grid size for the output tensor C. + + :param c: The output tensor C + :type c: cute.Tensor + :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile. + :type cta_tile_shape_mnk: tuple[int, int, int] + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr + + :return: A tuple containing: + - tile_sched_params: Parameters for the persistent tile scheduler. + - grid: Grid shape for kernel launch. + :rtype: Tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]] + """ + c_shape = cute.slice_(cta_tile_shape_mnk, (None, None, 0)) + gc = cute.zipped_divide(c, tiler=c_shape) + num_ctas_mnl = gc[(0, (None, None, None))].shape + cluster_shape_mnl = (*cluster_shape_mn, 1) + + tile_sched_params = utils.PersistentTileSchedulerParams( + num_ctas_mnl, cluster_shape_mnl + ) + grid = utils.StaticPersistentTileScheduler.get_grid_shape( + tile_sched_params, max_active_clusters + ) + + return tile_sched_params, grid + + @staticmethod + def _get_tma_atom_kind( + atom_sm_cnt: cutlass.Int32, mcast: cutlass.Boolean + ) -> Union[ + cpasync.CopyBulkTensorTileG2SMulticastOp, cpasync.CopyBulkTensorTileG2SOp + ]: + """ + Select the appropriate TMA copy atom based on the number of SMs and the multicast flag. 
+ + :param atom_sm_cnt: The number of SMs + :type atom_sm_cnt: cutlass.Int32 + :param mcast: The multicast flag + :type mcast: cutlass.Boolean + + :return: The appropriate TMA copy atom kind + :rtype: cpasync.CopyBulkTensorTileG2SMulticastOp or cpasync.CopyBulkTensorTileG2SOp + + :raise ValueError: If the atom_sm_cnt is invalid + """ + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, + mma_tiler: Tuple[int, int, int], + num_acc_stage: int, + ) -> int: + """ + Compute the number of tensor memory allocation columns. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + :param num_acc_stage: The stage of the accumulator tensor. + :type num_acc_stage: int + + :return: The number of tensor memory allocation columns. 
+ :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, num_acc_stage)) + num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + return num_tmem_alloc_cols + + @staticmethod + def is_valid_dtypes( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + ) -> bool: + """ + Check if the dtypes are valid + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + + :return: True if the dtypes are valid, False otherwise + :rtype: bool + """ + is_valid = True + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + cutlass.TFloat32, + cutlass.Uint8, + cutlass.Int8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + is_valid = False + if ( + acc_dtype not in {cutlass.Float32, cutlass.Float16, cutlass.Int32} + or acc_dtype == cutlass.Float16 + and ab_dtype + not in {cutlass.Float16, cutlass.Float8E4M3FN, cutlass.Float8E5M2} + or acc_dtype == cutlass.Int32 + and ab_dtype not in {cutlass.Uint8, cutlass.Int8} + ): + is_valid = False + if ( + acc_dtype == cutlass.Float32 + and c_dtype + not in { + cutlass.Float32, + cutlass.Float16, + cutlass.BFloat16, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + or acc_dtype == cutlass.Float16 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + } + or acc_dtype == cutlass.Int32 + and c_dtype + not in { + cutlass.BFloat16, + cutlass.Float16, + cutlass.Float32, + cutlass.Int32, + cutlass.Int8, + cutlass.Uint8, + } + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: 
Tuple[int, int], + ) -> bool: + """ + Check if the mma tiler and cluster shape are valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + + :return: True if the mma tiler and cluster shape are valid, False otherwise + :rtype: bool + """ + is_valid = True + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + is_valid = False + if mma_tiler_mn[1] not in range(32, 257, 32): + is_valid = False + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + is_valid = False + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_tensor_alignment( + m: int, + n: int, + k: int, + l: int, + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the tensor alignment is valid + + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param a_major: The major axis of the A tensor + :type a_major: 
str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the problem shape is valid, False otherwise + :rtype: bool + """ + is_valid = True + + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + is_valid = False + return is_valid + + @staticmethod + def is_valid_epilog_store_option( + use_2cta_instrs: bool, + use_tma_store: bool, + m: int, + n: int, + mma_tiler_mn: Tuple[int, int], + ) -> bool: + """ + Check if the epilogue store option is valid + + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + + :return: True if the epilogue store option is valid, False otherwise + :rtype: bool + """ + + is_valid = True + # None TMA store version does not have predication, can not support OOB tiles + cta_tile_shape_mn = ( + mma_tiler_mn[0] // (2 if use_2cta_instrs else 1), + mma_tiler_mn[1], + ) + if not use_tma_store: + if not (m % cta_tile_shape_mn[0] == 0 and n % cta_tile_shape_mn[1] == 0): + is_valid = False + return is_valid + + @staticmethod + def can_implement( + ab_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + 
use_2cta_instrs: bool, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_tma_store: bool, + m: int, + n: int, + k: int, + l: int, + a_major: str, + b_major: str, + c_major: str, + ) -> bool: + """ + Check if the gemm can be implemented + + :param ab_dtype: The data type of the A and B operands + :type ab_dtype: Type[cutlass.Numeric] + :param acc_dtype: The data type of the accumulator + :type acc_dtype: Type[cutlass.Numeric] + :param c_dtype: The data type of the output tensor + :type c_dtype: Type[cutlass.Numeric] + :param use_2cta_instrs: Whether to use 2 CTA groups + :type use_2cta_instrs: bool + :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler + :type mma_tiler_mn: Tuple[int, int] + :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster + :type cluster_shape_mn: Tuple[int, int] + :param use_tma_store: Whether to use TMA store + :type use_tma_store: bool + :param m: The number of rows in the A tensor + :type m: int + :param n: The number of columns in the B tensor + :type n: int + :param k: The number of columns in the A tensor + :type k: int + :param l: The number of columns in the C tensor + :type l: int + :param a_major: The major axis of the A tensor + :type a_major: str + :param b_major: The major axis of the B tensor + :type b_major: str + :param c_major: The major axis of the C tensor + :type c_major: str + + :return: True if the gemm can be implemented, False otherwise + :rtype: bool + """ + can_implement = True + # Skip unsupported types + if not PersistentDenseGemmKernel.is_valid_dtypes(ab_dtype, acc_dtype, c_dtype): + can_implement = False + # Skip invalid mma tile shape and cluster shape + if not PersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape( + use_2cta_instrs, mma_tiler_mn, cluster_shape_mn + ): + can_implement = False + # Skip illegal problem shape for load/store alignment + if not PersistentDenseGemmKernel.is_valid_tensor_alignment( + m, n, k, l, ab_dtype, c_dtype, 
a_major, b_major, c_major + ): + can_implement = False + # Skip invalid epilogue store option + if not PersistentDenseGemmKernel.is_valid_epilog_store_option( + use_2cta_instrs, use_tma_store, m, n, mma_tiler_mn + ): + can_implement = False + return can_implement + + +def run_dense_gemm( + mnkl: Tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: Tuple[int, int], + cluster_shape_mn: Tuple[int, int], + use_2cta_instrs: bool, + use_tma_store: bool, + tolerance: float, + warmup_iterations: int = 0, + iterations: int = 1, + skip_ref_check: bool = False, +): + """ + Prepare A/B/C tensors, launch GPU kernel, and reference checking. + """ + print(f"Running Blackwell Persistent Dense GEMM test with:") + print(f"mnkl: {mnkl}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Use TMA Store: {'True' if use_tma_store else 'False'}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Unpack parameters + m, n, k, l = mnkl + + # Skip unsupported testcase + if not PersistentDenseGemmKernel.can_implement( + ab_dtype, + acc_dtype, + c_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + m, + n, + k, + l, + a_major, + b_major, + c_major, + ): + raise TypeError( + f"Unsupported testcase {ab_dtype}, {acc_dtype}, {c_dtype}, {use_2cta_instrs}, {mma_tiler_mn}, {cluster_shape_mn}, {use_tma_store}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {c_major}" + ) + + if not torch.cuda.is_available(): + raise RuntimeError("GPU is 
required to run this example!") + + torch.manual_seed(1111) + + # Create and permute tensor A/B/C + def create_and_permute_tensor( + l, mode0, mode1, is_mode0_major, dtype, is_dynamic_layout=True + ): + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + is_unsigned = dtype in {cutlass.Uint8} + # Temporarily use uint8 as torch does not support fp8 type + torch_dtype = ( + cutlass_torch.dtype(dtype) + if dtype not in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + else torch.uint8 + ) + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor + + a_ref, a_tensor, a_torch = create_and_permute_tensor( + l, m, k, a_major == "m", ab_dtype, is_dynamic_layout=True + ) + b_ref, b_tensor, b_torch = create_and_permute_tensor( + l, n, k, b_major == "n", ab_dtype, is_dynamic_layout=True + ) + c_ref, c_tensor, c_torch = create_and_permute_tensor( + l, m, n, c_major == "m", c_dtype, is_dynamic_layout=True + ) + + # 
Configure gemm kernel + gemm = PersistentDenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + + # Compute max active clusters on current device + hardware_info = cutlass.utils.HardwareInfo() + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile gemm kernel + compiled_gemm = cute.compile( + gemm, a_tensor, b_tensor, c_tensor, max_active_clusters, current_stream + ) + + # Launch GPU kernel + # Warm up + for i in range(warmup_iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, current_stream) + # Execution + for i in range(iterations): + compiled_gemm(a_tensor, b_tensor, c_tensor, current_stream) + + # Compute reference result + if not skip_ref_check: + if ab_dtype in { + cutlass.Int8, + cutlass.Uint8, + cutlass.Float8E4M3FN, + cutlass.Float8E5M2, + }: + ref = torch.einsum("mkl,nkl->mnl", a_ref.cpu(), b_ref.cpu()) + else: + ref = (torch.einsum("mkl,nkl->mnl", a_ref, b_ref)).cpu() + + # Copy gpu result back + gpu_c = c_torch.cpu() + + # Convert ref to c_type + if c_dtype == cutlass.Float32: + ref_c = ref + elif c_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN}: + # m major: (l, n, m) -> (m, n, l) + # n major: (l, m, n) -> (m, n, l) + permute_order = (1, 2, 0) if c_major == "n" else (2, 1, 0) + shape = (l, m, n) if c_major == "n" else (l, n, m) + f8_torch_tensor = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.SKIP, + ).cuda() + # Create dtype cute tensor (gpu) + ref_c_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=(1 if c_major == "n" else 0)) + ref_c_tensor.element_type = c_dtype + ref_c_tensor = 
cutlass_torch.convert_cute_tensor( + ref, + ref_c_tensor, + c_dtype, + is_dynamic_layout=True, + ) + + ref_c = f8_torch_tensor.cpu() + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + + # Reference checking ref_c and gpu_c + torch.testing.assert_close( + gpu_c, + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser( + description="Example of Dense Persistent GEMM on Blackwell." + ) + + parser.add_argument( + "--mnkl", + type=parse_comma_separated_ints, + default=(256, 256, 512, 1), + help="mnkl dimensions (comma-separated)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tile shape (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.TFloat32) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--use_tma_store", action="store_true", help="Use tma store or not" + ) + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, 
help="Warmup iterations" + ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run the kernel", + ) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if len(args.mnkl) != 4: + parser.error("--mnkl must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + run_dense_gemm( + args.mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + args.use_tma_store, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/fmha.py b/examples/python/CuTeDSL/blackwell/fmha.py new file mode 100644 index 00000000..144ba01b --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/fmha.py @@ -0,0 +1,2984 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+ +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import enum +import math +import time +from typing import Type, Tuple + +import torch +import torch.nn.functional as F +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.cute.nvgpu.tcgen05 as tcgen05 +import cutlass.utils as utils +import cutlass.torch as cutlass_torch +import cutlass.utils.blackwell_helpers as sm100_utils +from cutlass.cute.runtime import from_dlpack + +""" +A fused multi-head attention (FMHA) example for the NVIDIA Blackwell SM100 architecture using CUTE DSL + +This example demonstrates an implementation of fused multi-head attention using a TMA + Blackwell SM100 +TensorCore warp-specialized persistent kernel. The implementation integrates the Q*K^T matrix multiplication, +softmax normalization, and softmax(Q*K^T)*V into a single kernel, avoiding intermediate data movement between +global memory and shared memory, thus improving computational efficiency. 
+ +The kernel implements key optimizations including: +- Warp specialization for different computation phases (load, MMA, softmax, correction, epilogue) +- Pipeline stages between different warps for overlapping computation and memory access +- Support for different precision data types +- Optional causal masking for autoregressive models + +To run this example: + +.. code-block:: bash + + python examples/blackwell/fmha.py \ + --qk_acc_dtype Float32 --pv_acc_dtype Float32 \ + --mma_tiler_mn 128,128 \ + --q_shape 4,1024,8,64 --k_shape 4,1024,8,64 \ + --is_persistent + +The above example runs FMHA with batch size 4, sequence length 1024, 8 attention heads, and head +dimension 64. The Blackwell tcgen05 MMA tile shape is (128, 128), and the kernel uses fp16 for input/output +with fp32 for accumulation. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/fmha.py \ + --qk_acc_dtype Float32 --pv_acc_dtype Float32 \ + --mma_tiler_mn 128,128 \ + --q_shape 4,1024,8,64 --k_shape 4,1024,8,64 \ + --is_persistent --warmup_iterations 10 \ + --iterations 10 --skip_ref_check + +Constraints for this example: +* Supported head dimensions: 32, 64, and 128 +* Number of heads in Q must be divisible by number of heads in K +* mma_tiler_mn must be 128,128 +* Batch size must be the same for Q, K, and V tensors +* For causal masking, use --has_casual_mask (note: specify without =True/False) +* For persistent scheduling, use --is_persistent (note: specify without =True/False) +""" + +class FmhaStaticTileSchedulerParams: + def __init__( + self, + is_persistent: bool, + problem_shape_mbh: cute.Shape, + *, + loc=None, + ip=None, + ): + self.is_persistent = is_persistent + self.problem_shape_mbh = problem_shape_mbh + self._loc = loc + self._ip = ip + + def __extract_mlir_values__(self): + values, self._values_pos = [], [] + for obj in [self.is_persistent, self.problem_shape_mbh]: + obj_values = cutlass.extract_mlir_values(obj) + values += 
obj_values + self._values_pos.append(len(obj_values)) + return values + + def __new_from_mlir_values__(self, values): + obj_list = [] + for obj, n_items in zip( + [self.is_persistent, self.problem_shape_mbh], self._values_pos + ): + obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items])) + values = values[n_items:] + return FmhaStaticTileSchedulerParams(*(tuple(obj_list)), loc=self._loc) + + +def create_fmha_static_tile_scheduler_params( + is_persistent: bool, + problem_shape_mbh: cute.Shape, +) -> FmhaStaticTileSchedulerParams: + return FmhaStaticTileSchedulerParams(is_persistent, problem_shape_mbh) + + +class FmhaStaticTileScheduler: + + def __init__( + self, + params: FmhaStaticTileSchedulerParams, + current_work_linear_idx: cutlass.Int32, + blk_coord: cute.Coord, + grid_shape: cute.Shape, + *, + loc=None, + ip=None, + ): + self._params = params + self._blk_coord = blk_coord + self._grid_shape = grid_shape + self._is_persistent = params.is_persistent + self._current_work_linear_idx = current_work_linear_idx + self._problem_shape_mbh = cute.make_layout( + params.problem_shape_mbh, loc=loc, ip=ip + ) + self._num_blocks = cute.size(self._problem_shape_mbh, loc=loc, ip=ip) + self._is_first_block = True + self.num_persistent_sm = cute.size(grid_shape, loc=loc, ip=ip) + self._loc = loc + self._ip = ip + + # called by host + @staticmethod + def get_grid_shape( + params: FmhaStaticTileSchedulerParams, + *, + loc=None, + ip=None, + ) -> cute.Shape: + if params.is_persistent: + hardware_info = cutlass.utils.HardwareInfo() + sm_count = hardware_info.get_device_multiprocessor_count() + return ( + cutlass.min( + sm_count, cute.size(params.problem_shape_mbh, loc=loc, ip=ip) + ), + 1, + 1, + ) + else: + return params.problem_shape_mbh + + def get_current_work(self, *, loc=None, ip=None) -> utils.WorkTileInfo: + is_valid = ( + self._current_work_linear_idx < self._num_blocks + if self._is_persistent + else self._is_first_block + ) + + blk_coord = (0, 0, 0) + if 
self._is_persistent: + blk_coord = self._problem_shape_mbh.get_hier_coord( + self._current_work_linear_idx, loc=loc, ip=ip + ) + else: + blk_coord = self._blk_coord + + # cur_tile_coord is (mid, 0, (bid, hid)) + cur_tile_coord = ( + blk_coord[0], + 0, + (blk_coord[1], blk_coord[2]), + ) + + return utils.WorkTileInfo(cur_tile_coord, is_valid) + + def initial_work_tile_info(self, *, loc=None, ip=None): + return self.get_current_work(loc=loc, ip=ip) + + def advance_to_next_work(self, *, advance_count=1, loc=None, ip=None): + if self._is_persistent: + self._current_work_linear_idx += advance_count * self.num_persistent_sm + self._is_first_block = False + + def __extract_mlir_values__(self): + values = cutlass.extract_mlir_values(self._params) + values.extend(cutlass.extract_mlir_values(self._current_work_linear_idx)) + values.extend(cutlass.extract_mlir_values(self._blk_coord)) + values.extend(cutlass.extract_mlir_values(self._grid_shape)) + return values + + def __new_from_mlir_values__(self, values): + assert len(values) == 10 + new_params = cutlass.new_from_mlir_values(self._params, values[0:3]) + new_current_work_linear_idx = cutlass.new_from_mlir_values( + self._current_work_linear_idx, [values[3]] + ) + new_blk_coord = cutlass.new_from_mlir_values(self._blk_coord, values[4:7]) + new_grid_shape = cutlass.new_from_mlir_values(self._grid_shape, values[7:]) + return FmhaStaticTileScheduler( + new_params, new_current_work_linear_idx, new_blk_coord, new_grid_shape + ) + + +def create_fmha_static_tile_scheduler( + params: FmhaStaticTileSchedulerParams, + blk_coord: cute.Coord, + grid_shape: cute.Shape, +) -> FmhaStaticTileScheduler: + return FmhaStaticTileScheduler(params, blk_coord[0], blk_coord, grid_shape) + + +class MaskType(enum.Enum): + NO_MASK = enum.auto() + RESIDUAL_MASK = enum.auto() + CAUSAL_MASK = enum.auto() + + +class FusedMask: + def __init__( + self, + mask_type: MaskType, + seq_len_k: cutlass.Int32, + *, + loc=None, + ip=None, + ): + self._mask_type = 
mask_type + self._seq_len_k = seq_len_k + self._loc = loc + self._ip = ip + + def get_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if ( + self._mask_type == MaskType.NO_MASK + or self._mask_type == MaskType.RESIDUAL_MASK + ): + result = cute.ceil_div(self._seq_len_k, tile_shape[1]) + elif self._mask_type == MaskType.CAUSAL_MASK: + max_blocks_k = cute.ceil_div(self._seq_len_k, tile_shape[1]) + max_blocks_q = cute.ceil_div( + (blk_coord[0] + 1) * tile_shape[0], tile_shape[1] + ) + result = cutlass.min(max_blocks_k, max_blocks_q) + return result + + @cute.jit + def get_masked_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if self._mask_type == MaskType.NO_MASK: + result = 0 + elif self._mask_type == MaskType.RESIDUAL_MASK: + if self._seq_len_k % tile_shape[1] != 0: + result = 1 + else: + result = 0 + elif self._mask_type == MaskType.CAUSAL_MASK: + result = cute.ceil_div(tile_shape[0], tile_shape[1]) + return result + + @cute.jit + def get_unmasked_trip_count( + self, + blk_coord: cute.Coord, + tile_shape: cute.Shape, + ) -> cutlass.Int32: + result = 0 + if self._mask_type == MaskType.NO_MASK: + result = self.get_trip_count(blk_coord, tile_shape) + elif self._mask_type == MaskType.RESIDUAL_MASK: + if self._seq_len_k % tile_shape[1] != 0: + result = self.get_trip_count(blk_coord, tile_shape) - 1 + else: + result = self.get_trip_count(blk_coord, tile_shape) + elif self._mask_type == MaskType.CAUSAL_MASK: + result = self.get_trip_count( + blk_coord, tile_shape + ) - self.get_masked_trip_count(blk_coord, tile_shape) + return result + + @cute.jit + def apply_mask( + self, + acc_qk: cute.Tensor, + index_qk: cute.Tensor, + ): + if self._mask_type == MaskType.RESIDUAL_MASK: + for i in range(cute.size(acc_qk)): + pos = index_qk[i] + if pos[1] >= self._seq_len_k: + acc_qk[i] = -cutlass.Float32.inf + elif self._mask_type == MaskType.CAUSAL_MASK: + for i in 
range(cute.size(acc_qk)): + pos = index_qk[i] + if pos[0] < pos[1] or pos[1] >= self._seq_len_k: + acc_qk[i] = -cutlass.Float32.inf + + def __extract_mlir_values__(self): + values, self._values_pos = [], [] + for obj in [self._mask_type, self._seq_len_k]: + obj_values = cutlass.extract_mlir_values(obj) + values += obj_values + self._values_pos.append(len(obj_values)) + return values + + def __new_from_mlir_values__(self, values): + obj_list = [] + for obj, n_items in zip([self._mask_type, self._seq_len_k], self._values_pos): + obj_list.append(cutlass.new_from_mlir_values(obj, values[:n_items])) + values = values[n_items:] + return FusedMask(*(tuple(obj_list)), loc=self._loc) + + +def create_fused_mask( + mask_type: MaskType, + seq_len_k: cutlass.Int32, +) -> FusedMask: + return FusedMask(mask_type, seq_len_k) + + +class BlackwellFusedMultiHeadAttentionForward: + def __init__( + self, + qk_acc_dtype: Type[cutlass.Numeric], + pv_acc_dtype: Type[cutlass.Numeric], + mma_tiler: Tuple[int, int, int], + is_persistent: bool, + mask_type: MaskType, + ): + """Initializes the configuration for a Blackwell Fused Multi-Head Attention (FMHA) kernel. + + This configuration includes several key aspects: + + 1. Data Type Settings: + - qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator + - pv_acc_dtype: Data type for P*V matrix multiplication accumulator + + 2. MMA Instruction Settings: + - mma_tiler: The (M, N, K) shape of the MMA instruction unit + - qk_mma_tiler: MMA shape for Q*K^T computation + - pv_mma_tiler: MMA shape for P*V computation + + 3. 
Kernel Execution Mode: + - is_persistent: Boolean indicating whether to use persistent kernel mode + - mask_type: Specifies the type of mask to use (no mask, residual mask, or causal mask) + + :param qk_acc_dtype: Data type for Q*K^T matrix multiplication accumulator + :type qk_acc_dtype: Type[cutlass.Numeric] + :param pv_acc_dtype: Data type for P*V matrix multiplication accumulator + :type pv_acc_dtype: Type[cutlass.Numeric] + :param mma_tiler: The (M, N, K) shape of the MMA instruction + :type mma_tiler: Tuple[int, int, int] + :param is_persistent: Whether to use persistent kernel mode + :type is_persistent: bool + :param mask_type: Type of mask to use + :type mask_type: MaskType + """ + + self.qk_acc_dtype = qk_acc_dtype + self.pv_acc_dtype = pv_acc_dtype + self.cta_tiler = ( + 2 * mma_tiler[0], # 2 Q tile per CTA + mma_tiler[1], + mma_tiler[2], + ) + self.qk_mma_tiler = mma_tiler + self.pv_mma_tiler = ( + mma_tiler[0], + mma_tiler[2], + mma_tiler[1], + ) + self.cluster_shape_mn = (1, 1) + self.is_persistent = is_persistent + self.mask_type = mask_type + + self.softmax0_warp_ids = (0, 1, 2, 3) + self.softmax1_warp_ids = (4, 5, 6, 7) + self.correction_warp_ids = (8, 9, 10, 11) + self.mma_warp_id = 12 + self.load_warp_id = 13 + self.epilogue_warp_id = 14 + self.empty_warp_id = 15 + SM100_TMEM_CAPACITY_COLUMNS = 512 + self.tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS + + self.threads_per_warp = 32 + self.threads_per_cta = self.threads_per_warp * len( + ( + *self.softmax0_warp_ids, + *self.softmax1_warp_ids, + *self.correction_warp_ids, + self.mma_warp_id, + self.load_warp_id, + self.epilogue_warp_id, + self.empty_warp_id, + ) + ) + + self.cta_sync_bar_id = 0 + self.tmem_alloc_sync_bar_id = 1 + + self.tmem_s0_offset = 0 + self.tmem_s1_offset = 128 + self.tmem_o0_offset = 256 + self.tmem_o1_offset = 384 + self.tmem_p0_offset = 32 + self.tmem_p1_offset = 160 + + # vec buffer for row_max & row_sum + self.tmem_vec0_offset = 0 + self.tmem_vec1_offset = 128 + + 
self.num_regs_softmax = 192 + self.num_regs_correction = 96 + self.num_regs_other = 32 + self.num_regs_empty = 24 + + self.buffer_align_bytes = 1024 + + num_warps_per_warpgroup = 4 + self.softmax_warpgroup_count = ( + len((*self.softmax0_warp_ids, *self.softmax1_warp_ids)) + // num_warps_per_warpgroup + ) + + def _setup_attributes(self): + """Set up configurations and parameters for the FMHA kernel operation. + + This method initializes and configures various attributes required for the + execution of the fused multi-head attention kernel, mainly about the pipeline stages: + + - Sets up staging parameters for Q, K, V inputs and accumulator data + - Configures pipeline stages for softmax, correction, and epilogue operations + """ + + self.q_stage = 2 + self.kv_stage = 4 if self.q_dtype.width == 8 else 3 + self.acc_stage = 1 + self.softmax_corr_stage = 1 + self.mma_corr_stage = 2 + self.mma_softmax_stage = 1 + self.epi_stage = 2 + + @cute.jit + def __call__( + self, + q: cute.Tensor, + k: cute.Tensor, + v: cute.Tensor, + o: cute.Tensor, + scale_softmax_log2: cutlass.Float32, + scale_output: cutlass.Float32, + stream: cuda.CUstream, + ): + """Execute the Fused Multi-Head Attention operation on the provided tensors. + + This method prepares the input tensors for processing, validates their shapes and types, + configures the computation parameters, and launches the CUDA kernel. + + The method handles: + 1. Tensor layout transformations for specific memory access patterns + 2. Validation of tensor shapes and data types + 3. Initialization of hardware-specific parameters and memory layouts + 4. Configuration of TMA (Tensor Memory Access) operations + 5. Grid and work scheduling computation + 6. 
Kernel launch with appropriate parameters + + :param q: The query tensor with shape [seq_len_q, d_head, h_q, b] + :type q: cute.Tensor + :param k: The key tensor with shape [seq_len_k, d_head, h_k, b] + :type k: cute.Tensor + :param v: The value tensor with shape [seq_len_k, d_head, h_v, b] + :type v: cute.Tensor + :param o: The output tensor with shape [seq_len_q, d_head, h_q, b] + :type o: cute.Tensor + :param scale_softmax_log2: The log2 scale factor for softmax + :type scale_softmax_log2: cutlass.Float32 + :param scale_output: The scale factor for the output + :type scale_output: cutlass.Float32 + :param stream: The CUDA stream to execute the kernel on + :type stream: cuda.CUstream + :raises TypeError: If tensor data types don't match or aren't supported + :raises RuntimeError: If tensor layouts aren't in supported formats + """ + + # setup static attributes before smem/grid/tma computation + self.q_dtype = q.element_type + self.k_dtype = k.element_type + self.v_dtype = v.element_type + self.o_dtype = o.element_type + + # (s, d, 1, h_k, b) -> (s, d, ((h_r, h_k), b)) + k = cute.make_tensor( + k.iterator, + cute.make_layout( + (k.shape[0], k.shape[1], ((q.shape[2], k.shape[3]), k.shape[4])), + stride=( + k.layout.stride[0], + k.layout.stride[1], + ((0, k.layout.stride[3]), k.layout.stride[4]), + ), + ), + ) + # (s, d, 1, h_k, b) -> (d, s, ((h_r, h_k), b)) + v = cute.make_tensor( + v.iterator, + cute.make_layout( + (v.shape[1], v.shape[0], ((q.shape[2], v.shape[3]), v.shape[4])), + stride=( + v.layout.stride[1], + v.layout.stride[0], + ((0, v.layout.stride[3]), v.layout.stride[4]), + ), + ), + ) + + # (s, d, h_r, h_k, b) -> (s, d, ((h_r, h_k), b)) + q = cute.group_modes(cute.group_modes(q, begin=2, end=4), begin=2, end=4) + o = cute.group_modes(cute.group_modes(o, begin=2, end=4), begin=2, end=4) + + self.q_major_mode = utils.LayoutEnum.from_tensor(q).mma_major_mode() + self.k_major_mode = utils.LayoutEnum.from_tensor(k).mma_major_mode() + self.v_major_mode = 
utils.LayoutEnum.from_tensor(v).mma_major_mode() + self.o_layout = utils.LayoutEnum.from_tensor(o) + + if cutlass.const_expr(self.q_major_mode != tcgen05.OperandMajorMode.K): + raise RuntimeError("The layout of q is not supported") + if cutlass.const_expr(self.k_major_mode != tcgen05.OperandMajorMode.K): + raise RuntimeError("The layout of k is not supported") + if cutlass.const_expr(self.v_major_mode != tcgen05.OperandMajorMode.MN): + raise RuntimeError("The layout of v is not supported") + + # check type consistency + if cutlass.const_expr(self.q_dtype != self.k_dtype): + raise TypeError(f"Type mismatch: {self.q_dtype} != {self.k_dtype}") + if cutlass.const_expr(self.q_dtype != self.v_dtype): + raise TypeError(f"Type mismatch: {self.q_dtype} != {self.v_dtype}") + self._setup_attributes() + + cta_group = tcgen05.CtaGroup.ONE + # the intermediate tensor p is from tmem & k-major + p_source = tcgen05.OperandSource.TMEM + p_major_mode = tcgen05.OperandMajorMode.K + qk_tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.q_dtype, + self.q_major_mode, + self.k_major_mode, + self.qk_acc_dtype, + cta_group, + self.qk_mma_tiler[:2], + ) + pv_tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.v_dtype, + p_major_mode, + self.v_major_mode, + self.pv_acc_dtype, + cta_group, + self.pv_mma_tiler[:2], + p_source, + ) + + self.cluster_shape_mnk = (*self.cluster_shape_mn, 1) + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout(self.cluster_shape_mnk), + (qk_tiled_mma.thr_id.shape,), + ) + + self.epi_tile = self.pv_mma_tiler[:2] + + + q_smem_layout_staged = sm100_utils.make_smem_layout_a( + qk_tiled_mma, + self.qk_mma_tiler, + self.q_dtype, + self.q_stage, + ) + k_smem_layout_staged = sm100_utils.make_smem_layout_b( + qk_tiled_mma, + self.qk_mma_tiler, + self.k_dtype, + self.kv_stage, + ) + p_tmem_layout_staged = sm100_utils.make_smem_layout_a( + pv_tiled_mma, + self.pv_mma_tiler, + self.q_dtype, + self.acc_stage, + ) + v_smem_layout_staged = 
sm100_utils.make_smem_layout_b( + pv_tiled_mma, + self.pv_mma_tiler, + self.v_dtype, + self.kv_stage, + ) + o_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.o_dtype, + self.o_layout, + self.epi_tile, + self.epi_stage, + ) + + # TMA load for Q + tma_load_op = cute.nvgpu.cpasync.CopyBulkTensorTileG2SOp(cta_group) + tma_store_op = cute.nvgpu.cpasync.CopyBulkTensorTileS2GOp() + + q_smem_layout = cute.select(q_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_q, tma_tensor_q = cute.nvgpu.make_tma_tile_atom_A( + tma_load_op, + q, + q_smem_layout, + self.qk_mma_tiler, + qk_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # TMA load for K + k_smem_layout = cute.select(k_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_k, tma_tensor_k = cute.nvgpu.make_tma_tile_atom_B( + tma_load_op, + k, + k_smem_layout, + self.qk_mma_tiler, + qk_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + # TMA load for V + v_smem_layout = cute.select(v_smem_layout_staged, mode=[0, 1, 2]) + tma_atom_v, tma_tensor_v = cute.nvgpu.make_tma_tile_atom_B( + tma_load_op, + v, + v_smem_layout, + self.pv_mma_tiler, + pv_tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + o_cta_v_layout = cute.composition( + cute.make_identity_layout(o.shape), self.epi_tile + ) + o_smem_layout = cute.select(o_smem_layout_staged, mode=[0, 1]) + + tma_atom_o, tma_tensor_o = cute.nvgpu.cpasync.make_tma_tile_atom( + tma_store_op, + o, + o_smem_layout, + o_cta_v_layout, + ) + + q_copy_size = cute.size_in_bytes(self.q_dtype, q_smem_layout) + k_copy_size = cute.size_in_bytes(self.k_dtype, k_smem_layout) + self.tma_copy_q_bytes = q_copy_size + self.tma_copy_kv_bytes = k_copy_size + + self.tile_sched_params, grid = self._compute_grid( + o, + self.cta_tiler, + self.is_persistent, + ) + + @cute.struct + class SharedStorage: + # Pipeline barriers + load_q_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.q_stage * 2] + load_kv_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.kv_stage * 2] + mma_s0_mbar_ptr: 
cute.struct.MemRange[ + cutlass.Int64, self.mma_softmax_stage * 2 + ] + mma_s1_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.mma_softmax_stage * 2 + ] + s0_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_corr_stage * 2 + ] + s1_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_corr_stage * 2 + ] + s0_s1_sequence_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.softmax_warpgroup_count + ] + corr_epi_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.epi_stage * 2] + mma_corr_mbar_ptr: cute.struct.MemRange[ + cutlass.Int64, self.mma_corr_stage * 2 + ] + max_reg_setting_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1] + # Tmem holding buffer + tmem_holding_buf: cutlass.Int32 + # Smem tensors + sO: cute.struct.Align[ + cute.struct.MemRange[self.o_dtype, cute.cosize(o_smem_layout_staged)], + self.buffer_align_bytes, + ] + sQ: cute.struct.Align[ + cute.struct.MemRange[self.q_dtype, cute.cosize(q_smem_layout_staged)], + self.buffer_align_bytes, + ] + sK: cute.struct.Align[ + cute.struct.MemRange[self.k_dtype, cute.cosize(k_smem_layout_staged)], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + fused_mask = create_fused_mask(self.mask_type, k.shape[0]) + + # Launch the kernel synchronously + self.kernel( + qk_tiled_mma, + pv_tiled_mma, + tma_atom_q, + tma_tensor_q, + tma_atom_k, + tma_tensor_k, + tma_atom_v, + tma_tensor_v, + tma_atom_o, + tma_tensor_o, + scale_softmax_log2, + scale_output, + q_smem_layout_staged, + k_smem_layout_staged, + p_tmem_layout_staged, + v_smem_layout_staged, + o_smem_layout_staged, + self.tile_sched_params, + fused_mask, + ).launch( + grid=grid, + block=[self.threads_per_cta, 1, 1], + cluster=self.cluster_shape_mnk, + smem=self.shared_storage.size_in_bytes(), + stream=stream, + min_blocks_per_mp=1, + ) + + # GPU device kernel + @cute.kernel + def kernel( + self, + qk_tiled_mma: cute.TiledMma, + pv_tiled_mma: 
cute.TiledMma, + tma_atom_q: cute.CopyAtom, + mQ_qdl: cute.Tensor, + tma_atom_k: cute.CopyAtom, + mK_kdl: cute.Tensor, + tma_atom_v: cute.CopyAtom, + mV_dkl: cute.Tensor, + tma_atom_o: cute.CopyAtom, + mO_qdl: cute.Tensor, + scale_softmax_log2: cutlass.Float32, + scale_output: cutlass.Float32, + q_smem_layout_staged: cute.ComposedLayout, + k_smem_layout_staged: cute.ComposedLayout, + p_tmem_layout_staged: cute.ComposedLayout, + v_smem_layout_staged: cute.ComposedLayout, + o_smem_layout_staged: cute.ComposedLayout, + tile_sched_params: FmhaStaticTileSchedulerParams, + fused_mask: FusedMask, + ): + """The device kernel implementation of the Fused Multi-Head Attention. + + This kernel coordinates multiple specialized warps to perform different phases of the FMHA computation: + 1. Load warp: Loads Q, K, V data from global memory to shared memory using TMA + 2. MMA warp: Performs matrix multiplications (Q*K^T and P*V) + 3. Softmax warps: Compute softmax normalization on attention scores + 4. Correction warps: Apply adjustments to intermediate results + 5. Epilogue warp: Handles final output transformation and storage + + The kernel implements a complex pipeline with overlapping computation and memory operations, + using tensor memory access (TMA) for efficient data loading, warp specialization for different + computation phases, and optional attention masking. 
+ + :param qk_tiled_mma: Tiled MMA for Q*K^T + :type qk_tiled_mma: cute.TiledMma + :param pv_tiled_mma: Tiled MMA for P*V + :type pv_tiled_mma: cute.TiledMma + :param tma_atom_q: TMA copy atom for query tensor + :type tma_atom_q: cute.CopyAtom + :param mQ_qdl: Partitioned query tensor + :type mQ_qdl: cute.Tensor + :param tma_atom_k: TMA copy atom for key tensor + :type tma_atom_k: cute.CopyAtom + :param mK_kdl: Partitioned key tensor + :type mK_kdl: cute.Tensor + :param tma_atom_v: TMA copy atom for value tensor + :type tma_atom_v: cute.CopyAtom + :param mV_dkl: Partitioned value tensor + :type mV_dkl: cute.Tensor + :param tma_atom_o: TMA copy atom for output tensor + :type tma_atom_o: cute.CopyAtom + :param mO_qdl: Partitioned output tensor + :type mO_qdl: cute.Tensor + :param scale_softmax_log2: The log2 scale factor for softmax + :type scale_softmax_log2: cutlass.Float32 + :param scale_output: The scale factor for the output + :type scale_output: cutlass.Float32 + :param q_smem_layout_staged: Shared memory layout for query tensor + :type q_smem_layout_staged: cute.ComposedLayout + :param k_smem_layout_staged: Shared memory layout for key tensor + :type k_smem_layout_staged: cute.ComposedLayout + :param p_tmem_layout_staged: Tensor memory layout for probability matrix + :type p_tmem_layout_staged: cute.ComposedLayout + :param v_smem_layout_staged: Shared memory layout for value tensor + :type v_smem_layout_staged: cute.ComposedLayout + :param o_smem_layout_staged: Shared memory layout for output tensor + :type o_smem_layout_staged: cute.ComposedLayout + :param tile_sched_params: Scheduling parameters for work distribution + :type tile_sched_params: FmhaStaticTileSchedulerParams + :param fused_mask: Masking configuration (causal/residual/none) + :type fused_mask: FusedMask + """ + + warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx()) + # coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # Alloc + smem = utils.SmemAllocator() + storage = 
smem.allocate(self.shared_storage) + + load_q_pipeline = self.make_and_init_load_q_pipeline( + storage.load_q_mbar_ptr.data_ptr() + ) + load_kv_pipeline = self.make_and_init_load_kv_pipeline( + storage.load_kv_mbar_ptr.data_ptr() + ) + mma_s0_pipeline = self.make_and_init_mma_si_pipeline( + storage.mma_s0_mbar_ptr.data_ptr() + ) + mma_s1_pipeline = self.make_and_init_mma_si_pipeline( + storage.mma_s1_mbar_ptr.data_ptr() + ) + s0_corr_pipeline = self.make_and_init_si_corr_pipeline( + storage.s0_corr_mbar_ptr.data_ptr() + ) + s1_corr_pipeline = self.make_and_init_si_corr_pipeline( + storage.s1_corr_mbar_ptr.data_ptr() + ) + corr_epi_pipeline = self.make_and_init_corr_epi_pipeline( + storage.corr_epi_mbar_ptr.data_ptr() + ) + mma_corr_pipeline = self.make_and_init_mma_corr_pipeline( + storage.mma_corr_mbar_ptr.data_ptr() + ) + s0_s1_sequence_pipeline = self.make_and_init_si_sequence_pipeline( + storage.s0_s1_sequence_mbar_ptr.data_ptr() + ) + max_reg_setting_mbar_ptr = storage.max_reg_setting_mbar_ptr.data_ptr() + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr() + + # Correction & Epilogue & tmem barrier init + if warp_idx == self.empty_warp_id: + cute.arch.mbarrier_init_arrive_cnt( + max_reg_setting_mbar_ptr, + self.threads_per_warp + * len( + ( + self.empty_warp_id, + self.load_warp_id, + self.mma_warp_id, + self.epilogue_warp_id, + *self.correction_warp_ids, + ) + ), + ) + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, + self.threads_per_warp + * len( + ( + *self.softmax0_warp_ids, + *self.softmax1_warp_ids, + *self.correction_warp_ids, + ) + ), + ) + + cute.arch.mbarrier_init_fence() + + # Generate smem tensor Q/K/V/O + # (MMA, MMA_Q, MMA_D, PIPE) + sQ = storage.sQ.get_tensor( + q_smem_layout_staged.outer, swizzle=q_smem_layout_staged.inner + ) + # (MMA, MMA_K, MMA_D, PIPE) + sK = storage.sK.get_tensor( + k_smem_layout_staged.outer, swizzle=k_smem_layout_staged.inner + ) + # (MMA, MMA_K, MMA_D, PIPE) + # Strip swizzle info to reuse 
smem + sV_ptr = cute.recast_ptr(sK.iterator, v_smem_layout_staged.inner) + sV = cute.make_tensor(sV_ptr, v_smem_layout_staged.outer) + + sO = storage.sO.get_tensor( + o_smem_layout_staged.outer, swizzle=o_smem_layout_staged.inner + ) + + # Local tile partition global tensors + # (bM, bK, loopM, loopK, loopL) need to check + gQ_qdl = cute.flat_divide(mQ_qdl, cute.select(self.qk_mma_tiler, mode=[0, 2])) + qk_thr_mma = qk_tiled_mma.get_slice(0) # default 1sm + tSgQ_qdl = qk_thr_mma.partition_A(gQ_qdl) + + tQsQ, tQgQ_qdl = cute.nvgpu.cpasync.tma_partition( + tma_atom_q, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sQ, 0, 3), + cute.group_modes(tSgQ_qdl, 0, 3), + ) + + gK_kdl = cute.flat_divide(mK_kdl, cute.select(self.qk_mma_tiler, mode=[1, 2])) + tSgK_kdl = qk_thr_mma.partition_B(gK_kdl) + tKsK, tKgK_kdl = cute.nvgpu.cpasync.tma_partition( + tma_atom_k, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sK, 0, 3), + cute.group_modes(tSgK_kdl, 0, 3), + ) + + # (bM, bN, loopM, loopN, loopL) + gV_dkl = cute.flat_divide(mV_dkl, cute.select(self.pv_mma_tiler, mode=[1, 2])) + + pv_thr_mma = pv_tiled_mma.get_slice(0) # default 1sm + tSgV_dkl = pv_thr_mma.partition_B(gV_dkl) + tVsV, tVgV_dkl = cute.nvgpu.cpasync.tma_partition( + tma_atom_v, + 0, # no multicast + cute.make_layout(1), + cute.group_modes(sV, 0, 3), + cute.group_modes(tSgV_dkl, 0, 3), + ) + + tSrQ = qk_thr_mma.make_fragment_A(sQ) + tSrK = qk_thr_mma.make_fragment_B(sK) + tOrV = pv_thr_mma.make_fragment_B(sV) + + gO_qdl = cute.flat_divide(mO_qdl, cute.select(self.pv_mma_tiler, mode=[0, 1])) + + qk_acc_shape = qk_thr_mma.partition_shape_C( + (self.qk_mma_tiler[0], self.qk_mma_tiler[1]) + ) + tStS = qk_thr_mma.make_fragment_C(qk_acc_shape) + + pv_acc_shape = pv_thr_mma.partition_shape_C( + (self.pv_mma_tiler[0], self.pv_mma_tiler[1]) + ) + tOtO = pv_thr_mma.make_fragment_C(pv_acc_shape) + + tStS0 = cute.make_tensor(tStS.iterator + self.tmem_s0_offset, tStS.layout) + tStS1 = 
cute.make_tensor(tStS.iterator + self.tmem_s1_offset, tStS.layout) + + tOtO0 = cute.make_tensor(tOtO.iterator + self.tmem_o0_offset, tOtO.layout) + tOtO1 = cute.make_tensor(tOtO.iterator + self.tmem_o1_offset, tOtO.layout) + + tP = cute.make_tensor(tStS.iterator, p_tmem_layout_staged.outer) + tOrP = pv_thr_mma.make_fragment_A(tP)[None, None, None, 0] + + tOrP0 = cute.make_tensor( + tOrP.iterator + + self.qk_acc_dtype.width // self.q_dtype.width * self.tmem_p0_offset, + tOrP.layout, + ) + tOrP1 = cute.make_tensor( + tOrP.iterator + + self.qk_acc_dtype.width // self.q_dtype.width * self.tmem_p1_offset, + tOrP.layout, + ) + + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, + number_of_threads=self.threads_per_cta, + ) + + # /////////////////////////////////////////////////////////////////////////////// + # EMPTY + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.empty_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_empty) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # LOAD + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.load_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + q_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.q_stage + ) + kv_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.kv_stage + ) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + tQgQ = tQgQ_qdl[None, None, 0, curr_block_coord[2]] + tKgK = tKgK_kdl[None, None, 0, curr_block_coord[2]] + tVgV = tVgV_dkl[None, 0, None, curr_block_coord[2]] + + # 
Q0 + q0_coord = 2 * curr_block_coord[0] + load_q_pipeline.producer_acquire(q_producer_state) + cute.copy( + tma_atom_q, + tQgQ[None, q0_coord], + tQsQ[None, q_producer_state.index], + tma_bar_ptr=load_q_pipeline.producer_get_barrier(q_producer_state), + ) + q_producer_state.advance() + + # K0 + kv_coord = 0 # seqlen_kv_loop + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_k, + tKgK[None, kv_coord], + tKsK[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + + # Q1 + q1_coord = q0_coord + 1 + load_q_pipeline.producer_acquire(q_producer_state) + cute.copy( + tma_atom_q, + tQgQ[None, q1_coord], + tQsQ[None, q_producer_state.index], + tma_bar_ptr=load_q_pipeline.producer_get_barrier(q_producer_state), + ) + q_producer_state.advance() + + # V0 + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_v, + tVgV[None, kv_coord], + tVsV[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + kv_coord += 1 + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # Ki + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_k, + tKgK[None, kv_coord], + tKsK[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + # Vi + load_kv_pipeline.producer_acquire(kv_producer_state) + cute.copy( + tma_atom_v, + tVgV[None, kv_coord], + tVsV[None, kv_producer_state.index], + tma_bar_ptr=load_kv_pipeline.producer_get_barrier( + kv_producer_state + ), + ) + kv_producer_state.advance() + kv_coord += 1 + # End of seqlen_kv loop + + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler 
loop + + # /////////////////////////////////////////////////////////////////////////////// + # MMA + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.mma_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + # Alloc tmem buffer + tmem_alloc_cols = cutlass.Int32(self.tmem_alloc_cols) + cute.arch.alloc_tmem(tmem_alloc_cols, storage.tmem_holding_buf) + cute.arch.barrier( + barrier_id=self.tmem_alloc_sync_bar_id, + number_of_threads=self.threads_per_warp, + ) + mma_q_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.q_stage + ) + mma_kv_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.kv_stage + ) + mma_q_release_state = mma_q_consumer_state.clone() + mma_kv_release_state = mma_kv_consumer_state.clone() + mma_s0_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_softmax_stage + ) + mma_s1_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_softmax_stage + ) + mma_corr_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.mma_corr_stage + ) + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + # GEMM_QK00 (Q0 * K0 -> S0) + # 1. wait for Q0 + load_q_pipeline.consumer_wait(mma_q_consumer_state) + tSrQ0 = tSrQ[None, None, None, mma_q_consumer_state.index] + mma_q_consumer_state.advance() + # 2. wait for K0 + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tSrK0 = tSrK[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 3. acquire empty S0 buffer + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. 
gemm + num_kphases = cute.size(tSrQ0, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS0, + tSrQ0[kphase_coord], + tSrK0[kphase_coord], + tStS0, + ) + # 5. release S0 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + # End of GEMM (Q0 * K0 -> S0) + + # GEMM_QK10 (Q1 * K0 -> S1), K0 is ready in GEMM_QK00 + # 1. wait for Q1 + load_q_pipeline.consumer_wait(mma_q_consumer_state) + tSrQ1 = tSrQ[None, None, None, mma_q_consumer_state.index] + mma_q_consumer_state.advance() + # 2. acquire empty S1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. gemm + num_kphases = cute.size(tSrQ1, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS1, + tSrQ1[kphase_coord], + tSrK0[kphase_coord], + tStS1, + ) + # 4. release S1 + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + # 5. release K0 + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM (Q1 * K0 -> S1) + # Note: Q0 & Q1 are still needed in the seqlen_kv loop + # so we need to release them after the seqlen_kv loop + + # GEMM_PV00 (P0 * V0 -> O0_partial), O0 needs to be accumulated in the seqlen_kv loop + # 1. wait for V0 + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tOrVi = tOrV[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. acquire corrected O0_partial + # Note: acquire corr first to take it out of the critical + # path since softmax takes longer + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 3. 
acquire P0 + # this acquire returns the ownership of all of S0 to the mma warp + # including the P0 part (inplaced in S0) + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. gemm + num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + pv_tiled_mma, + tOtO0, + tOrP0[kphase_coord], + tOrVi[kphase_coord], + tOtO0, + ) + # 5. release accumulated O0_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # End of GEMM_PV00 (P0 * V0 -> O0_partial) + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + # O1 hasn't been accumulated yet, its first MMA calculation doesn't need to accumulate + pv_whether_acc = False + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # GEMM_QK0i (Q0 * Ki -> S0) + # 1. wait for Ki + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tSrKi = tSrK[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. gemm + inner_num_kphases = cute.size(tSrQ0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS0, + tSrQ0[kphase_coord], + tSrKi[kphase_coord], + tStS0, + ) + # 3. release S0 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + # End of GEMM_QK0i (Q0 * Ki -> S0) + + # GEMM_PV1(i-1) (P1 * V(i-1) -> O1_partial), V(i-1) is ready in GEMM_PV0(i-1) + # 1. acquire corrected O1_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 2. acquire P1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. 
gemm + inner_num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, pv_whether_acc) + cute.gemm( + pv_tiled_mma, + tOtO1, + tOrP1[kphase_coord], + tOrVi[kphase_coord], + tOtO1, + ) + pv_whether_acc = True + # 4. release accumulated O1_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # 5. release V(i-1) + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_PV1(i-1) (P1 * V(i-1) -> O1_partial) + + # GEMM_QK1i (Q1 * Ki -> S1), Q1 is ready in GEMM_QK10; Ki is ready in GEMM_QK0i + # 1. gemm + inner_num_kphases = cute.size(tSrQ1, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + qk_tiled_mma.set(tcgen05.Field.ACCUMULATE, kphase_idx != 0) + cute.gemm( + qk_tiled_mma, + tStS1, + tSrQ1[kphase_coord], + tSrKi[kphase_coord], + tStS1, + ) + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + # 2. release Ki + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_QK1i (Q1 * Ki -> S1) + + # GEMM_PV0i (P0 * Vi -> O0_partial) + # 1. wait for Vi + load_kv_pipeline.consumer_wait(mma_kv_consumer_state) + tOrVi = tOrV[None, None, None, mma_kv_consumer_state.index] + mma_kv_consumer_state.advance() + # 2. acquire corrected O0_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 3. acquire P0 + mma_s0_pipeline.producer_acquire(mma_s0_producer_state) + # 4. gemm + inner_num_kphases = cute.size(tOrP0, mode=[2]) + for kphase_idx in range(inner_num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + cute.gemm( + pv_tiled_mma, + tOtO0, + tOrP0[kphase_coord], + tOrVi[kphase_coord], + tOtO0, + ) + # 5. 
release accumulated O0_partial + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # End of GEMM_PV0i (P0 * Vi -> O0_partial) + # End of seqlen_kv loop + + # release Q0 & Q1 + load_q_pipeline.consumer_release(mma_q_release_state) + mma_q_release_state.advance() + load_q_pipeline.consumer_release(mma_q_release_state) + mma_q_release_state.advance() + + # GEMM_PV1(i_end) (P1 * Vi_end -> O1) + # 1. acquire corrected O1_partial + mma_corr_pipeline.producer_acquire(mma_corr_producer_state) + # 2. acquire P1 + mma_s1_pipeline.producer_acquire(mma_s1_producer_state) + # 3. gemm + num_kphases = cute.size(tOrP1, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx) + pv_tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + cute.gemm( + pv_tiled_mma, + tOtO1, + tOrP1[kphase_coord], + tOrVi[kphase_coord], + tOtO1, + ) + # 4. commit accumulated O1 + mma_corr_pipeline.producer_commit(mma_corr_producer_state) + mma_corr_producer_state.advance() + # 5. 
release Vi_end + load_kv_pipeline.consumer_release(mma_kv_release_state) + mma_kv_release_state.advance() + # End of GEMM_PV1(i_end) (P1 * Vi_end -> O1) + + # Commit S0 and S1 + mma_s0_pipeline.producer_commit(mma_s0_producer_state) + mma_s0_producer_state.advance() + mma_s1_pipeline.producer_commit(mma_s1_producer_state) + mma_s1_producer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + # dealloc tmem buffer + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + tmem_alloc_cols = cutlass.Int32(self.tmem_alloc_cols) + # Retrieving tmem ptr and make acc + tmem_ptr = cute.arch.retrieve_tmem_ptr( + cutlass.Float32, + alignment=16, + ptr_to_buffer_holding_addr=storage.tmem_holding_buf, + ) + + cute.arch.dealloc_tmem(tmem_ptr, tmem_alloc_cols) + + # /////////////////////////////////////////////////////////////////////////////// + # Epilogue + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx == self.epilogue_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_other) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + corr_epi_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.epi_stage + ) + corr_epi_release_state = corr_epi_consumer_state.clone() + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + + o0_coord = 2 * curr_block_coord[0] + o1_coord = o0_coord + 1 + gO = gO_qdl[None, None, None, 0, curr_block_coord[2]] + tOsO, tOgO = cute.nvgpu.cpasync.tma_partition( + tma_atom_o, + 0, + cute.make_layout(1), + cute.group_modes(sO, 0, 2), + cute.group_modes(gO, 0, 2), + ) + + # O0 O1 using the same pipeline + # wait from corr, issue tma store on smem + # O0 + # 1. 
wait for O0 final + corr_epi_pipeline.consumer_wait(corr_epi_consumer_state) + corr_epi_consumer_state.advance() + # 2. copy O0 to gmem + cute.copy(tma_atom_o, tOsO[None, 0], tOgO[None, o0_coord]) + cute.arch.cp_async_bulk_commit_group() + # O1 + # 1. wait for O1 final + corr_epi_pipeline.consumer_wait(corr_epi_consumer_state) + corr_epi_consumer_state.advance() + # 2. copy O1 to gmem + cute.copy(tma_atom_o, tOsO[None, 1], tOgO[None, o1_coord]) + cute.arch.cp_async_bulk_commit_group() + + # Ensure O0 buffer is ready to be released + cute.arch.cp_async_bulk_wait_group(1, read=True) + corr_epi_pipeline.consumer_release(corr_epi_release_state) + corr_epi_release_state.advance() + # Ensure O1 buffer is ready to be released + cute.arch.cp_async_bulk_wait_group(0, read=True) + corr_epi_pipeline.consumer_release(corr_epi_release_state) + corr_epi_release_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + # /////////////////////////////////////////////////////////////////////////////// + # Softmax0 + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx < self.softmax1_warp_ids[0]: + # increase register after decreasing + cute.arch.mbarrier_wait(max_reg_setting_mbar_ptr, 0) + cute.arch.warpgroup_reg_alloc(self.num_regs_softmax) + + self.softmax( + stage=0, + scale_softmax_log2=scale_softmax_log2, + qk_thr_mma=qk_thr_mma, + tStS=tStS, + tStSi=tStS0, + mma_si_pipeline=mma_s0_pipeline, + si_corr_pipeline=s0_corr_pipeline, + s0_s1_sequence_pipeline=s0_s1_sequence_pipeline, + tile_sched_params=tile_sched_params, + fused_mask=fused_mask, + ) + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # Softmax1 + # /////////////////////////////////////////////////////////////////////////////// + if ( + warp_idx < self.correction_warp_ids[0] + and 
warp_idx >= self.softmax1_warp_ids[0] + ): + # increase register after decreasing + cute.arch.mbarrier_wait(max_reg_setting_mbar_ptr, 0) + cute.arch.warpgroup_reg_alloc(self.num_regs_softmax) + + self.softmax( + stage=1, + scale_softmax_log2=scale_softmax_log2, + qk_thr_mma=qk_thr_mma, + tStS=tStS, + tStSi=tStS1, + mma_si_pipeline=mma_s1_pipeline, + si_corr_pipeline=s1_corr_pipeline, + s0_s1_sequence_pipeline=s0_s1_sequence_pipeline, + tile_sched_params=tile_sched_params, + fused_mask=fused_mask, + ) + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + # /////////////////////////////////////////////////////////////////////////////// + # Correction + # /////////////////////////////////////////////////////////////////////////////// + if warp_idx >= self.correction_warp_ids[0] and warp_idx < self.mma_warp_id: + cute.arch.warpgroup_reg_dealloc(self.num_regs_correction) + cute.arch.mbarrier_arrive(max_reg_setting_mbar_ptr) + + s0_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.softmax_corr_stage + ) + s1_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.softmax_corr_stage + ) + o_corr_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.mma_corr_stage + ) + corr_epi_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.epi_stage + ) + + cS = cute.make_identity_tensor((self.qk_mma_tiler[0], self.qk_mma_tiler[1])) + tScS = qk_thr_mma.partition_C(cS) + + tStS_vec_layout = cute.composition(tStS.layout, cute.make_layout((128, 2))) + + tStS_vec0 = cute.make_tensor( + tStS.iterator + self.tmem_vec0_offset, tStS_vec_layout + ) + tStS_vec1 = cute.make_tensor( + tStS.iterator + self.tmem_vec1_offset, tStS_vec_layout + ) + + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tmem_load_v_atom = cute.make_copy_atom( + 
tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(2)), + self.qk_acc_dtype, + ) + + tiled_tmem_load_vec = tcgen05.make_tmem_copy(tmem_load_v_atom, tStS_vec0) + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + thr_tmem_load_vec = tiled_tmem_load_vec.get_slice(thread_idx) + + tTMEM_LOAD_VECtS0 = thr_tmem_load_vec.partition_S(tStS_vec0) + tTMEM_LOAD_VECtS1 = thr_tmem_load_vec.partition_S(tStS_vec1) + tTMEM_LOAD_VECcS = thr_tmem_load_vec.partition_D(tScS_vec) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + + # Ignore first signal from softmax as no correction is required + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + + seqlen_kv_loop_steps = ( + fused_mask.get_trip_count(curr_block_coord, self.cta_tiler) - 1 + ) + for i in cutlass.range_dynamic(0, seqlen_kv_loop_steps, 1, unroll=1): + # wait for S0 + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + tTMEM_LOAD_VECrS = cute.make_fragment( + tTMEM_LOAD_VECcS.shape, self.qk_acc_dtype + ) + # read row_wise new global max + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS0, tTMEM_LOAD_VECrS) + + scale_ = scale_softmax_log2 * ( + tTMEM_LOAD_VECrS[0] - tTMEM_LOAD_VECrS[1] + ) + scale = cute.arch.exp2(scale_) + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + self.correction_rescale(pv_thr_mma, tOtO0, scale) + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + cute.arch.fence_view_async_tmem_store() + + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + + 
cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS1, tTMEM_LOAD_VECrS) + + scale_ = scale_softmax_log2 * ( + tTMEM_LOAD_VECrS[0] - tTMEM_LOAD_VECrS[1] + ) + scale = cute.arch.exp2(scale_) + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + self.correction_rescale(pv_thr_mma, tOtO1, scale) + + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + cute.arch.fence_view_async_tmem_store() + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + # End of seqlen_corr_loop_steps + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + s0_corr_pipeline.consumer_wait(s0_corr_consumer_state) + + tTMEM_LOAD_VECrS = cute.make_fragment( + tTMEM_LOAD_VECcS.shape, self.qk_acc_dtype + ) + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS0, tTMEM_LOAD_VECrS) + cute.arch.fence_view_async_tmem_load() + + s0_corr_pipeline.consumer_release(s0_corr_consumer_state) + s0_corr_consumer_state.advance() + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + corr_epi_pipeline.producer_acquire(corr_epi_producer_state) + + self.correction_epilog( + pv_thr_mma, + tOtO0, + scale_output / tTMEM_LOAD_VECrS[0], + sO[None, None, 0], + ) + + mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + corr_epi_pipeline.producer_commit(corr_epi_producer_state) + corr_epi_producer_state.advance() + + s1_corr_pipeline.consumer_wait(s1_corr_consumer_state) + # load from V1 + cute.copy(tiled_tmem_load_vec, tTMEM_LOAD_VECtS1, tTMEM_LOAD_VECrS) + cute.arch.fence_view_async_tmem_load() + + s1_corr_pipeline.consumer_release(s1_corr_consumer_state) + s1_corr_consumer_state.advance() + + mma_corr_pipeline.consumer_wait(o_corr_consumer_state) + + corr_epi_pipeline.producer_acquire(corr_epi_producer_state) + self.correction_epilog( + pv_thr_mma, + tOtO1, + scale_output / tTMEM_LOAD_VECrS[0], + sO[None, None, 1], + ) + 
mma_corr_pipeline.consumer_release(o_corr_consumer_state) + o_corr_consumer_state.advance() + + corr_epi_pipeline.producer_commit(corr_epi_producer_state) + corr_epi_producer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + cute.arch.mbarrier_arrive(tmem_dealloc_mbar_ptr) + + return + + @cute.jit + def softmax_step( + self, + stage: int, + need_apply_mask: bool, + row_max: cutlass.Float32, + row_sum: cutlass.Float32, + mma_si_consumer_state: utils.PipelineState, + si_corr_producer_state: utils.PipelineState, + s0_s1_sequence_state: utils.PipelineState, + mma_si_pipeline: utils.PipelineAsync, + si_corr_pipeline: utils.PipelineAsync, + s0_s1_sequence_pipeline: utils.PipelineAsync, + scale_softmax_log2: cutlass.Float32, + cS: cute.Tensor, + qk_thr_mma: cute.core.ThrMma, + tiled_tmem_load: cute.TiledCopy, + tiled_tmem_store: cute.TiledCopy, + tiled_tmem_store_vec: cute.TiledCopy, + thr_tmem_load: cute.CopyAtom, + thr_tmem_store: cute.CopyAtom, + thr_tmem_store_vec: cute.CopyAtom, + tTMEM_LOADtS: cute.Tensor, + tTMEM_STORE_VECtS: cute.Tensor, + tTMEM_STOREtS_x4: cute.Tensor, + fused_mask: cute.Tensor, + ) -> Tuple[ + cutlass.Float32, + cutlass.Float32, + utils.PipelineState, + utils.PipelineState, + utils.PipelineState, + ]: + """Perform a single step of the softmax computation on a block of attention scores. + + This method processes one block of the attention matrix, computing numerically stable + softmax by first finding the row maximum, subtracting it from all elements, applying + exponential function, and then normalizing by the sum of exponentials. It also handles + optional masking of attention scores. + + The method involves several key operations: + 1. Loading attention scores from tensor memory + 2. Applying optional masking based on position + 3. Computing row-wise maximum values for numerical stability + 4. 
Transforming scores using exp2(x*scale - max*scale) + 5. Computing row sums for normalization + 6. Coordinating pipeline synchronization between different processing stages + + :param stage: Processing stage (0 for first half, 1 for second half) + :type stage: int + :param need_apply_mask: Whether to apply attention masking + :type need_apply_mask: bool + :param row_max: Current maximum value for the row + :type row_max: cute.core.Tensor + :param row_sum: Current sum value for the row + :type row_sum: cute.core.Tensor + :param mma_si_consumer_state: Pipeline state for MMA consumer operations + :type mma_si_consumer_state: utils.PipelineState + :param si_corr_producer_state: Pipeline state for correction producer operations + :type si_corr_producer_state: utils.PipelineState + :param s0_s1_sequence_state: Pipeline state for sequence synchronization + :type s0_s1_sequence_state: utils.PipelineState + :param mma_si_pipeline: Pipeline for MMA operations + :type mma_si_pipeline: utils.PipelineAsync + :param si_corr_pipeline: Pipeline for correction operations + :type si_corr_pipeline: utils.PipelineAsync + :param s0_s1_sequence_pipeline: Pipeline for sequence synchronization + :type s0_s1_sequence_pipeline: utils.PipelineAsync + :param scale_softmax_log2: Log2 scale factor for softmax computation + :type scale_softmax_log2: cutlass.Float32 + :param cS: Current slice of attention matrix + :type cS: cute.Tensor + :param qk_thr_mma: Thread MMA operation + :type qk_thr_mma: cute.core.ThrMma + :param tiled_tmem_load: Tiled copy operation for loading from tensor memory + :type tiled_tmem_load: cute.TiledCopy + :param tiled_tmem_store: Tiled copy operation for storing to tensor memory + :type tiled_tmem_store: cute.TiledCopy + :param tiled_tmem_store_vec: Tiled copy operation for storing vector data + :type tiled_tmem_store_vec: cute.TiledCopy + :param thr_tmem_load: Thread copy operation for loading + :type thr_tmem_load: cute.CopyAtom + :param thr_tmem_store: Thread copy 
operation for storing + :type thr_tmem_store: cute.CopyAtom + :param thr_tmem_store_vec: Thread copy operation for storing vector data + :type thr_tmem_store_vec: cute.CopyAtom + :param tTMEM_LOADtS: Tensor for loading from tensor memory + :type tTMEM_LOADtS: cute.Tensor + :param tTMEM_STORE_VECtS: Tensor for storing vector data + :type tTMEM_STORE_VECtS: cute.Tensor + :param tTMEM_STOREtS_x4: Tensor for storing processed data + :type tTMEM_STOREtS_x4: cute.Tensor + :param fused_mask: Mask configuration for attention masking + :type fused_mask: cute.Tensor + :return: Updated state values (row_max, row_sum, and pipeline states) + :rtype: tuple + """ + tilePlikeFP32 = ( + self.qk_mma_tiler[1] // cutlass.Float32.width * self.o_dtype.width + ) + tScS = qk_thr_mma.partition_C(cS) + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tScS_P_layout = cute.composition( + tScS.layout, cute.make_layout((128, tilePlikeFP32)) + ) + tScS_P = cute.make_tensor(tScS.iterator, tScS_P_layout) + tTMEM_LOADcS = thr_tmem_load.partition_D(tScS) + tTMEM_STORE_VECcS = thr_tmem_store_vec.partition_S(tScS_vec) + tTMEM_STOREcS = thr_tmem_store.partition_S(tScS_P) + + # Wait for Si + mma_si_pipeline.consumer_wait(mma_si_consumer_state) + tTMEM_LOADrS = cute.make_fragment(tTMEM_LOADcS.shape, self.qk_acc_dtype) + cute.copy(tiled_tmem_load, tTMEM_LOADtS, tTMEM_LOADrS) + + if need_apply_mask: + fused_mask.apply_mask(tTMEM_LOADrS, tTMEM_LOADcS) + + old_row_max = row_max + row_max = tTMEM_LOADrS.load().reduce(cute.ReductionOp.MAX, row_max, 0) + row_max_safe = row_max + if row_max == -cutlass.Float32.inf: + row_max_safe = 0.0 + + tTMEM_STORE_VECrS = cute.make_fragment( + tTMEM_STORE_VECcS.shape, self.qk_acc_dtype + ) + tTMEM_STORE_VECrS[0] = old_row_max + tTMEM_STORE_VECrS[1] = row_max_safe + cute.copy(tiled_tmem_store_vec, tTMEM_STORE_VECrS, tTMEM_STORE_VECtS) + cute.arch.fence_view_async_tmem_store() + # 
Notify correction wg that row_max is ready + si_corr_pipeline.producer_commit(si_corr_producer_state) + si_corr_producer_state.advance() + + tTMEM_STORErS_x4 = cute.make_fragment(tTMEM_STOREcS.shape, self.qk_acc_dtype) + tTMEM_STORErS_x4_e = cute.make_tensor( + cute.recast_ptr(tTMEM_STORErS_x4.iterator, dtype=self.q_dtype), + tTMEM_LOADrS.layout, + ) + + scale = scale_softmax_log2 + minus_row_max_scale = (0.0 - row_max_safe) * scale + + # Sequence barrier wait + if stage == 0: + s0_s1_sequence_pipeline.producer_acquire(s0_s1_sequence_state) + else: + s0_s1_sequence_pipeline.consumer_wait(s0_s1_sequence_state) + + frg_cnt = 4 + frg_tile = cute.size(tTMEM_LOADrS) // frg_cnt + tTMEM_LOADrS_frg = cute.logical_divide(tTMEM_LOADrS, cute.make_layout(frg_tile)) + tTMEM_STORErS_x4_e_frg = cute.logical_divide( + tTMEM_STORErS_x4_e, cute.make_layout(frg_tile) + ) + for j in range(frg_cnt): + for k in range(0, cute.size(tTMEM_LOADrS_frg, mode=[0]), 2): + tTMEM_LOADrS_frg[k, j], tTMEM_LOADrS_frg[k + 1, j] = ( + cute.arch.fma_packed_f32x2( + (tTMEM_LOADrS_frg[k, j], tTMEM_LOADrS_frg[k + 1, j]), + (scale, scale), + (minus_row_max_scale, minus_row_max_scale), + ) + ) + tTMEM_LOADrS_frg[k, j] = cute.arch.exp2(tTMEM_LOADrS_frg[k, j]) + tTMEM_LOADrS_frg[k + 1, j] = cute.arch.exp2(tTMEM_LOADrS_frg[k + 1, j]) + s_vec = tTMEM_LOADrS_frg[None, j].load() + tTMEM_STORErS_x4_e_frg[None, j].store(s_vec.to(self.q_dtype)) + + # Sequence barrier arrive + if stage == 0: + s0_s1_sequence_pipeline.producer_commit(s0_s1_sequence_state) + else: + s0_s1_sequence_pipeline.consumer_release(s0_s1_sequence_state) + s0_s1_sequence_state.advance() + + cute.copy(tiled_tmem_store, tTMEM_STORErS_x4, tTMEM_STOREtS_x4) + cute.arch.fence_view_async_tmem_store() + + # Notify tensor core warp that P is ready + mma_si_pipeline.consumer_release(mma_si_consumer_state) + mma_si_consumer_state.advance() + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + + acc_scale_ = scale * (old_row_max - row_max_safe) + 
acc_scale = cute.arch.exp2(acc_scale_) * 0.5 + row_sum *= acc_scale + local_row_sum_0 = (row_sum, row_sum) + local_row_sum_1 = (0.0, 0.0) + local_row_sum_2 = (0.0, 0.0) + local_row_sum_3 = (0.0, 0.0) + + reduction_unroll = 4 + frg_tile = cute.size(tTMEM_LOADrS) // reduction_unroll + tTMEM_LOADrS_frg = cute.logical_divide(tTMEM_LOADrS, cute.make_layout(frg_tile)) + + for j in range(0, cute.size(tTMEM_LOADrS_frg, mode=[0]), 2): + local_row_sum_0 = cute.arch.add_packed_f32x2( + local_row_sum_0, (tTMEM_LOADrS_frg[j, 0], tTMEM_LOADrS_frg[j + 1, 0]) + ) + local_row_sum_1 = cute.arch.add_packed_f32x2( + local_row_sum_1, (tTMEM_LOADrS_frg[j, 1], tTMEM_LOADrS_frg[j + 1, 1]) + ) + local_row_sum_2 = cute.arch.add_packed_f32x2( + local_row_sum_2, (tTMEM_LOADrS_frg[j, 2], tTMEM_LOADrS_frg[j + 1, 2]) + ) + local_row_sum_3 = cute.arch.add_packed_f32x2( + local_row_sum_3, (tTMEM_LOADrS_frg[j, 3], tTMEM_LOADrS_frg[j + 1, 3]) + ) + + local_row_sum_0 = cute.arch.add_packed_f32x2(local_row_sum_0, local_row_sum_1) + local_row_sum_2 = cute.arch.add_packed_f32x2(local_row_sum_2, local_row_sum_3) + local_row_sum_0 = cute.arch.add_packed_f32x2(local_row_sum_0, local_row_sum_2) + row_sum = local_row_sum_0[0] + local_row_sum_0[1] + + return ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) + + # for both softmax0 and softmax1 warp group + @cute.jit + def softmax( + self, + stage: int, + scale_softmax_log2: cutlass.Float32, + qk_thr_mma: cute.core.ThrMma, + tStS: cute.Tensor, + tStSi: cute.Tensor, + mma_si_pipeline: utils.PipelineAsync, + si_corr_pipeline: utils.PipelineAsync, + s0_s1_sequence_pipeline: utils.PipelineAsync, + tile_sched_params: FmhaStaticTileSchedulerParams, + fused_mask: FusedMask, + ): + """Compute softmax on attention scores from QK matrix multiplication. + + This method handles the softmax computation for either the first or second half of the + attention matrix, depending on the 'stage' parameter. 
It calculates row-wise maximum + and sum values needed for stable softmax computation, applies optional masking, and + transforms raw attention scores into probability distributions. + + The implementation uses specialized memory access patterns and efficient math operations + for computing exp(x) using exp2 functions. It also coordinates pipeline + synchronization between MMA, correction, and sequence processing stages. + + :param stage: Processing stage (0 for first half, 1 for second half of attention matrix) + :type stage: int + :param scale_softmax_log2: Log2 scale factor for softmax operation + :type scale_softmax_log2: cutlass.Float32 + :param qk_thr_mma: Thread MMA operation for QK matrix multiplication + :type qk_thr_mma: cute.core.ThrMma + :param tStS: Shared tensor for softmax input/output + :type tStS: cute.Tensor + :param tStSi: Input tensor containing attention scores + :type tStSi: cute.Tensor + :param mma_si_pipeline: Pipeline for synchronizing with MMA operations + :type mma_si_pipeline: utils.PipelineAsync + :param si_corr_pipeline: Pipeline for synchronizing with correction operations + :type si_corr_pipeline: utils.PipelineAsync + :param s0_s1_sequence_pipeline: Pipeline for synchronizing between stage 0 and 1 + :type s0_s1_sequence_pipeline: utils.PipelineAsync + :param tile_sched_params: Parameters for tile scheduling + :type tile_sched_params: FmhaStaticTileSchedulerParams + :param fused_mask: Mask configuration for attention masking + :type fused_mask: FusedMask + """ + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % ( + self.threads_per_warp + * ( + len(self.softmax0_warp_ids) + if stage == 0 + else len(self.softmax1_warp_ids) + ) + ) + + cS_base = cute.make_identity_tensor( + (self.qk_mma_tiler[0], self.qk_mma_tiler[1]) + ) + + tilePlikeFP32 = self.qk_mma_tiler[1] // 32 * self.o_dtype.width + + tScS = qk_thr_mma.partition_C(cS_base) + + tStS_vec_layout = cute.composition(tStS.layout, cute.make_layout((128, 2))) + tmem_vec_offset 
= self.tmem_vec0_offset if stage == 0 else self.tmem_vec1_offset + tStS_vec = cute.make_tensor(tStS.iterator + tmem_vec_offset, tStS_vec_layout) + + tScS_vec_layout = cute.composition(tScS.layout, cute.make_layout((128, 2))) + tScS_vec = cute.make_tensor(tScS.iterator, tScS_vec_layout) + + tStS_P_layout = cute.composition( + tStS.layout, cute.make_layout((128, tilePlikeFP32)) + ) + tmem_p_offset = self.tmem_p0_offset if stage == 0 else self.tmem_p1_offset + tStS_P = cute.make_tensor(tStS.iterator + tmem_p_offset, tStS_P_layout) + + tmem_load_atom = cute.make_copy_atom( + tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(32)), + self.qk_acc_dtype, + ) + + tiled_tmem_load = tcgen05.make_tmem_copy(tmem_load_atom, tStSi) + thread_idx = tidx % ( + self.threads_per_warp + * ( + len(self.softmax0_warp_ids) + if stage == 0 + else len(self.softmax1_warp_ids) + ) + ) + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + tTMEM_LOADtS = thr_tmem_load.partition_S(tStSi) + + tmem_store_vec_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(2)), + self.qk_acc_dtype, + ) + tiled_tmem_store_vec = tcgen05.make_tmem_copy(tmem_store_vec_atom, tStS_vec) + thr_tmem_store_vec = tiled_tmem_store_vec.get_slice(thread_idx) + + tTMEM_STORE_VECtS = thr_tmem_store_vec.partition_D(tStS_vec) + tTMEM_STORE_VECcS = thr_tmem_store_vec.partition_S(tScS_vec) + tmem_store_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(32)), + self.qk_acc_dtype, + ) + tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tStS_P) + thr_tmem_store = tiled_tmem_store.get_slice(thread_idx) + tTMEM_STOREtS_x4 = thr_tmem_store.partition_D(tStS_P) + + mma_si_consumer_state = utils.make_pipeline_state( + utils.PipelineUserType.Consumer, self.mma_softmax_stage + ) + si_corr_producer_state = utils.make_pipeline_state( + utils.PipelineUserType.Producer, self.softmax_corr_stage + ) + s0_s1_sequence_state = utils.make_pipeline_state( + ( + 
utils.PipelineUserType.Producer + if stage == 0 + else utils.PipelineUserType.Consumer + ), + 1, + ) + + tile_sched = create_fmha_static_tile_scheduler( + tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim() + ) + work_tile = tile_sched.initial_work_tile_info() + + while work_tile.is_valid_tile: + curr_block_coord = work_tile.tile_idx + logical_offset = ( + curr_block_coord[0] * self.cta_tiler[0] + stage * self.qk_mma_tiler[0], + 0, + ) + + cS = cute.domain_offset(logical_offset, cS_base) + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + unmask_count = fused_mask.get_unmasked_trip_count( + curr_block_coord, + self.cta_tiler, + ) + + row_max = -cutlass.Float32.inf + row_sum = 0.0 + + for i in cutlass.range_dynamic(0, unmask_count, 1, unroll=1): + cS_iter = cute.domain_offset((0, i * self.qk_mma_tiler[1]), cS) + ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) = self.softmax_step( + stage, + False, + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + mma_si_pipeline, + si_corr_pipeline, + s0_s1_sequence_pipeline, + scale_softmax_log2, + cS_iter, + qk_thr_mma, + tiled_tmem_load, + tiled_tmem_store, + tiled_tmem_store_vec, + thr_tmem_load, + thr_tmem_store, + thr_tmem_store_vec, + tTMEM_LOADtS, + tTMEM_STORE_VECtS, + tTMEM_STOREtS_x4, + fused_mask, + ) + + mask_count = fused_mask.get_masked_trip_count( + curr_block_coord, + self.cta_tiler, + ) + + for i in cutlass.range_dynamic( + unmask_count, unmask_count + mask_count, 1, unroll=1 + ): + cS_iter = cute.domain_offset((0, i * self.qk_mma_tiler[1]), cS) + ( + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + ) = self.softmax_step( + stage, + True, + row_max, + row_sum, + mma_si_consumer_state, + si_corr_producer_state, + s0_s1_sequence_state, + mma_si_pipeline, + si_corr_pipeline, + s0_s1_sequence_pipeline, + scale_softmax_log2, + cS_iter, + 
qk_thr_mma, + tiled_tmem_load, + tiled_tmem_store, + tiled_tmem_store_vec, + thr_tmem_load, + thr_tmem_store, + thr_tmem_store_vec, + tTMEM_LOADtS, + tTMEM_STORE_VECtS, + tTMEM_STOREtS_x4, + fused_mask, + ) + + mma_si_pipeline.consumer_wait(mma_si_consumer_state) + + tTMEM_STORE_VECrS = cute.make_fragment( + tTMEM_STORE_VECcS.shape, self.qk_acc_dtype + ) + tTMEM_STORE_VECrS[0] = row_sum + tTMEM_STORE_VECrS[1] = row_max + cute.copy(tiled_tmem_store_vec, tTMEM_STORE_VECrS, tTMEM_STORE_VECtS) + cute.arch.fence_view_async_tmem_store() + + si_corr_pipeline.producer_commit(si_corr_producer_state) + si_corr_producer_state.advance() + + si_corr_pipeline.producer_acquire(si_corr_producer_state) + + # Empty step to sync against pipe s + mma_si_pipeline.consumer_release(mma_si_consumer_state) + mma_si_consumer_state.advance() + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + # End of persistent scheduler loop + + @cute.jit + def correction_rescale( + self, + thr_mma: cute.core.ThrMma, + tOtO: cute.Tensor, + scale: cutlass.Float32, + ): + """Rescale intermediate attention results based on softmax normalization factor. + + This method performs a crucial correction step in the attention computation pipeline. + When processing attention in blocks, the softmax normalization factors may change + as new blocks are processed. This method rescales previously computed partial + output values to account for updated normalization factors. + + The implementation uses efficient tensor memory operations to: + 1. Load existing partial attention output from tensor memory + 2. Apply the scaling factor to all elements + 3. 
Store the rescaled results back to tensor memory + + :param thr_mma: Thread MMA operation for the computation + :type thr_mma: cute.core.ThrMma + :param tOtO: Tensor representing partial attention output to be rescaled + :type tOtO: cute.Tensor + :param scale: Scaling factor to apply to the partial results + :type scale: cutlass.Float32 + """ + pv_tiled_mma_shape = ( + self.pv_mma_tiler[0], + self.pv_mma_tiler[1], + ) + cO = cute.make_identity_tensor(pv_tiled_mma_shape) + tOcO = thr_mma.partition_C(cO) + + corr_tile_size = 16 # tuneable parameter + tmem_load_atom = cute.make_copy_atom( + tcgen05.copy.Ld32x32bOp(tcgen05.copy.Repetition(corr_tile_size)), + self.pv_acc_dtype, + ) + tmem_store_atom = cute.make_copy_atom( + tcgen05.copy.St32x32bOp(tcgen05.copy.Repetition(corr_tile_size)), + self.pv_acc_dtype, + ) + + tOtO_i_layout = cute.composition( + tOtO.layout, cute.make_layout((128, corr_tile_size)) + ) + tOcO_i_layout = cute.composition( + tOcO.layout, cute.make_layout((128, corr_tile_size)) + ) + + tOtO_i = cute.make_tensor(tOtO.iterator, tOtO_i_layout) + tOcO_i = cute.make_tensor(tOcO.iterator, tOcO_i_layout) + + tiled_tmem_load = tcgen05.make_tmem_copy(tmem_load_atom, tOtO_i) + tiled_tmem_store = tcgen05.make_tmem_copy(tmem_store_atom, tOtO_i) + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + thr_tmem_store = tiled_tmem_store.get_slice(thread_idx) + + tTMEM_LOADtO = thr_tmem_load.partition_S(tOtO_i) + tTMEM_LOADcO = thr_tmem_load.partition_D(tOcO_i) + + tTMEM_STOREtO = thr_tmem_store.partition_D(tOtO_i) + + tTMrO = cute.make_fragment( + (tTMEM_LOADcO.shape, 128 // corr_tile_size), self.pv_acc_dtype + ) + for i in range(self.cta_tiler[2] // corr_tile_size): + tTMrO_i_ = tTMrO[None, i] + tTMrO_i_layout = cute.composition( + tTMrO_i_.layout, cute.make_layout(tTMrO.shape[0]) + ) + tTMrO_i = cute.make_tensor(tTMrO_i_.iterator, tTMrO_i_layout) + 
tTMEM_LOADtO_i = cute.make_tensor( + tTMEM_LOADtO.iterator + i * corr_tile_size, tTMEM_LOADtO.layout + ) + tTMEM_STOREtO_i = cute.make_tensor( + tTMEM_STOREtO.iterator + i * corr_tile_size, tTMEM_STOREtO.layout + ) + + cute.copy(tiled_tmem_load, tTMEM_LOADtO_i, tTMrO_i) + for j in range(0, cute.size(tTMrO_i), 2): + tTMrO_i[j], tTMrO_i[j + 1] = cute.arch.mul_packed_f32x2( + (tTMrO_i[j], tTMrO_i[j + 1]), + (scale, scale), + ) + cute.copy(tiled_tmem_store, tTMrO_i, tTMEM_STOREtO_i) + + @cute.jit + def correction_epilog( + self, + thr_mma: cute.core.ThrMma, + tOtO: cute.Tensor, + scale: cutlass.Float32, + sO: cute.Tensor, + ): + """Apply final scaling and transformation to attention output before writing to global memory. + + This correction_epilog function handles the final processing step for attention output values. + It applies a scaling factor to the accumulated attention results and prepares the + data for efficient transfer back to global memory. + + The method performs: + 1. Loading of accumulated attention results from tensor memory + 2. Application of the final output scaling factor + 3. Type conversion if necessary (typically from higher precision accumulator to output precision) + 4. Reorganization of data for optimal memory access patterns + 5. 
Preparation for efficient TMA store operations + + :param thr_mma: Thread MMA operation for the computation + :type thr_mma: cute.core.ThrMma + :param tOtO: Tensor containing accumulated attention output + :type tOtO: cute.Tensor + :param scale: Final scaling factor to apply to the output + :type scale: cutlass.Float32 + :param sO: Shared memory tensor for the final output + :type sO: cute.Tensor + """ + + pv_tiled_mma_shape = ( + self.pv_mma_tiler[0], + self.pv_mma_tiler[1], + ) + cO = cute.make_identity_tensor(pv_tiled_mma_shape) + + corr_tile_size = 32 * 8 // self.o_dtype.width + tOsO = thr_mma.partition_C(sO) + tOcO = thr_mma.partition_C(cO) + + tOtO_i = cute.logical_divide(tOtO, cute.make_layout((128, corr_tile_size))) + tOcO_i = cute.logical_divide(tOcO, cute.make_layout((128, corr_tile_size))) + tOsO_i = cute.logical_divide(tOsO, cute.make_layout((128, corr_tile_size))) + tidx, _, _ = cute.arch.thread_idx() + thread_idx = tidx % (self.threads_per_warp * len(self.correction_warp_ids)) + + epi_subtile = (self.epi_tile[0], corr_tile_size) + tmem_copy_atom = sm100_utils.get_tmem_load_op( + self.pv_mma_tiler, + self.o_layout, + self.o_dtype, + self.pv_acc_dtype, + epi_subtile, + use_2cta_instrs=False, + ) + + tiled_tmem_load = tcgen05.make_tmem_copy( + tmem_copy_atom, tOtO_i[(None, None), 0] + ) + + thr_tmem_load = tiled_tmem_load.get_slice(thread_idx) + smem_copy_atom = sm100_utils.get_smem_store_op( + self.o_layout, self.o_dtype, self.pv_acc_dtype, tiled_tmem_load + ) + tiled_smem_store = cute.make_tiled_copy( + smem_copy_atom, + layout_tv=tiled_tmem_load.layout_dst_tv_tiled, + tiler_mn=tiled_tmem_load.tiler_mn, + ) + + tTMEM_LOADtO = thr_tmem_load.partition_S(tOtO_i[(None, None), None]) + tTMEM_LOADsO = thr_tmem_load.partition_D(tOsO_i[(None, None), None]) + tTMEM_LOADoO = thr_tmem_load.partition_D(tOcO_i[(None, None), None]) + + for i in range(self.cta_tiler[2] // corr_tile_size): + tTMEM_LOADtO_i = tTMEM_LOADtO[None, 0, 0, i] + tTMEM_LOADsO_i = 
tTMEM_LOADsO[None, 0, 0, i] + tTMrO = cute.make_fragment( + tTMEM_LOADoO[None, 0, 0, i].shape, self.pv_acc_dtype + ) + cute.copy(tiled_tmem_load, tTMEM_LOADtO_i, tTMrO) + for j in range(0, cute.size(tTMrO), 2): + tTMrO[j], tTMrO[j + 1] = cute.arch.mul_packed_f32x2( + (tTMrO[j], tTMrO[j + 1]), + (scale, scale), + ) + tSMrO = cute.make_fragment(tTMrO.shape, self.o_dtype) + o_vec = tTMrO.load() + tSMrO.store(o_vec.to(self.o_dtype)) + cute.copy(tiled_smem_store, tSMrO, tTMEM_LOADsO_i) + + # fence view async shared + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + + def make_and_init_load_q_pipeline(self, load_q_mbar_ptr): + load_q_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.load_warp_id]) + ) + load_q_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + return utils.PipelineTmaUmma.create( + barrier_storage=load_q_mbar_ptr, + num_stages=self.q_stage, + producer_group=load_q_producer_group, + consumer_group=load_q_consumer_group, + tx_count=self.tma_copy_q_bytes, + ) + + def make_and_init_load_kv_pipeline(self, load_kv_mbar_ptr): + load_kv_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.load_warp_id]) + ) + load_kv_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + return utils.PipelineTmaUmma.create( + barrier_storage=load_kv_mbar_ptr, + num_stages=self.kv_stage, + producer_group=load_kv_producer_group, + consumer_group=load_kv_consumer_group, + tx_count=self.tma_copy_kv_bytes, + ) + + def make_and_init_mma_si_pipeline(self, mma_si_mbar_ptr): + mma_si_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + mma_si_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + return utils.PipelineUmmaAsync.create( + 
barrier_storage=mma_si_mbar_ptr, + num_stages=self.mma_softmax_stage, + producer_group=mma_si_producer_group, + consumer_group=mma_si_consumer_group, + ) + + def make_and_init_si_corr_pipeline(self, si_corr_mbar_ptr): + si_corr_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + si_corr_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + return utils.PipelineAsync.create( + barrier_storage=si_corr_mbar_ptr, + num_stages=self.softmax_corr_stage, + producer_group=si_corr_producer_group, + consumer_group=si_corr_consumer_group, + ) + + def make_and_init_corr_epi_pipeline(self, corr_epi_mbar_ptr): + corr_epi_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + corr_epi_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len([self.epilogue_warp_id]), + self.threads_per_warp * len([self.epilogue_warp_id]), + ) + return utils.PipelineAsync.create( + barrier_storage=corr_epi_mbar_ptr, + num_stages=self.epi_stage, + producer_group=corr_epi_producer_group, + consumer_group=corr_epi_consumer_group, + ) + + def make_and_init_mma_corr_pipeline(self, mma_corr_mbar_ptr): + mma_corr_producer_group = utils.CooperativeGroup( + utils.Agent.Thread, len([self.mma_warp_id]) + ) + mma_corr_consumer_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.correction_warp_ids), + self.threads_per_warp * len(self.correction_warp_ids), + ) + return utils.PipelineUmmaAsync.create( + barrier_storage=mma_corr_mbar_ptr, + num_stages=self.mma_corr_stage, + producer_group=mma_corr_producer_group, + consumer_group=mma_corr_consumer_group, + ) + + def 
make_and_init_si_sequence_pipeline(self, si_sequence_mbar_ptr): + s0_sequence_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax0_warp_ids), + self.threads_per_warp * len(self.softmax0_warp_ids), + ) + s1_sequence_group = utils.CooperativeGroup( + utils.Agent.Thread, + self.threads_per_warp * len(self.softmax1_warp_ids), + self.threads_per_warp * len(self.softmax1_warp_ids), + ) + return utils.PipelineAsync.create( + barrier_storage=si_sequence_mbar_ptr, + num_stages=1, + producer_group=s0_sequence_group, + consumer_group=s1_sequence_group, + ) + + @staticmethod + def _compute_grid( + o: cute.Tensor, + cta_tiler: Tuple[int, int, int], + is_persistent: bool, + ) -> Tuple[FmhaStaticTileSchedulerParams, Tuple[int, int, int]]: + o_shape = o.shape + tile_sched_params = create_fmha_static_tile_scheduler_params( + is_persistent, + ( + cute.ceil_div(cute.size(o_shape[0]), cta_tiler[0]), + cute.size(o_shape[2][0]), + cute.size(o_shape[2][1]), + ), + ) + grid = FmhaStaticTileScheduler.get_grid_shape(tile_sched_params) + + return tile_sched_params, grid + + +def run_fmha_and_verify( + q_shape: Tuple[int, int, int, int], + k_shape: Tuple[int, int, int, int], + in_dtype: Type[cutlass.Numeric], + out_dtype: Type[cutlass.Numeric], + qk_acc_dtype: Type[cutlass.Numeric], + pv_acc_dtype: Type[cutlass.Numeric], + mma_tiler_mn: Tuple[int, int], + is_persistent: bool, + has_casual_mask: bool, + scale_q: float, + scale_k: float, + scale_v: float, + inv_scale_o: float, + scale_softmax: float, + tolerance: float, + warmup_iterations: int, + iterations: int, + skip_ref_check: bool, +): + """Execute Fused Multi-Head Attention (FMHA) on Blackwell architecture and validate results. + + This function creates random input tensors for query, key, and value, then performs the + complete FMHA computation pipeline. It supports configurable data types, tiling parameters, + and various attention masking options. 
Results can be validated against a PyTorch reference + implementation or run multiple times for performance measurement. + + The implementation leverages specialized tensor memory operations and efficient math + operations optimized for Blackwell architecture, including pipelined computation stages + for maximum throughput. + + :param q_shape: Query tensor shape (B, S_q, H, D) where B=batch size, S_q=query sequence length, + H=number of heads, D=head dimension + :type q_shape: Tuple[int, int, int, int] + :param k_shape: Key tensor shape (B, S_k, H_k, D) where B=batch size, S_k=key sequence length, + H_k=number of key heads (H must be divisible by H_k), D=head dimension + :type k_shape: Tuple[int, int, int, int] + :param in_dtype: Input data type for query, key and value tensors + :type in_dtype: Type[cutlass.Numeric] + :param out_dtype: Output data type for attention output + :type out_dtype: Type[cutlass.Numeric] + :param qk_acc_dtype: Accumulator data type for query-key matrix multiplication + :type qk_acc_dtype: Type[cutlass.Numeric] + :param pv_acc_dtype: Accumulator data type for probability-value matrix multiplication + :type pv_acc_dtype: Type[cutlass.Numeric] + :param mma_tiler_mn: Matrix multiply accumulate tile shape (M, N) + :type mma_tiler_mn: Tuple[int, int] + :param is_persistent: Whether to use persistent kernel optimization + :type is_persistent: bool + :param has_casual_mask: Whether to apply causal masking + :type has_casual_mask: bool + :param scale_q: Scaling factor for query tensor + :type scale_q: float + :param scale_k: Scaling factor for key tensor + :type scale_k: float + :param scale_v: Scaling factor for value tensor + :type scale_v: float + :param inv_scale_o: Inverse scaling factor for output tensor + :type inv_scale_o: float + :param scale_softmax: Attention score scaling factor (defaults to 1/sqrt(D) if set to 0) + :type scale_softmax: float + :param tolerance: Maximum acceptable error for validation + :type tolerance: float + :param 
warmup_iterations: Number of warmup iterations + :type warmup_iterations: int + :param iterations: Number of iterations to run for performance testing + :type iterations: int + :param skip_ref_check: Skip validation against reference implementation + :type skip_ref_check: bool + + :raises ValueError: If input shapes are incompatible or head dimension is unsupported + :raises RuntimeError: If GPU is unavailable for computation + """ + + print(f"Running Blackwell SM100 FMHA test with:") + print(f" q_shape: {q_shape}") + print(f" k_shape: {k_shape}") + print(f" in_dtype: {in_dtype}") + print(f" out_dtype: {out_dtype}") + print(f" qk_acc_dtype: {qk_acc_dtype}") + print(f" pv_acc_dtype: {pv_acc_dtype}") + print(f" mma_tiler_mn: {mma_tiler_mn}") + print(f" is_persistent: {is_persistent}") + print(f" has_casual_mask: {has_casual_mask}") + print(f" scale_q: {scale_q}") + print(f" scale_k: {scale_k}") + print(f" scale_v: {scale_v}") + print(f" inv_scale_o: {inv_scale_o}") + print(f" scale_softmax: {scale_softmax}") + print(f" tolerance: {tolerance}") + + # Unpack parameters + b, s_q, h, d = q_shape + b_, s_k, h_k, d_ = k_shape + + if b != b_: + raise ValueError("q & k must have the same batch size") + + if d != d_: + raise ValueError("q & k must have the same head dimension") + + if d not in {32, 64, 128}: + raise ValueError("head dimension must be 32, 64, or 128") + + if h % h_k != 0: + raise ValueError("h must be divisible by h_k") + + if in_dtype not in {cutlass.Float8E4M3FN, cutlass.Float16}: + raise ValueError("in_dtype must be Float8E4M3FN or Float16") + + if out_dtype not in {cutlass.Float8E4M3FN, cutlass.Float16}: + raise ValueError("out_dtype must be Float8E4M3FN or Float16") + + if qk_acc_dtype not in {cutlass.Float32}: + raise ValueError("qk_acc_dtype must be Float32") + + if pv_acc_dtype not in {cutlass.Float32}: + raise ValueError("pv_acc_dtype must be Float32") + + if iterations < 1: + raise ValueError("iterations must be at least 1") + + h_r = h // h_k + + # 
Prepare pytorch tensors: Q, K, V (random from 0 to 2) and O (all zero) + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(1111) + + def create_and_permute_tensor(b, s, h_r, h_k, d, dtype, is_dynamic_layout=True): + # (b, s, h_r, h_k, d) -> (s, d, h_r, h_k, b) + shape = (b, s, h_r, h_k, d) + permute_order = (1, 4, 2, 3, 0) + is_fp8 = dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} + + # torch does not support fp8 type + torch_dtype = cutlass.torch.dtype(dtype) if not is_fp8 else torch.uint8 + + # Create dtype torch tensor (cpu) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass.torch.TensorInitType.RANDOM, + init_config=cutlass.torch.RandomInitConfig( + min_val=0 if is_fp8 else -2, max_val=2 + ), + ) + # Create dtype torch tensor (gpu) + torch_tensor_gpu = torch_tensor_cpu.cuda() + + # Create f32 torch tensor (cpu) + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + # Create dtype cute tensor (gpu) + cute_tensor = from_dlpack(torch_tensor_gpu, assumed_align=16) + cute_tensor.element_type = dtype + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic(leading_dim=1) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + + return f32_torch_tensor, cute_tensor, torch_tensor_gpu + + q_ref, q_tensor, q_torch = create_and_permute_tensor( + b, s_q, h_r, h_k, d, in_dtype, is_dynamic_layout=True + ) + k_ref, k_tensor, k_torch = create_and_permute_tensor( + b, s_k, 1, h_k, d, in_dtype, is_dynamic_layout=True + ) + v_ref, v_tensor, v_torch = create_and_permute_tensor( + b, s_k, 1, h_k, d, in_dtype, is_dynamic_layout=True + ) + o_ref, o_tensor, o_torch = create_and_permute_tensor( + b, s_q, h_r, h_k, d, out_dtype, is_dynamic_layout=True + ) + + mma_tiler = (*mma_tiler_mn, d) + + mask_type = 
MaskType.NO_MASK + if has_casual_mask: + mask_type = MaskType.CAUSAL_MASK + else: + if s_k % mma_tiler_mn[1] != 0: + mask_type = MaskType.RESIDUAL_MASK + + fmha = BlackwellFusedMultiHeadAttentionForward( + qk_acc_dtype, + pv_acc_dtype, + mma_tiler, + is_persistent, + mask_type, + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + + if scale_softmax == 0.0: # default to 1/sqrt(d) + scale_softmax = 1.0 / math.sqrt(q_shape[1]) + log2_e = math.log2( + math.exp(1.0) + ) # gpu uses exp2 for perf concerns, we need an extra factor 'log2_e' here + + scale_softmax = scale_q * scale_k * scale_softmax + scale_softmax_log2 = scale_softmax * log2_e + scale_output = scale_v * inv_scale_o + + print("Compiling kernel with cute.compile ...") + start_time = time.time() + # compile fmha kernel + compiled_fmha = cute.compile( + fmha, + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + compilation_time = time.time() - start_time + print(f"Compilation time: {compilation_time:.4f} seconds") + + # Warmup + for _ in range(warmup_iterations): + compiled_fmha( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + + # Execute kernel + for _ in range(iterations): + compiled_fmha( + q_tensor, + k_tensor, + v_tensor, + o_tensor, + scale_softmax_log2, + scale_output, + current_stream, + ) + + torch.cuda.synchronize() + + def run_torch_fmha( + q, k, v, scale_softmax=1.0, scale_output=1.0, has_casual_mask=False + ): + s_q, d, h_r, h_k, b = q.shape + s_k = k.shape[0] + + # broadcast k and v to have the same shape as q + k = k.expand(s_k, d, h_r, h_k, b) + v = v.expand(s_k, d, h_r, h_k, b) + + q_tmp = q.permute(4, 2, 3, 0, 1).contiguous().view(b, -1, s_q, d) + k_tmp = k.permute(4, 2, 3, 0, 1).contiguous().view(b, -1, s_k, d) + v_tmp = v.permute(4, 
2, 3, 0, 1).contiguous().view(b, -1, s_k, d) + + ref = F.scaled_dot_product_attention( + q_tmp, + k_tmp, + v_tmp, + attn_mask=None, + dropout_p=0.0, + scale=scale_softmax, + is_causal=has_casual_mask, + ) + ref = ref.view(b, h_r, h_k, s_q, d).permute(3, 4, 1, 2, 0) * scale_output + + return ref + + if not skip_ref_check: + print("Verifying results...") + ref = run_torch_fmha( + q_ref, k_ref, v_ref, scale_softmax, scale_output, has_casual_mask + ) + + # Copy gpu result back + gpu_o = o_torch.cpu() + + # convert ref to out_type + if out_dtype == cutlass.Float16: + ref_o = ref.to(cutlass.torch.dtype(out_dtype)) + elif out_dtype in {cutlass.Float8E4M3FN, cutlass.Float8E5M2}: + # convert ref : f32 -> fp8 -> f32 + permute_order_0 = (4, 0, 2, 3, 1) + permute_order_1 = (1, 4, 2, 3, 0) + + shape = (b, s_q, h_r, h_k, d) + + f8_torch_tensor = cutlass.torch.create_and_permute_torch_tensor( + shape, + torch.uint8, + permute_order=permute_order_1, + init_type=cutlass.torch.TensorInitType.SKIP, + ).cuda() + + # Create dtype tensor (gpu) + ref_o_tensor = from_dlpack( + f8_torch_tensor, assumed_align=16 + ).mark_layout_dynamic(leading_dim=1) + ref_o_tensor.element_type = out_dtype + ref_o_tensor = cutlass.torch.convert_cute_tensor( + # ref for torch tensor is contiguous in shape (b, h_r, h_k, s_q, d), but shape is (s, d, h_r, h_k, b) + # need to make it contiguous first then permute + ref.permute(permute_order_0).contiguous().permute(permute_order_1).cuda(), + ref_o_tensor, + out_dtype, + is_dynamic_layout=True, + ) + + ref_o = f8_torch_tensor.cpu() + + # uint8 check; the minimum difference is 1 + tolerance = 2 + else: + pass + + # Assert close results + torch.testing.assert_close(gpu_o, ref_o, atol=tolerance, rtol=1e-05) + print("Results verified successfully!") + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> Tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid 
format. Expected comma-separated integers." + ) + + parser = argparse.ArgumentParser(description="Example of FMHA on Blackwell.") + + parser.add_argument( + "--in_dtype", + type=cutlass.dtype, + default=cutlass.Float16, + help="Input data type", + ) + + parser.add_argument( + "--out_dtype", + type=cutlass.dtype, + default=cutlass.Float16, + help="Output data type", + ) + + parser.add_argument( + "--qk_acc_dtype", + type=cutlass.dtype, + default=cutlass.Float32, + help="QK accumulator data type", + ) + + parser.add_argument( + "--pv_acc_dtype", + type=cutlass.dtype, + default=cutlass.Float32, + help="PV accumulator data type", + ) + + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="MMA tile shape (M, N)", + ) + + parser.add_argument( + "--is_persistent", + action="store_true", + help="Is persistent", + ) + + parser.add_argument( + "--has_casual_mask", + action="store_true", + help="Whether to use casual mask", + ) + + parser.add_argument( + "--q_shape", + type=parse_comma_separated_ints, + default=(1, 256, 8, 128), + help="Shape of Q (B, S_q, H, D)", + ) + + parser.add_argument( + "--k_shape", + type=parse_comma_separated_ints, + default=(1, 256, 8, 128), + help="Shape of K (B, S_k, H_k, D)", + ) + + parser.add_argument( + "--scale_q", + type=float, + default=1.0, + help="Scaling factors to dequantize Q", + ) + + parser.add_argument( + "--scale_k", + type=float, + default=1.0, + help="Scaling factors to dequantize K", + ) + + parser.add_argument( + "--scale_v", + type=float, + default=1.0, + help="Scaling factors to dequantize V", + ) + + parser.add_argument( + "--inv_scale_o", + type=float, + default=1.0, + help="Scaling factor to quantize O", + ) + + parser.add_argument( + "--scale_softmax", + type=float, + default=0.0, + help="Scaling factor to scale S (i.e. 
Q*K); if zero, defaults to 1/sqrt(D)", + ) + + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + + parser.add_argument( + "--warmup_iterations", + type=int, + default=0, + help="Number of iterations for warmup", + ) + + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations after warmup", + ) + + parser.add_argument( + "--skip_ref_check", + action="store_true", + help="Skip reference check", + ) + + args = parser.parse_args() + + if len(args.q_shape) != 4: + parser.error("--q_shape must contain exactly 4 values") + + if len(args.k_shape) != 4: + parser.error("--k_shape must contain exactly 4 values") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + run_fmha_and_verify( + args.q_shape, + args.k_shape, + args.in_dtype, + args.out_dtype, + args.qk_acc_dtype, + args.pv_acc_dtype, + args.mma_tiler_mn, + args.is_persistent, + args.has_casual_mask, + args.scale_q, + args.scale_k, + args.scale_v, + args.inv_scale_o, + args.scale_softmax, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + + print("PASS") diff --git a/examples/python/CuTeDSL/blackwell/grouped_gemm.py b/examples/python/CuTeDSL/blackwell/grouped_gemm.py new file mode 100644 index 00000000..d2e6f9ab --- /dev/null +++ b/examples/python/CuTeDSL/blackwell/grouped_gemm.py @@ -0,0 +1,2287 @@ +# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import argparse +import functools +from typing import List, Type, Union +from inspect import isclass + +import torch +import cuda.bindings.driver as cuda + +import cutlass +import cutlass.cute as cute +import cutlass.utils as utils +from cutlass.cute.nvgpu import cpasync, tcgen05 +import cutlass.utils.blackwell_helpers as sm100_utils +import cutlass.torch as cutlass_torch +from cutlass.cute.runtime import from_dlpack + +""" +A grouped GEMM example for the NVIDIA Blackwell SM100 architecture using CUTE DSL + +This example demonstrates an implementation of grouped GEMM using a TMA plus Blackwell SM100 TensorCore +warp-specialized persistent kernel. +The grouped GEMM workload computes a batch of GEMM operations with distinct problem sizes. 
Pointers to matrices +in global memory are passed to the kernel in an array (also held in global memory). Similarly, problem shapes and +strides are also stored in arrays in GMEM. + +This differs from "Batched Array" GEMM since the size of each GEMM problem in the grouped GEMM concept may be distinct. + +To run this example: + +.. code-block:: bash + + python examples/blackwell/grouped_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \ + --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \ + --num_groups 4 --tensormap_update_mode SMEM + +The above example command makes 4 groups of different m, n, k sizes. The Blackwell tcgen05 MMA tile shape +is specified as (128, 64) and the cluster shape is (1,1). The input, mma accumulator and output data type +are set as fp16, fp32 and fp16, respectively. + +To collect performance with NCU profiler: + +.. code-block:: bash + + ncu python examples/blackwell/grouped_gemm.py \ + --ab_dtype Float16 --c_dtype Float16 --acc_dtype Float32 \ + --mma_tiler_mn 128,64 --cluster_shape_mn 1,1 \ + --problem_sizes_mnkl "(8192,1280,32,1),(16,384,1536,1),(640,1280,16,1),(640,160,16,1)" \ + --num_groups 4 --tensormap_update_mode SMEM \ + --warmup_iterations 1 --iterations 10 --skip_ref_check + +There are some constrains for this example. Besides the constrains from the Balckwell dense GEMM persistent example, +there are also the following constrains: +* Only fp16 and bf16 data types are supported as inputs. +* Output data types could be fp16, bf16 or fp32. +* The contiguous dimension of each tensor must be at least 16 bytes aligned. +* The l mode(aka, batch size) for each group must be 1. +* The majorness for A, B and C must be the same across all groups. 
+""" + + +class GroupedGemmKernel: + + def __init__( + self, + acc_dtype: type[cutlass.Numeric], + use_2cta_instrs: bool, + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + tensormap_update_mode: utils.TensorMapUpdateMode = utils.TensorMapUpdateMode.SMEM, + ): + """Initializes the configuration for a Blackwell grouped GEMM kernel. + + Besides configurations for dense persistent GEMM, there is an extra config specific to grouped GEMM: + + Tensormap Update Mode: + - tensormap_update_mode: Specifies whether the tensormap is + updated in global memory(GMEM) or shared memory(SMEM). + The 2 modes are functionally equivalent and the difference are: + - We buffer 3 tensormaps in SMEM for A, B, and C tensors (each TMA descriptor takes 128B) when TMA updates performed on SMEM. + - Performance varies between modes depending on problem size; optimal choice differs across workloads. + + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[cutlass.Numeric] + :param use_2cta_instrs: Boolean, True to use cta_group=2 MMA variant. + :type use_2cta_instrs: bool + :param mma_tiler_mn: tuple (M, N) shape of the MMA instruction. + :type mma_tiler_mn: tuple[int, int] + :param cluster_shape_mn: tuple (ClusterM, ClusterN) shape of the cluster. + :type cluster_shape_mn: tuple[int, int] + :param tensormap_update_mode: Mode for updating the tensormap (GMEM or SMEM), defaults to SMEM. 
+ :type tensormap_update_mode: utils.TensorMapUpdateMode, optional + """ + self.acc_dtype: Type[cutlass.Numeric] = acc_dtype + self.use_2cta_instrs = use_2cta_instrs + self.cluster_shape_mn = cluster_shape_mn + # K dimension is deferred in _setup_attributes + self.mma_tiler = (*mma_tiler_mn, 1) + self.cta_group = ( + tcgen05.CtaGroup.TWO if use_2cta_instrs else tcgen05.CtaGroup.ONE + ) + + self.tensormap_update_mode = tensormap_update_mode + # Delegate tensormap ab initialization to MMA warp when SMEM mode is used for better latency hiding + self.delegate_tensormap_ab_init = ( + tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ) + + self.num_mcast_ctas_a = 1 + self.num_mcast_ctas_b = 1 + self.is_a_mcast = False + self.is_b_mcast = False + + self.occupancy = 1 + # Set specialized warp ids + self.epilog_warp_id = ( + 0, + 1, + 2, + 3, + ) + self.mma_warp_id = 4 + self.tma_warp_id = 5 + self.threads_per_cta = 32 * len( + (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id) + ) + # Set barrier id for cta sync, epilog sync, tmem ptr sync and tensormap update sync + self.cta_sync_bar_id = 0 + self.epilog_sync_bar_id = 1 + self.tmem_ptr_sync_bar_id = 2 + # Barrier ID used by MMA/TMA warps to signal A/B tensormap initialization completion + self.tensormap_ab_init_bar_id = 4 + self.num_smem_capacity = sm100_utils.SMEM_CAPACITY["sm100"] + self.num_tma_load_bytes = 0 + + def _setup_attributes(self): + """Set up configurations that are dependent on GEMM inputs + + Most of the implementation follows standard dense GEMM patterns, + with the key difference being additional consideration for SMEM + buffer needed for tensormap updates. 
+ """ + # Configure tiled mma + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + + # Compute mma/cluster/tile shapes + mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2]) + mma_inst_tile_k = 4 + self.mma_tiler = ( + self.mma_tiler[0], + self.mma_tiler[1], + mma_inst_shape_k * mma_inst_tile_k, + ) + self.cta_tile_shape_mnk = ( + self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape), + self.mma_tiler[1], + self.mma_tiler[2], + ) + self.cluster_tile_shape_mnk = tuple( + x * y for x, y in zip(self.cta_tile_shape_mnk, (*self.cluster_shape_mn, 1)) + ) + + # Compute cluster layout + self.cluster_layout_vmnk = cute.tiled_divide( + cute.make_layout((*self.cluster_shape_mn, 1)), + (tiled_mma.thr_id.shape,), + ) + + # Compute number of multicast CTAs for A/B + self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2]) + self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1]) + self.is_a_mcast = self.num_mcast_ctas_a > 1 + self.is_b_mcast = self.num_mcast_ctas_b > 1 + + # Compute epilogue subtile + self.epi_tile = utils.compute_epilogue_tile_shape( + self.cta_tile_shape_mnk, + self.use_2cta_instrs, + self.c_layout, + self.c_dtype, + ) + + # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory + self.num_acc_stage, self.num_ab_stage, self.num_epi_stage = ( + self._compute_stages( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.b_dtype, + self.epi_tile, + self.c_dtype, + self.c_layout, + self.num_smem_capacity, + self.occupancy, + ) + ) + + self.a_smem_layout_staged = sm100_utils.make_smem_layout_a( + tiled_mma, + self.mma_tiler, + self.a_dtype, + self.num_ab_stage, + ) + self.b_smem_layout_staged = sm100_utils.make_smem_layout_b( + tiled_mma, + self.mma_tiler, + self.b_dtype, + self.num_ab_stage, + ) + self.epi_smem_layout_staged = sm100_utils.make_smem_layout_epi( + self.c_dtype, + self.c_layout, 
+ self.epi_tile, + self.num_epi_stage, + ) + + tensor_smem_bytes = self._get_tensor_smem_bytes( + self.a_smem_layout_staged, + self.a_dtype, + self.b_smem_layout_staged, + self.b_dtype, + self.epi_smem_layout_staged, + self.c_dtype, + ) + mbar_smem_bytes = self._get_mbar_smem_bytes( + num_acc_stage=self.num_acc_stage, + num_ab_stage=self.num_ab_stage, + num_epi_stage=self.num_epi_stage, + ) + tensormap_smem_bytes = self._get_tensormap_smem_bytes( + self.tensormap_update_mode + ) + if ( + mbar_smem_bytes + + tensormap_smem_bytes + + GroupedGemmKernel.tensor_memory_management_bytes + > self.reserved_smem_bytes + ): + raise ValueError( + f"smem consumption for mbar and tensormap {mbar_smem_bytes + tensormap_smem_bytes} exceeds the " + f"reserved smem bytes {self.reserved_smem_bytes}" + ) + + # Compute the number of tensor memory allocation columns + self.num_tmem_alloc_cols = self._compute_num_tmem_alloc_cols( + tiled_mma, self.mma_tiler, self.num_acc_stage + ) + + @cute.jit + def __call__( + self, + initial_a: cute.Tensor, + initial_b: cute.Tensor, + initial_c: cute.Tensor, + group_count: cutlass.Constexpr[int], + problem_shape_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + total_num_clusters: cutlass.Constexpr[int], + tensormap_cute_tensor: cute.Tensor, + max_active_clusters: cutlass.Constexpr[int], + stream: cuda.CUstream, + ): + """Execute the GEMM operation in steps: + - Setup static attributes before smem/grid/tma computation + - Setup TMA load/store atoms and tensors + - Compute grid size with regard to hardware constraints + - Define shared storage for kernel + - Launch the kernel synchronously + + For grouped GEMM, tensor shapes, tensor strides, and tensor address are all provided + by different tensors in global memory. The "initial" tensors only carry data type and + majorness information. + + :param initial_a: Initial tensor A, used for data type and majorness information. 
+ :type initial_a: cute.Tensor + :param initial_b: Initial tensor B, used for data type and majorness information. + :type initial_b: cute.Tensor + :param initial_c: Initial tensor C, used for data type and majorness information. + :type initial_c: cute.Tensor + :param group_count: The number of GEMM groups. + :type group_count: cutlass.Constexpr[int] + :param problem_shape_mnkl: Tensor containing the (M, N, K, L) shape for each group. + :type problem_shape_mnkl: cute.Tensor + :param strides_abc: Tensor containing the strides for A, B, and C for each group. + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing the base addresses for A, B, and C for each group. + :type tensor_address_abc: cute.Tensor + :param total_num_clusters: Total number of clusters needed for all groups. + :type total_num_clusters: cutlass.Constexpr[int] + :param tensormap_cute_tensor: Tensor for storing tensormaps. + :type tensormap_cute_tensor: cute.Tensor + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + :param stream: CUDA stream for asynchronous execution. + :type stream: cuda.CUstream + :raises TypeError: If A and B data types do not match. 
+ """ + self.a_dtype = initial_a.element_type + self.b_dtype = initial_b.element_type + self.c_dtype = initial_c.element_type + self.a_major_mode = utils.LayoutEnum.from_tensor(initial_a).mma_major_mode() + self.b_major_mode = utils.LayoutEnum.from_tensor(initial_b).mma_major_mode() + self.c_layout = utils.LayoutEnum.from_tensor(initial_c) + if cutlass.const_expr(self.a_dtype != self.b_dtype): + raise TypeError(f"Type mismatch: {self.a_dtype} != {self.b_dtype}") + + # Setup attributes that dependent on gemm inputs + self._setup_attributes() + + tiled_mma = sm100_utils.make_trivial_tiled_mma( + self.a_dtype, + self.a_major_mode, + self.b_major_mode, + self.acc_dtype, + self.cta_group, + self.mma_tiler[:2], + ) + atom_thr_size = cute.size(tiled_mma.thr_id.shape) + + # Setup TMA load for A + a_op = self._get_tma_atom_kind(atom_thr_size, self.is_a_mcast) + a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0)) + tma_atom_a, tma_tensor_a = cute.nvgpu.make_tma_tile_atom_A( + a_op, + initial_a, + a_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + # Setup TMA load for B + b_op = self._get_tma_atom_kind(atom_thr_size, self.is_b_mcast) + b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0)) + tma_atom_b, tma_tensor_b = cute.nvgpu.make_tma_tile_atom_B( + b_op, + initial_b, + b_smem_layout, + self.mma_tiler, + tiled_mma, + self.cluster_layout_vmnk.shape, + ) + + a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout) + b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout) + self.num_tma_load_bytes = (a_copy_size + b_copy_size) * atom_thr_size + + # Setup TMA store for C + tma_atom_c = None + tma_tensor_c = None + c_cta_v_layout = cute.composition( + cute.make_identity_layout(initial_c.shape), self.epi_tile + ) + epi_smem_layout = cute.slice_(self.epi_smem_layout_staged, (None, None, 0)) + tma_atom_c, tma_tensor_c = cpasync.make_tma_tile_atom( + cpasync.CopyBulkTensorTileS2GOp(), + 
initial_c, + epi_smem_layout, + c_cta_v_layout, + ) + + self.tile_sched_params, grid = self._compute_grid( + total_num_clusters, self.cluster_shape_mn, max_active_clusters + ) + + self.buffer_align_bytes = 1024 + self.size_tensormap_in_i64 = ( + 0 + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.GMEM + ) + else GroupedGemmKernel.num_tensormaps + * GroupedGemmKernel.bytes_per_tensormap + // 8 + ) + + # Define shared storage for kernel + @cute.struct + class SharedStorage: + tensormap_buffer: cute.struct.MemRange[ + cutlass.Int64, self.size_tensormap_in_i64 + ] + ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage] + acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage] + tmem_dealloc_mbar_ptr: cutlass.Int64 + tmem_holding_buf: cutlass.Int32 + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC: cute.struct.Align[ + cute.struct.MemRange[ + self.c_dtype, + cute.cosize(self.epi_smem_layout_staged.outer), + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_M, MMA_K, STAGE) + sA: cute.struct.Align[ + cute.struct.MemRange[ + self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + # (MMA, MMA_N, MMA_K, STAGE) + sB: cute.struct.Align[ + cute.struct.MemRange[ + self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer) + ], + self.buffer_align_bytes, + ] + + self.shared_storage = SharedStorage + + # Launch the kernel synchronously + self.kernel( + tiled_mma, + tma_atom_a, + tma_tensor_a, + tma_atom_b, + tma_tensor_b, + tma_atom_c, + tma_tensor_c, + self.cluster_layout_vmnk, + self.a_smem_layout_staged, + self.b_smem_layout_staged, + self.epi_smem_layout_staged, + self.epi_tile, + self.tile_sched_params, + group_count, + problem_shape_mnkl, + strides_abc, + tensor_address_abc, + tensormap_cute_tensor, + ).launch( + grid=grid, 
+ block=[self.threads_per_cta, 1, 1], + cluster=(*self.cluster_shape_mn, 1), + smem=self.shared_storage.size_in_bytes(), + stream=stream, + ) + return + + # GPU device kernel + @cute.kernel + def kernel( + self, + tiled_mma: cute.TiledMma, + tma_atom_a: cute.CopyAtom, + mA_mkl: cute.Tensor, + tma_atom_b: cute.CopyAtom, + mB_nkl: cute.Tensor, + tma_atom_c: cute.CopyAtom, + mC_mnl: cute.Tensor, + cluster_layout_vmnk: cute.Layout, + a_smem_layout_staged: cute.ComposedLayout, + b_smem_layout_staged: cute.ComposedLayout, + epi_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout], + epi_tile: cute.Tile, + tile_sched_params: utils.PersistentTileSchedulerParams, + group_count: cutlass.Constexpr[int], + problem_sizes_mnkl: cute.Tensor, + strides_abc: cute.Tensor, + ptrs_abc: cute.Tensor, + tensormaps: cute.Tensor, + ): + """ + GPU device kernel performing the grouped GEMM computation. + """ + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + # + # Prefetch tma desc + # + if warp_idx == self.tma_warp_id: + cpasync.prefetch_descriptor(tma_atom_a) + cpasync.prefetch_descriptor(tma_atom_b) + cpasync.prefetch_descriptor(tma_atom_c) + + use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2 + + # + # Setup cta/thread coordinates + # + # Coord inside cluster + bid = cute.arch.block_idx() + mma_tile_coord_v = bid[0] % cute.size(tiled_mma.thr_id.shape) + is_leader_cta = mma_tile_coord_v == 0 + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord( + cta_rank_in_cluster + ) + # Coord inside cta + tidx, _, _ = cute.arch.thread_idx() + + # + # Alloc and init: tensormap buffer, a+b full/empty, accumulator full/empty, tensor memory dealloc barrier + # + smem = utils.SmemAllocator() + storage = smem.allocate(self.shared_storage) + + tensormap_a_smem_ptr = None + tensormap_b_smem_ptr = None + tensormap_c_smem_ptr = None + if 
cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_smem_ptr = storage.tensormap_buffer.data_ptr() + tensormap_a_smem_ptr = tensormap_smem_ptr + tensormap_b_smem_ptr = ( + tensormap_a_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + tensormap_c_smem_ptr = ( + tensormap_b_smem_ptr + GroupedGemmKernel.bytes_per_tensormap // 8 + ) + ab_full_mbar_ptr = storage.ab_full_mbar_ptr.data_ptr() + ab_empty_mbar_ptr = storage.ab_empty_mbar_ptr.data_ptr() + acc_full_mbar_ptr = storage.acc_full_mbar_ptr.data_ptr() + acc_empty_mbar_ptr = storage.acc_empty_mbar_ptr.data_ptr() + tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr + tmem_holding_buf = storage.tmem_holding_buf + + # init barrier for loading A, B with TMA + if warp_idx == self.epilog_warp_id[0]: + for k_stage in range(self.num_ab_stage): + num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(ab_full_mbar_ptr + k_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + ab_empty_mbar_ptr + k_stage, num_tma_producer + ) + # Accumulator barrier init + if warp_idx == self.mma_warp_id: + for acc_stage in range(self.num_acc_stage): + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt(acc_full_mbar_ptr + acc_stage, 1) + cute.arch.mbarrier_init_arrive_cnt( + acc_empty_mbar_ptr + acc_stage, 8 if use_2cta_instrs else 4 + ) + # Tensor memory dealloc barrier init + if use_2cta_instrs: + if warp_idx == self.tma_warp_id: + num_tmem_dealloc_threads = 32 + with cute.arch.elect_one(): + cute.arch.mbarrier_init_arrive_cnt( + tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads + ) + cute.arch.mbarrier_init_fence() + + # Cluster arrive after barrier init + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_arrive_relaxed() + + # + # Setup smem tensor A/B/C + # + # (EPI_TILE_M, EPI_TILE_N, STAGE) + sC = storage.sC.get_tensor( + epi_smem_layout_staged.outer, 
swizzle=epi_smem_layout_staged.inner + ) + # (MMA, MMA_M, MMA_K, STAGE) + sA = storage.sA.get_tensor( + a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner + ) + # (MMA, MMA_N, MMA_K, STAGE) + sB = storage.sB.get_tensor( + b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner + ) + + # + # Compute multicast mask for A/B buffer full and empty + # + a_full_mcast_mask = None + b_full_mcast_mask = None + ab_empty_mcast_mask = None + if self.is_a_mcast or self.is_b_mcast or use_2cta_instrs: + a_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2 + ) + b_full_mcast_mask = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1 + ) + ab_empty_mcast_mask = a_full_mcast_mask | b_full_mcast_mask + acc_full_mcast_mask = None + if use_2cta_instrs: + acc_full_mcast_mask = cute.make_layout_image_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk, mode=0 + ) + block_in_cluster_coord_vmnk_peer = ( + block_in_cluster_coord_vmnk[0] ^ 1, + *block_in_cluster_coord_vmnk[1:], + ) + a_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=2 + ) + b_full_mcast_mask_peer = cpasync.create_tma_multicast_mask( + cluster_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=1 + ) + ab_empty_mcast_mask = ( + a_full_mcast_mask_peer + | b_full_mcast_mask_peer + | cutlass.Int16( + 0 if ab_empty_mcast_mask is None else ab_empty_mcast_mask + ) + ) + + # + # Local_tile partition global tensors + # + # (bM, bK, loopM, loopK, loopL) + gA_mkl = cute.local_tile( + mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None) + ) + # (bN, bK, loopN, loopK, loopL) + gB_nkl = cute.local_tile( + mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None) + ) + # (bM, bN, loopM, loopN, loopL) + gC_mnl = cute.local_tile( + mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), 
(None, None, None) + ) + + # + # Partition global tensor for TiledMMA_A/B/C + # + thr_mma = tiled_mma.get_slice(mma_tile_coord_v) + # (MMA, MMA_M, MMA_K, loopM, loopK, loopL) + tCgA = thr_mma.partition_A(gA_mkl) + # (MMA, MMA_N, MMA_K, loopN, loopK, loopL) + tCgB = thr_mma.partition_B(gB_nkl) + # (MMA, MMA_M, MMA_N, loopM, loopN, loopL) + tCgC = thr_mma.partition_C(gC_mnl) + + # + # Partition global/shared tensor for load A, B with TMA + # + a_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tAsA, tAgA = cpasync.tma_partition( + tma_atom_a, + block_in_cluster_coord_vmnk[2], + a_cta_layout, + cute.group_modes(sA, 0, 3), + cute.group_modes(tCgA, 0, 3), + ) + # TMA load B partition_S/D + b_cta_layout = cute.make_layout( + cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape + ) + # ((atom_v, rest_v), STAGE) + # ((atom_v, rest_v), loopM, loopK, loopL) + tBsB, tBgB = cpasync.tma_partition( + tma_atom_b, + block_in_cluster_coord_vmnk[1], + b_cta_layout, + cute.group_modes(sB, 0, 3), + cute.group_modes(tCgB, 0, 3), + ) + + # + # Partition shared/tensor memory tensor for TiledMMA_A/B/C + # + # (MMA, MMA_M, MMA_K, STAGE) + tCrA = tiled_mma.make_fragment_A(sA) + # (MMA, MMA_N, MMA_K, STAGE) + tCrB = tiled_mma.make_fragment_B(sB) + # (MMA, MMA_M, MMA_N) + acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2]) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_fake = tiled_mma.make_fragment_C( + cute.append(acc_shape, self.num_acc_stage) + ) + + # + # Cluster wait before tensor memory alloc + # + if cute.size(self.cluster_shape_mn) > 1: + cute.arch.cluster_wait() + else: + cute.arch.barrier( + barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta + ) + + # + # Get tensormap buffer address + # + grid_dim = cute.arch.grid_dim() + tensormap_workspace_idx = ( + bid[2] * grid_dim[1] * grid_dim[0] + bid[1] * grid_dim[0] + bid[0] + ) + + 
tensormap_manager = utils.TensorMapManager( + self.tensormap_update_mode, GroupedGemmKernel.bytes_per_tensormap + ) + tensormap_a_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 0, None)].iterator + ) + tensormap_b_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 1, None)].iterator + ) + tensormap_c_ptr = tensormap_manager.get_tensormap_ptr( + tensormaps[(tensormap_workspace_idx, 2, None)].iterator + ) + # Setup tensormap initialization pointer based on the mode + if cutlass.const_expr( + self.tensormap_update_mode == utils.TensorMapUpdateMode.SMEM + ): + tensormap_a_init_ptr = tensormap_a_smem_ptr + tensormap_b_init_ptr = tensormap_b_smem_ptr + tensormap_c_init_ptr = tensormap_c_smem_ptr + else: + tensormap_a_init_ptr = tensormap_a_ptr + tensormap_b_init_ptr = tensormap_b_ptr + tensormap_c_init_ptr = tensormap_c_ptr + + # + # Specialized TMA load warp + # + if warp_idx == self.tma_warp_id: + # Initialize tensormaps for A, B + if cutlass.const_expr(self.delegate_tensormap_ab_init == False): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.tma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.tma_warp_id + ) + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + tensormap_init_done = cutlass.Boolean(False) + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + work_tile = tile_sched.initial_work_tile_info() + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + 
grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != last_group_idx + # skip tensormap update if we're working on the same group + if is_group_changed: + real_tensor_a = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.a_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 0, # 0 for tensor A + ) + real_tensor_b = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.b_dtype, + ( + grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 1, # 1 for tensor B + ) + # wait tensormap initialization complete before update + if tensormap_init_done == False: + if cutlass.const_expr(self.delegate_tensormap_ab_init): + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, + number_of_threads=64, + ) + tensormap_manager.fence_tensormap_initialization() + tensormap_init_done = True + + tensormap_manager.update_tensormap( + (real_tensor_a, real_tensor_b), + (tma_atom_a, tma_atom_b), + (tensormap_a_ptr, tensormap_b_ptr), + self.tma_warp_id, + (tensormap_a_smem_ptr, tensormap_b_smem_ptr), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + + # + # Slice to per mma tile index + # + # ((atom_v, rest_v), loopK) + tAgA_slice = tAgA[ + (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2]) + ] + # ((atom_v, rest_v), loopK) + tBgB_slice = tBgB[ + (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2]) + ] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt 
+ + # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + tma_wr_k_block = cutlass.Int32(0) + smem_wr_buffer = (num_prev_k_blk + tma_wr_k_block) % self.num_ab_stage + tma_wr_ab_empty_phase = ( + num_prev_k_blk + tma_wr_k_block + ) // self.num_ab_stage % 2 ^ 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer, + tma_wr_ab_empty_phase, + ) + # ensure the update to tensormap has completed before using it + if is_group_changed: + tensormap_manager.fence_tensormap_update(tensormap_a_ptr) + tensormap_manager.fence_tensormap_update(tensormap_b_ptr) + # + # Tma load loop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + tma_wr_k_block_next = tma_wr_k_block + 1 + smem_wr_buffer_next = ( + num_prev_k_blk + tma_wr_k_block_next + ) % self.num_ab_stage + tma_wr_ab_empty_phase_next = ( + tma_wr_ab_empty_phase ^ 1 + if smem_wr_buffer_next == 0 + else tma_wr_ab_empty_phase + ) + + smem_full_mbar_ptr = ab_full_mbar_ptr + smem_wr_buffer + + # Wait for AB buffer empty + if peek_ab_empty_status == 0: + cute.arch.mbarrier_wait( + ab_empty_mbar_ptr + smem_wr_buffer, tma_wr_ab_empty_phase + ) + + # Init AB buffer full transaction byte + if is_leader_cta: + with cute.arch.elect_one(): + cute.arch.mbarrier_init_tx_bytes( + smem_full_mbar_ptr, self.num_tma_load_bytes + ) + + # Load A/B with TMA + cute.copy( + tma_atom_a, + tAgA_slice[(None, tma_wr_k_block)], + tAsA[(None, smem_wr_buffer)], + tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=a_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_a_ptr, + cute.AddressSpace.generic, + ), + ) + cute.copy( + tma_atom_b, + tBgB_slice[(None, tma_wr_k_block)], + tBsB[(None, smem_wr_buffer)], + tma_bar_ptr=smem_full_mbar_ptr, + mcast_mask=b_full_mcast_mask, + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_b_ptr, + cute.AddressSpace.generic, + ), + ) + + # Peek (try_wait) AB 
buffer empty for k_block = prefetch_k_block_cnt + k_block + 1 + peek_ab_empty_status = cute.arch.conditional_mbarrier_try_wait( + tma_wr_k_block_next < cur_k_block_cnt, + ab_empty_mbar_ptr + smem_wr_buffer_next, + tma_wr_ab_empty_phase_next, + ) + + tma_wr_k_block = tma_wr_k_block_next + smem_wr_buffer = smem_wr_buffer_next + tma_wr_ab_empty_phase = tma_wr_ab_empty_phase_next + + # Advance to next tile + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Specialized MMA warp + # + if warp_idx == self.mma_warp_id: + # initilize tensormap A, B for TMA warp + if cutlass.const_expr(self.delegate_tensormap_ab_init): + tensormap_manager.init_tensormap_from_atom( + tma_atom_a, tensormap_a_init_ptr, self.mma_warp_id + ) + tensormap_manager.init_tensormap_from_atom( + tma_atom_b, tensormap_b_init_ptr, self.mma_warp_id + ) + # signal tensormap initialization has finished + cute.arch.barrier( + barrier_id=self.tensormap_ab_init_bar_id, number_of_threads=64 + ) + # Bar sync for retrieve tmem ptr from shared mem + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + + # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = 
tile_sched.initial_work_tile_info() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + # MMA warp is only interested in number of tiles along K dimension + cur_k_block_cnt, cur_group_idx = ( + group_gemm_ts_helper.search_cluster_tile_count_k( + cur_tile_coord, + problem_sizes_mnkl, + ) + ) + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (MMA, MMA_M, MMA_N) + tCtAcc = tCtAcc_base[(None, None, None, acc_buf_idx)] + + num_prev_k_blk = total_k_block_cnt + total_k_block_cnt += cur_k_block_cnt + + # Peek (try_wait) AB buffer full for k_block = 0 + mma_rd_k_block = cutlass.Int32(0) + smem_rd_buffer = (num_prev_k_blk + mma_rd_k_block) % self.num_ab_stage + need_check_rd_buffer_full = ( + mma_rd_k_block < cur_k_block_cnt and is_leader_cta + ) + mma_rd_ab_full_phase = ( + (num_prev_k_blk + mma_rd_k_block) // self.num_ab_stage % 2 + ) + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer, + mma_rd_ab_full_phase, + ) + + # + # Wait for accumulator buffer empty + # + if is_leader_cta: + acc_empty_phase = ( + tile_sched.num_tiles_executed // self.num_acc_stage % 2 ^ 1 + ) + cute.arch.mbarrier_wait( + acc_empty_mbar_ptr + acc_buf_idx, acc_empty_phase + ) + + # + # Reset the ACCUMULATE field for each tile + # + tiled_mma.set(tcgen05.Field.ACCUMULATE, False) + + # + # Mma mainloop + # + for k_block in cutlass.range_dynamic(0, cur_k_block_cnt, 1, unroll=1): + mma_rd_k_block_next = cutlass.Int32(k_block + 1) + smem_rd_buffer_next = ( + num_prev_k_blk + mma_rd_k_block_next + ) % self.num_ab_stage + mma_rd_ab_full_phase_next = ( + mma_rd_ab_full_phase ^ 1 + if smem_rd_buffer_next == 0 + else mma_rd_ab_full_phase + ) + if is_leader_cta: + # Wait for AB buffer full + if peek_ab_full_status == 0: + cute.arch.mbarrier_wait( + ab_full_mbar_ptr + 
smem_rd_buffer, mma_rd_ab_full_phase + ) + + # tCtAcc += tCrA * tCrB + num_kphases = cute.size(tCrA, mode=[2]) + for kphase_idx in range(num_kphases): + kphase_coord = (None, None, kphase_idx, smem_rd_buffer) + + cute.gemm( + tiled_mma, + tCtAcc, + tCrA[kphase_coord], + tCrB[kphase_coord], + tCtAcc, + ) + # Enable accumulate on tCtAcc after first kphase + tiled_mma.set(tcgen05.Field.ACCUMULATE, True) + + # Async arrive AB buffer empty + with cute.arch.elect_one(): + tcgen05.commit( + ab_empty_mbar_ptr + smem_rd_buffer, + ab_empty_mcast_mask, + self.cta_group, + ) + + # Peek (try_wait) AB buffer full for k_block = k_block + 1 + need_check_rd_buffer_full = ( + mma_rd_k_block_next < cur_k_block_cnt and is_leader_cta + ) + + peek_ab_full_status = cute.arch.conditional_mbarrier_try_wait( + need_check_rd_buffer_full, + ab_full_mbar_ptr + smem_rd_buffer_next, + mma_rd_ab_full_phase_next, + ) + + mma_rd_k_block = mma_rd_k_block_next + smem_rd_buffer = smem_rd_buffer_next + mma_rd_ab_full_phase = mma_rd_ab_full_phase_next + + # + # Async arrive accumulator buffer full + # + if is_leader_cta: + with cute.arch.elect_one(): + tcgen05.commit( + acc_full_mbar_ptr + acc_buf_idx, + acc_full_mcast_mask, + self.cta_group, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + + # + # Specialized epilogue warps + # + if warp_idx < self.mma_warp_id: + # initialize tensorap for C + tensormap_manager.init_tensormap_from_atom( + tma_atom_c, + tensormap_c_init_ptr, + self.epilog_warp_id[0], + ) + # Alloc tensor memory buffer + if warp_idx == self.epilog_warp_id[0]: + cute.arch.alloc_tmem( + self.num_tmem_alloc_cols, + tmem_holding_buf, + is_two_cta=use_2cta_instrs, + ) + + # + # Bar sync for retrieve tensor memory ptr from shared memory + # + tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id)) + cute.arch.barrier( + barrier_id=self.tmem_ptr_sync_bar_id, + number_of_threads=tmem_ptr_read_threads, + ) + 
+ # + # Retrieving tensor memory ptr and make accumulator tensor + # + tmem_ptr = cute.arch.retrieve_tmem_ptr( + self.acc_dtype, + alignment=16, + ptr_to_buffer_holding_addr=tmem_holding_buf, + ) + # (MMA, MMA_M, MMA_N, STAGE) + tCtAcc_base = cute.make_tensor(tmem_ptr, tCtAcc_fake.layout) + + epi_tidx = tidx + # + # Partition for epilogue + # + tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = ( + self.epilog_tmem_copy_and_partition( + epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs + ) + ) + + tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype) + tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition( + tiled_copy_t2r, tTR_rC, epi_tidx, sC + ) + tma_atom_c, bSG_sC, bSG_gC_partitioned = ( + self.epilog_gmem_copy_and_partition(tma_atom_c, tCgC, epi_tile, sC) + ) + + # + # Persistent tile scheduling loop + # + tile_sched = utils.StaticPersistentTileScheduler.create( + tile_sched_params, bid, grid_dim + ) + # grouped gemm tile scheduler helper will compute the group index for the tile we're working on + group_gemm_ts_helper = utils.GroupedGemmTileSchedulerHelper( + group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + utils.create_initial_search_state(), + ) + + work_tile = tile_sched.initial_work_tile_info() + # wait tensormap initialization complete before update + tensormap_manager.fence_tensormap_initialization() + # tile count we have searched + total_k_block_cnt = cutlass.Int32(0) + # group index of last tile + last_group_idx = cutlass.Int32(-1) + while work_tile.is_valid_tile: + cur_tile_coord = work_tile.tile_idx + grouped_gemm_cta_tile_info = group_gemm_ts_helper.delinearize_z( + cur_tile_coord, + problem_sizes_mnkl, + ) + cur_group_idx = grouped_gemm_cta_tile_info.group_idx + is_group_changed = cur_group_idx != last_group_idx + if is_group_changed: + # construct tensor C based on real address, shape and stride information + real_tensor_c = self.make_tensor_for_tensormap_update( + cur_group_idx, + self.c_dtype, + ( + 
grouped_gemm_cta_tile_info.problem_shape_m, + grouped_gemm_cta_tile_info.problem_shape_n, + grouped_gemm_cta_tile_info.problem_shape_k, + ), + strides_abc, + ptrs_abc, + 2, # 2 for tensor C + ) + tensormap_manager.update_tensormap( + ((real_tensor_c),), + ((tma_atom_c),), + ((tensormap_c_ptr),), + self.epilog_warp_id[0], + (tensormap_c_smem_ptr,), + ) + + mma_tile_coord_mnl = ( + grouped_gemm_cta_tile_info.cta_tile_idx_m + // cute.size(tiled_mma.thr_id.shape), + grouped_gemm_cta_tile_info.cta_tile_idx_n, + 0, + ) + cur_k_block_cnt = grouped_gemm_cta_tile_info.cta_tile_count_k + total_k_block_cnt += cur_k_block_cnt + + # + # Slice to per mma tile index + # + # ((ATOM_V, REST_V), EPI_M, EPI_N) + bSG_gC = bSG_gC_partitioned[ + ( + None, + None, + None, + *mma_tile_coord_mnl, + ) + ] + + # Set tensor memory buffer for current tile + acc_buf_idx = tile_sched.num_tiles_executed % self.num_acc_stage + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M) + tTR_tAcc = tTR_tAcc_base[(None, None, None, None, None, acc_buf_idx)] + + # + # Wait for accumulator buffer full + # + acc_full_phase = tile_sched.num_tiles_executed // self.num_acc_stage % 2 + cute.arch.mbarrier_wait(acc_full_mbar_ptr + acc_buf_idx, acc_full_phase) + + tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc)) + bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC)) + # ensure the update to tensormap has completed before using it + if is_group_changed: + if warp_idx == self.epilog_warp_id[0]: + tensormap_manager.fence_tensormap_update(tensormap_c_ptr) + # + # Store accumulator to global memory in subtiles + # + subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3]) + num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt + for subtile_idx in cutlass.range_dynamic(subtile_cnt): + # + # Load accumulator from tensor memory buffer to register + # + tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)] + cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc) + + # + # Convert to output type + # + acc_vec = 
tiled_copy_r2s.retile(tTR_rAcc).load() + tRS_rC.store(acc_vec.to(self.c_dtype)) + # + # Store C to shared memory + # + epi_buffer = (num_prev_subtiles + subtile_idx) % self.num_epi_stage + cute.copy( + tiled_copy_r2s, + tRS_rC, + tRS_sC[(None, None, None, epi_buffer)], + ) + # Fence and barrier to make sure shared memory store is visible to TMA store + cute.arch.fence_proxy( + cute.arch.ProxyKind.async_shared, + space=cute.arch.SharedSpace.shared_cta, + ) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # store C to global memory with TMA + # + if warp_idx == self.epilog_warp_id[0]: + cute.copy( + tma_atom_c, + bSG_sC[(None, epi_buffer)], + bSG_gC[(None, subtile_idx)], + tma_desc_ptr=tensormap_manager.get_tensormap_ptr( + tensormap_c_ptr, + cute.AddressSpace.generic, + ), + ) + cute.arch.cp_async_bulk_commit_group() + cute.arch.cp_async_bulk_wait_group( + self.num_epi_stage - 1, read=True + ) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, + number_of_threads=epilog_threads, + ) + # + # Async arrive accumulator buffer empty + # + with cute.arch.elect_one(): + cute.arch.mbarrier_arrive( + acc_empty_mbar_ptr + acc_buf_idx, + cta_rank_in_cluster // 2 * 2 if use_2cta_instrs else None, + ) + + # + # Advance to next tile + # + tile_sched.advance_to_next_work() + work_tile = tile_sched.get_current_work() + last_group_idx = cur_group_idx + + # + # Dealloc the tensor memory buffer + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs) + epilog_threads = 32 * len(self.epilog_warp_id) + cute.arch.barrier( + barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads + ) + if warp_idx == self.epilog_warp_id[0]: + if use_2cta_instrs: + cute.arch.mbarrier_arrive( + tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1 + ) + cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0) + cute.arch.dealloc_tmem( + 
tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs + ) + + # + # Wait a/b buffer empty + # + if warp_idx == self.epilog_warp_id[0]: + cute.arch.mbarrier_wait( + (ab_empty_mbar_ptr + ((total_k_block_cnt - 1) % self.num_ab_stage)), + (((total_k_block_cnt - 1) // self.num_ab_stage) % 2), + ) + + @cute.jit + def make_tensor_for_tensormap_update( + self, + group_idx: cutlass.Int32, + dtype: Type[cutlass.Numeric], + problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32], + strides_abc: cute.Tensor, + tensor_address_abc: cute.Tensor, + tensor_index: int, + ): + """Extract stride and tensor address for a given group and construct a global tensor. + + This function is used within the kernel to dynamically create a CUTE tensor + representing A, B, or C for the current group being processed, using the + group-specific address, shape, and stride information. + + :param group_idx: The index of the current group within the grouped GEMM. + :type group_idx: cutlass.Int32 + :param dtype: The data type of the tensor elements (e.g., cutlass.Float16). + :type dtype: Type[cutlass.Numeric] + :param problem_shape_mnk: The (M, N, K) problem shape for the current group. + :type problem_shape_mnk: tuple[cutlass.Int32, cutlass.Int32, cutlass.Int32] + :param strides_abc: Tensor containing strides for A, B, C for all groups. Layout: (group_count, 3, 2). + :type strides_abc: cute.Tensor + :param tensor_address_abc: Tensor containing global memory addresses for A, B, C for all groups. Layout: (group_count, 3). + :type tensor_address_abc: cute.Tensor + :param tensor_index: Specifies which tensor to create: 0 for A, 1 for B, 2 for C. + :type tensor_index: int + :return: A CUTE tensor representing the requested global memory tensor (A, B, or C) for the specified group. + :rtype: cute.Tensor + :raises TypeError: If the provided dtype is not a subclass of cutlass.Numeric. 
+ """ + ptr_i64 = tensor_address_abc[(group_idx, tensor_index)] + if cutlass.const_expr( + not isclass(dtype) or not issubclass(dtype, cutlass.Numeric) + ): + raise TypeError( + f"dtype must be a type of cutlass.Numeric, got {type(dtype)}" + ) + tensor_gmem_ptr = cute.make_ptr( + dtype, ptr_i64, cute.AddressSpace.gmem, assumed_align=16 + ) + + strides_tensor_gmem = strides_abc[(group_idx, tensor_index, None)] + strides_tensor_reg = cute.make_fragment( + cute.make_layout(2), + strides_abc.element_type, + ) + cute.autovec_copy(strides_tensor_gmem, strides_tensor_reg) + stride_mn = strides_tensor_reg[0] + stride_k = strides_tensor_reg[1] + c1 = cutlass.Int32(1) + c0 = cutlass.Int32(0) + + if cutlass.const_expr(tensor_index == 0): # tensor A + m = problem_shape_mnk[0] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, k, c1), stride=(stride_mn, stride_k, c0)), + ) + elif cutlass.const_expr(tensor_index == 1): # tensor B + n = problem_shape_mnk[1] + k = problem_shape_mnk[2] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((n, k, c1), stride=(stride_mn, stride_k, c0)), + ) + else: # tensor C + m = problem_shape_mnk[0] + n = problem_shape_mnk[1] + return cute.make_tensor( + tensor_gmem_ptr, + cute.make_layout((m, n, c1), stride=(stride_mn, stride_k, c0)), + ) + + def epilog_tmem_copy_and_partition( + self, + tidx: cutlass.Int32, + tAcc: cute.Tensor, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + use_2cta_instrs: Union[cutlass.Boolean, bool], + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination). 
+ + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param tAcc: The accumulator tensor to be copied and partitioned + :type tAcc: cute.Tensor + :param gC_mnl: The global tensor C + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler + :type epi_tile: cute.Tile + :param use_2cta_instrs: Whether use_2cta_instrs is enabled + :type use_2cta_instrs: bool + + :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where: + - tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + - tTR_tAcc: The partitioned accumulator tensor + - tTR_rAcc: The accumulated tensor in register used to hold t2r results + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + # Make tiledCopy for tensor memory load(t2r) + copy_atom_t2r = sm100_utils.get_tmem_load_op( + self.cta_tile_shape_mnk, + self.c_layout, + self.c_dtype, + self.acc_dtype, + epi_tile, + use_2cta_instrs, + ) + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE) + tAcc_epi = cute.flat_divide( + tAcc[((None, None), 0, 0, None)], + epi_tile, + ) + # (EPI_TILE_M, EPI_TILE_N) + tiled_copy_t2r = tcgen05.make_tmem_copy( + copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)] + ) + + thr_copy_t2r = tiled_copy_t2r.get_slice(tidx) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_M, STAGE) + tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi) + + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_mnl_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, loopM, loopN, loopL) + tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi) + # (T2R, T2R_M, T2R_N) + tTR_rAcc = cute.make_fragment( + tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype + ) + return tiled_copy_t2r, tTR_tAcc, tTR_rAcc + + def epilog_smem_copy_and_partition( + self, + tiled_copy_t2r: cute.TiledCopy, + tTR_rC: cute.Tensor, + tidx: cutlass.Int32, + sC: cute.Tensor, + ) -> tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]: + """ + 
Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination). + + :param tiled_copy_t2r: The tiled copy operation for tmem to register copy(t2r) + :type tiled_copy_t2r: cute.TiledCopy + :param tTR_rC: The partitioned accumulator tensor + :type tTR_rC: cute.Tensor + :param tidx: The thread index in epilogue warp groups + :type tidx: cutlass.Int32 + :param sC: The shared memory tensor to be copied and partitioned + :type sC: cute.Tensor + + :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where: + - tiled_copy_r2s: The tiled copy operation for register to smem copy(r2s) + - tRS_rC: The partitioned tensor C (register source) + - tRS_sC: The partitioned tensor C (smem destination) + :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor] + """ + copy_atom_r2s = sm100_utils.get_smem_store_op( + self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r + ) + tiled_copy_r2s = cute.make_tiled_copy( + copy_atom_r2s, + layout_tv=tiled_copy_t2r.layout_dst_tv_tiled, + tiler_mn=tiled_copy_t2r.tiler_mn, + ) + # (R2S, R2S_M, R2S_N, PIPE_D) + thr_copy_r2s = tiled_copy_r2s.get_slice(tidx) + tRS_sC = thr_copy_r2s.partition_D(sC) + # (R2S, R2S_M, R2S_N) + tRS_rC = tiled_copy_r2s.retile(tTR_rC) + return tiled_copy_r2s, tRS_rC, tRS_sC + + def epilog_gmem_copy_and_partition( + self, + tma_atom_c: cute.CopyAtom, + gC_mnl: cute.Tensor, + epi_tile: cute.Tile, + sC: cute.Tensor, + ) -> tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]: + """Make tiledCopy for global memory store, then use it to partition + shared memory (source) and global memory (destination) for TMA store version. + + :param tma_atom_c: The TMA copy atom configured for storing tensor C. + :type tma_atom_c: cute.CopyAtom + :param gC_mnl: The global memory tensor C. + :type gC_mnl: cute.Tensor + :param epi_tile: The epilogue tiler defining the granularity of the operation. + :type epi_tile: cute.Tile + :param sC: The shared memory epilogue buffer tensor. 
+ :type sC: cute.Tensor + :return: A tuple containing: + - tma_atom_c: The input TMA copy atom (passed through). + - bSG_sC: The source shared memory tensor partitioned for the TMA operation. + - tCgC: The destination global memory tensor partitioned for the TMA operation. + :rtype: tuple[cute.CopyAtom, cute.Tensor, cute.Tensor] + """ + # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, loopM, loopN, loopL) + gC_epi = cute.flat_divide( + gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile + ) + sC_for_tma_partition = cute.group_modes(sC, 0, 2) + gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2) + # ((ATOM_V, REST_V), EPI_M, EPI_N) + # ((ATOM_V, REST_V), EPI_M, EPI_N, loopM, loopN, loopL) + bSG_sC, bSG_gC = cpasync.tma_partition( + tma_atom_c, + 0, + cute.make_layout(1), + sC_for_tma_partition, + gC_for_tma_partition, + ) + return tma_atom_c, bSG_sC, bSG_gC + + @staticmethod + def _compute_stages( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: tuple[int, int, int], + a_dtype: type[cutlass.Numeric], + b_dtype: type[cutlass.Numeric], + epi_tile: cute.Tile, + c_dtype: type[cutlass.Numeric], + c_layout: utils.LayoutEnum, + num_smem_capacity: int, + occupancy: int, + ) -> tuple[int, int, int]: + """Computes the number of stages for accumulator, A/B operands, and epilogue based on heuristics. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mnk: tuple[int, int, int] + :param a_dtype: Data type of operand A. + :type a_dtype: type[cutlass.Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[cutlass.Numeric] + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.Tile + :param c_dtype: Data type of operand C (output). + :type c_dtype: type[cutlass.Numeric] + :param c_layout: Layout enum of operand C in global memory. 
+ :type c_layout: utils.LayoutEnum + :param num_smem_capacity: Total available shared memory capacity in bytes. + :type num_smem_capacity: int + :param occupancy: Target number of CTAs per SM (occupancy). + :type occupancy: int + + :return: A tuple containing the computed number of stages for: + (accumulator stages, A/B operand stages, epilogue stages) + :rtype: tuple[int, int, int] + """ + # Default accumulator and epilogue stages + num_acc_stage = 2 + num_epi_stage = 2 + + # Calculate smem layout and size for one stage of A, B, and Epilogue + a_smem_layout_stage_one = sm100_utils.make_smem_layout_a( + tiled_mma, + mma_tiler_mnk, + a_dtype, + 1, # stage=1 + ) + b_smem_layout_staged_one = sm100_utils.make_smem_layout_b( + tiled_mma, + mma_tiler_mnk, + b_dtype, + 1, # stage=1 + ) + epi_smem_layout_staged_one = sm100_utils.make_smem_layout_epi( + c_dtype, + c_layout, + epi_tile, + 1, # stage=1 + ) + ab_bytes_per_stage = cute.size_in_bytes( + a_dtype, a_smem_layout_stage_one + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one) + + epi_bytes_per_stage = cute.size_in_bytes(c_dtype, epi_smem_layout_staged_one) + epi_bytes = epi_bytes_per_stage * num_epi_stage + + # Calculate A/B stages: + # Start with total smem per CTA (capacity / occupancy) + # Subtract reserved bytes and initial epilogue bytes + # Divide remaining by bytes needed per A/B stage + num_ab_stage = ( + num_smem_capacity // occupancy + - GroupedGemmKernel.reserved_smem_bytes + - epi_bytes + ) // ab_bytes_per_stage + + # Refine epilogue stages: + # Calculate remaining smem after allocating for A/B stages and reserved bytes + # Add remaining unused smem to epilogue + remaining_smem = ( + num_smem_capacity + - occupancy * ab_bytes_per_stage * num_ab_stage + - occupancy * (GroupedGemmKernel.reserved_smem_bytes + epi_bytes) + ) + num_epi_stage += remaining_smem // (occupancy * epi_bytes_per_stage) + return num_acc_stage, num_ab_stage, num_epi_stage + + @staticmethod + def _compute_grid( + 
total_num_clusters: int, + cluster_shape_mn: tuple[int, int], + max_active_clusters: cutlass.Constexpr[int], + ) -> tuple[utils.PersistentTileSchedulerParams, tuple[int, int, int]]: + """Compute tile scheduler parameters and grid shape for grouped GEMM operations. + + :param total_num_clusters: Total number of clusters to process across all groups. + :type total_num_clusters: int + :param cluster_shape_mn: Shape of each cluster in M, N dimensions. + :type cluster_shape_mn: tuple[int, int] + :param max_active_clusters: Maximum number of active clusters. + :type max_active_clusters: cutlass.Constexpr[int] + + :return: A tuple containing: + - tile_sched_params: Parameters for the persistent tile scheduler. + - grid: Grid shape for kernel launch. + :rtype: tuple[utils.PersistentTileSchedulerParams, tuple[int, ...]] + """ + # Create problem shape with M, N dimensions from cluster shape + # and L dimension representing the total number of clusters. + problem_shape_ntile_mnl = ( + cluster_shape_mn[0], + cluster_shape_mn[1], + cutlass.Int32(total_num_clusters), + ) + + tile_sched_params = utils.PersistentTileSchedulerParams( + problem_shape_ntile_mnl, (*cluster_shape_mn, 1) + ) + + grid = utils.StaticPersistentTileScheduler.get_grid_shape( + tile_sched_params, max_active_clusters + ) + + return tile_sched_params, grid + + @staticmethod + def _get_mbar_smem_bytes(**kwargs_stages: int) -> int: + """Calculate shared memory consumption for memory barriers based on provided stages. + + Each stage requires 2 barriers, and each barrier consumes 8 bytes of shared memory. + The total consumption is the sum across all provided stages. This function calculates the total + shared memory needed for these barriers. + + :param kwargs_stages: Variable keyword arguments where each key is a stage name + (e.g., num_acc_stage, num_ab_stage) and each value is the + number of stages of that type. + :type kwargs_stages: int + :return: Total shared memory bytes required for all memory barriers. 
+ :rtype: int + """ + num_barriers_per_stage = 2 + num_bytes_per_barrier = 8 + mbar_smem_consumption = sum( + [ + num_barriers_per_stage * num_bytes_per_barrier * stage + for stage in kwargs_stages.values() + ] + ) + return mbar_smem_consumption + + @staticmethod + def _get_tensormap_smem_bytes( + tensormap_update_mode: utils.TensorMapUpdateMode, + ) -> int: + """Get the SMEM consumption for the tensormap buffer based on the update mode. + + :param tensormap_update_mode: Specifies whether tensormaps are updated in GMEM or SMEM. + :type tensormap_update_mode: utils.TensorMapUpdateMode + :return: The shared memory bytes required for the tensormap buffer. Returns 0 if mode is GMEM. + :rtype: int + :raises ValueError: If an invalid tensormap update mode is provided. + """ + if tensormap_update_mode == utils.TensorMapUpdateMode.GMEM: + return 0 + elif tensormap_update_mode == utils.TensorMapUpdateMode.SMEM: + return ( + GroupedGemmKernel.bytes_per_tensormap * GroupedGemmKernel.num_tensormaps + ) + else: + raise ValueError(f"Invalid tensormap update mode: {tensormap_update_mode}") + + @staticmethod + def _get_tensor_smem_bytes( + a_smem_layout_staged: cute.Layout, + a_dtype: Type[cutlass.Numeric], + b_smem_layout_staged: cute.Layout, + b_dtype: Type[cutlass.Numeric], + epi_smem_layout_staged: cute.Layout, + c_dtype: Type[cutlass.Numeric], + ) -> int: + """Compute the total SMEM consumption for tensor A, B and C.""" + ab_bytes = cute.size_in_bytes( + a_dtype, a_smem_layout_staged + ) + cute.size_in_bytes(b_dtype, b_smem_layout_staged) + + epi_bytes = cute.size_in_bytes(c_dtype, epi_smem_layout_staged) + return ab_bytes + epi_bytes + + @staticmethod + def _get_tma_atom_kind(atom_sm_cnt: int, mcast: bool): + """Select the appropriate TMA copy atom based on the number of SMs and the multicast flag.""" + if atom_sm_cnt == 2 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 2 and not mcast: + return 
cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.TWO) + elif atom_sm_cnt == 1 and mcast: + return cpasync.CopyBulkTensorTileG2SMulticastOp(tcgen05.CtaGroup.ONE) + elif atom_sm_cnt == 1 and not mcast: + return cpasync.CopyBulkTensorTileG2SOp(tcgen05.CtaGroup.ONE) + + raise ValueError(f"Invalid atom_sm_cnt: {atom_sm_cnt} and {mcast}") + + @staticmethod + def _compute_num_tmem_alloc_cols( + tiled_mma: cute.TiledMma, + mma_tiler: tuple[int, int, int], + num_acc_stage: int, + ) -> int: + """ + Compute the number of tensor memory allocation columns. + + :param tiled_mma: The tiled MMA object defining the core computation. + :type tiled_mma: cute.TiledMma + :param mma_tiler: The shape (M, N, K) of the MMA tile. + :type mma_tiler: tuple[int, int, int] + :param acc_stage: The stage of the accumulator tensor. + :type acc_stage: int + + :return: The number of tensor memory allocation columns. + :rtype: int + """ + acc_shape = tiled_mma.partition_shape_C(mma_tiler[:2]) + tCtAcc_fake = tiled_mma.make_fragment_C(cute.append(acc_shape, num_acc_stage)) + num_tmem_alloc_cols = utils.get_num_tmem_alloc_cols(tCtAcc_fake) + + return num_tmem_alloc_cols + + # Size of smem we reserved for mbarrier, tensor memory management and tensormap update + reserved_smem_bytes = 1024 + bytes_per_tensormap = 128 + num_tensormaps = 3 + # size of smem used for tensor memory management + tensor_memory_management_bytes = 12 + + +def run_grouped_gemm( + num_groups: int, + problem_sizes_mnkl: tuple[int, int, int, int], + ab_dtype: Type[cutlass.Numeric], + c_dtype: Type[cutlass.Numeric], + acc_dtype: Type[cutlass.Numeric], + a_major: str, + b_major: str, + c_major: str, + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + tensormap_update_mode: utils.TensorMapUpdateMode, + tolerance: float, + warmup_iterations: int, + iterations: int, + skip_ref_check: bool, +): + """Run grouped GEMM example with specified configurations.""" + print(f"Running Blackwell Grouped 
GEMM test with:") + print(f"{num_groups} groups") + for i, (m, n, k, l) in enumerate(problem_sizes_mnkl): + print(f"Group {i}: {m}x{n}x{k}x{l}") + print(f"AB dtype: {ab_dtype}, C dtype: {c_dtype}, Acc dtype: {acc_dtype}") + print(f"Matrix majors - A: {a_major}, B: {b_major}, C: {c_major}") + print(f"Mma Tiler (M, N): {mma_tiler_mn}, Cluster Shape (M, N): {cluster_shape_mn}") + print(f"2CTA MMA instructions: {'True' if use_2cta_instrs else 'False'}") + print(f"Tensor map update mode: {tensormap_update_mode}") + print(f"Tolerance: {tolerance}") + print(f"Warmup iterations: {warmup_iterations}") + print(f"Iterations: {iterations}") + print(f"Skip reference checking: {skip_ref_check}") + + # Skip unsupported types + if ab_dtype not in { + cutlass.Float16, + cutlass.BFloat16, + }: + raise ValueError(f"Skip unsupported ab_dtype {ab_dtype}") + if c_dtype not in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32}: + raise ValueError(f"Skip unsupported c_dtype {c_dtype}") + # Skip unsupported acc dtype + if acc_dtype not in {cutlass.Float32, cutlass.Float16}: + raise ValueError(f"Skip unsupported acc_dtype {acc_dtype}") + # Skip invalid ab_dtype and acc_dtype combination + if ab_dtype == cutlass.BFloat16 and acc_dtype == cutlass.Float16: + raise ValueError("Skip invalid ab_dtype and acc_dtype combination") + # Skip invalid mma tile shape + if not ( + (not use_2cta_instrs and mma_tiler_mn[0] in [64, 128]) + or (use_2cta_instrs and mma_tiler_mn[0] in [128, 256]) + ): + raise ValueError(f"Skip invalid mma tiler M {mma_tiler_mn[0]}") + if mma_tiler_mn[1] not in range(32, 257, 32): + raise ValueError(f"Skip invalid mma tiler N {mma_tiler_mn[1]}") + # Skip illegal cluster shape + if cluster_shape_mn[0] % (2 if use_2cta_instrs else 1) != 0: + raise ValueError( + f"cluster_shape_m need align with use_2cta_instrs config {cluster_shape_mn}" + ) + # Skip invalid cluster shape + is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0 + if ( + cluster_shape_mn[0] * cluster_shape_mn[1] > 
16 + or cluster_shape_mn[0] <= 0 + or cluster_shape_mn[1] <= 0 + or not is_power_of_2(cluster_shape_mn[0]) + or not is_power_of_2(cluster_shape_mn[1]) + ): + raise ValueError(f"Skip invalid cluster shape {cluster_shape_mn}") + + # Skip illegal problem shape for load/store alignment + def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape): + major_mode_idx = 0 if is_mode0_major else 1 + num_major_elements = tensor_shape[major_mode_idx] + num_contiguous_elements = 16 * 8 // dtype.width + return num_major_elements % num_contiguous_elements == 0 + + if ( + not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l)) + or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l)) + or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l)) + ): + raise ValueError("Skip invalid problem alignment") + if not torch.cuda.is_available(): + raise RuntimeError("GPU is required to run this example!") + + torch.manual_seed(2025) + + # Create tensor and return the pointer, tensor, and stride + def create_tensor_and_stride( + l: int, + mode0: int, + mode1: int, + is_mode0_major: bool, + dtype: type[cutlass.Numeric], + is_dynamic_layout: bool = True, + ) -> tuple[int, torch.Tensor, cute.Tensor, torch.Tensor, tuple[int, int]]: + # is_mode0_major: (l, mode1, mode0) -> (mode0, mode1, l) + # else: (l, mode0, mode1) -> (mode0, mode1, l) + shape = (l, mode1, mode0) if is_mode0_major else (l, mode0, mode1) + permute_order = (2, 1, 0) if is_mode0_major else (1, 2, 0) + # omit stride for L mode as it is always 1 for grouped GEMM + strides = (1, mode0) if is_mode0_major else (mode1, 1) + assert dtype in {cutlass.Float16, cutlass.BFloat16, cutlass.Float32} + is_unsigned = False + + torch_dtype = cutlass_torch.dtype(dtype) + torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor( + shape, + torch_dtype, + permute_order=permute_order, + init_type=cutlass_torch.TensorInitType.RANDOM, + init_config=cutlass_torch.RandomInitConfig( + 
min_val=0 if is_unsigned else -2, max_val=4 if is_unsigned else 2 + ), + ) + torch_tensor = torch_tensor_cpu.cuda() + f32_torch_tensor = torch_tensor_cpu.to(dtype=torch.float32) + + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + if is_dynamic_layout: + cute_tensor = cute_tensor.mark_layout_dynamic( + leading_dim=(0 if is_mode0_major else 1) + ) + cute_tensor = cutlass_torch.convert_cute_tensor( + f32_torch_tensor, + cute_tensor, + dtype, + is_dynamic_layout=is_dynamic_layout, + ) + # Get pointer of the tensor + ptr = torch_tensor.data_ptr() + return ptr, torch_tensor, cute_tensor, f32_torch_tensor, strides + + # iterate all groups and create tensors for each group + torch_fp32_tensors_abc = [] + torch_tensors_abc = [] + cute_tensors_abc = [] + strides_abc = [] + ptrs_abc = [] + for _, (m, n, k, l) in enumerate(problem_sizes_mnkl): + ptr_a, torch_tensor_a, cute_tensor_a, tensor_fp32_a, stride_mk_a = ( + create_tensor_and_stride(l, m, k, a_major == "m", ab_dtype) + ) + ptr_b, torch_tensor_b, cute_tensor_b, tensor_fp32_b, stride_nk_b = ( + create_tensor_and_stride(l, n, k, b_major == "n", ab_dtype) + ) + ptr_c, torch_tensor_c, cute_tensor_c, tensor_fp32_c, stride_mn_c = ( + create_tensor_and_stride(l, m, n, c_major == "m", c_dtype) + ) + ptrs_abc.append([ptr_a, ptr_b, ptr_c]) + torch_tensors_abc.append([torch_tensor_a, torch_tensor_b, torch_tensor_c]) + torch_fp32_tensors_abc.append([tensor_fp32_a, tensor_fp32_b, tensor_fp32_c]) + strides_abc.append([stride_mk_a, stride_nk_b, stride_mn_c]) + cute_tensors_abc.append( + ( + cute_tensor_a, + cute_tensor_b, + cute_tensor_c, + ) + ) + # Choose A, B, C with the smallest size to create initial tensormaps + key_size_a = lambda item: item[1][0] * item[1][2] + key_size_b = lambda item: item[1][1] * item[1][2] + key_size_c = lambda item: item[1][0] * item[1][1] + # Find the indices of the groups with the smallest tensor sizes + min_a_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_a) + min_b_idx, _ = 
min(enumerate(problem_sizes_mnkl), key=key_size_b) + min_c_idx, _ = min(enumerate(problem_sizes_mnkl), key=key_size_c) + initial_cute_tensors_abc = [ + cute_tensors_abc[min_a_idx][0], # A with smallest (m, k) + cute_tensors_abc[min_b_idx][1], # B with smallest (n, k) + cute_tensors_abc[min_c_idx][2], # C with smallest (m, n) + ] + + hardware_info = utils.HardwareInfo() + sm_count = hardware_info.get_max_active_clusters(1) + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + # Prepare tensormap buffer for each SM + num_tensormap_buffers = sm_count + tensormap_pytorch_tensor = ( + torch.empty( + ( + num_tensormap_buffers, + GroupedGemmKernel.num_tensormaps, + GroupedGemmKernel.bytes_per_tensormap // 8, + ), + dtype=torch.int64, + ) + .fill_(0) + .cuda() + ) + tensormap_cute_tensor = from_dlpack(tensormap_pytorch_tensor, assumed_align=16) + + grouped_gemm = GroupedGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + tensormap_update_mode, + ) + + # Convert integer list to torch tensor and cute tensor + def convert_list_to_tensor(l, dtype) -> tuple[torch.Tensor, cute.Tensor]: + torch_tensor = torch.tensor(l, dtype=dtype).cuda() + cute_tensor = from_dlpack(torch_tensor, assumed_align=16) + return torch_tensor, cute_tensor + + # layout (num_groups, 4):(4, 1) + problem_sizes_mnkl_torch_tensor, problem_sizes_mnkl_cute_tensor = ( + convert_list_to_tensor(problem_sizes_mnkl, torch.int32) + ) + # layout (num_groups, 3, 2):(6, 2, 1) + strides_abc_torch_tensor, strides_abc_cute_tensor = convert_list_to_tensor( + strides_abc, torch.int32 + ) + # layout (num_groups,3):(3, 1) + ptrs_abc_torch_tensor, ptrs_abc_cute_tensor = convert_list_to_tensor( + ptrs_abc, torch.int64 + ) + + # Compute total number of cluster tiles we need to compute for given grouped GEMM problem + def compute_total_num_clusters( + problem_sizes_mnkl: List[tuple[int, int, int, int]], + cluster_tile_shape_mn: tuple[int, 
int], + ) -> int: + total_num_clusters = 0 + for m, n, _, _ in problem_sizes_mnkl: + num_clusters_mn = tuple( + (x + y - 1) // y for x, y in zip((m, n), cluster_tile_shape_mn) + ) + total_num_clusters += functools.reduce(lambda x, y: x * y, num_clusters_mn) + return total_num_clusters + + # Compute cluster tile shape + def compute_cluster_tile_shape( + mma_tiler_mn: tuple[int, int], + cluster_shape_mn: tuple[int, int], + use_2cta_instrs: bool, + ) -> tuple[int, int]: + cta_tile_shape_mn = list(mma_tiler_mn) + if use_2cta_instrs: + cta_tile_shape_mn[0] = cta_tile_shape_mn[0] // 2 + return tuple(x * y for x, y in zip(cta_tile_shape_mn, cluster_shape_mn)) + + cluster_tile_shape_mn = compute_cluster_tile_shape( + mma_tiler_mn, cluster_shape_mn, use_2cta_instrs + ) + total_num_clusters = compute_total_num_clusters( + problem_sizes_mnkl, cluster_tile_shape_mn + ) + + # Get current CUDA stream from PyTorch + torch_stream = torch.cuda.current_stream() + # Get the raw stream pointer as a CUstream + current_stream = cuda.CUstream(torch_stream.cuda_stream) + # Compile grouped GEMM kernel + compiled_grouped_gemm = cute.compile( + grouped_gemm, + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + num_groups, + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + total_num_clusters, + tensormap_cute_tensor, + max_active_clusters, + current_stream, + ) + + # Launch GPU kernel + # Warm up + for _ in range(warmup_iterations): + compiled_grouped_gemm( + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + # Execution + for i in range(iterations): + compiled_grouped_gemm( + initial_cute_tensors_abc[0], + initial_cute_tensors_abc[1], + initial_cute_tensors_abc[2], + problem_sizes_mnkl_cute_tensor, + strides_abc_cute_tensor, + 
ptrs_abc_cute_tensor, + tensormap_cute_tensor, + current_stream, + ) + + # Compute reference result + if not skip_ref_check: + refs = [] + for a, b, _ in torch_fp32_tensors_abc: + ref = (torch.einsum("mkl,nkl->mnl", a, b)).cpu() + refs.append(ref) + for i, ((_, _, c), ref) in enumerate(zip(torch_tensors_abc, refs)): + print(f"checking group {i}") + if c_dtype == cutlass.Float32: + ref_c = ref + else: + ref_c = ref.to(cutlass_torch.dtype(c_dtype)) + torch.testing.assert_close( + c.cpu(), + ref_c, + atol=tolerance, + rtol=1e-05, + ) + + +if __name__ == "__main__": + + def parse_comma_separated_ints(s: str) -> tuple[int, ...]: + try: + return tuple(int(x.strip()) for x in s.split(",")) + except ValueError: + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers." + ) + + def parse_comma_separated_tuples(s: str) -> List[tuple[int, ...]]: + if s.strip().startswith("("): + # Split on ),( to separate tuples + tuples = s.strip("()").split("),(") + result = [] + tuple_len = None + + for t in tuples: + # Parse individual tuple + nums = [int(x.strip()) for x in t.split(",")] + + # Validate tuple length consistency + if tuple_len is None: + tuple_len = len(nums) + elif len(nums) != tuple_len: + raise argparse.ArgumentTypeError( + "All tuples must have the same length" + ) + + result.append(tuple(nums)) + return result + + raise argparse.ArgumentTypeError( + "Invalid format. Expected comma-separated integers or list of tuples" + ) + + parser = argparse.ArgumentParser( + description="Example of Grouped GEMM on Blackwell." 
+ ) + parser.add_argument( + "--num_groups", + type=int, + default=2, + help="Number of groups", + ) + parser.add_argument( + "--problem_sizes_mnkl", + type=parse_comma_separated_tuples, + default=((128, 128, 128, 1), (128, 128, 128, 1)), + help="a tuple of problem sizes for each group (comma-separated tuples)", + ) + parser.add_argument( + "--mma_tiler_mn", + type=parse_comma_separated_ints, + default=(128, 128), + help="Mma tile shape (comma-separated)", + ) + parser.add_argument( + "--cluster_shape_mn", + type=parse_comma_separated_ints, + default=(1, 1), + help="Cluster shape (comma-separated)", + ) + parser.add_argument( + "--tensormap_update_mode", + type=str, + default="SMEM", + help="Tensor map update mode", + ) + parser.add_argument("--ab_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--c_dtype", type=cutlass.dtype, default=cutlass.Float16) + parser.add_argument("--acc_dtype", type=cutlass.dtype, default=cutlass.Float32) + parser.add_argument( + "--use_2cta_instrs", + action="store_true", + help="Enable 2CTA MMA instructions feature", + ) + parser.add_argument("--a_major", choices=["k", "m"], type=str, default="k") + parser.add_argument("--b_major", choices=["k", "n"], type=str, default="k") + parser.add_argument("--c_major", choices=["n", "m"], type=str, default="n") + parser.add_argument( + "--tolerance", type=float, default=1e-01, help="Tolerance for validation" + ) + parser.add_argument( + "--warmup_iterations", type=int, default=0, help="Warmup iterations" + ) + parser.add_argument( + "--iterations", + type=int, + default=1, + help="Number of iterations to run the kernel", + ) + parser.add_argument( + "--skip_ref_check", action="store_true", help="Skip reference checking" + ) + + args = parser.parse_args() + + if ( + len(args.problem_sizes_mnkl) != 0 + and len(args.problem_sizes_mnkl) != args.num_groups + ): + parser.error("--problem_sizes_mnkl must contain exactly num_groups tuples") + + # l mode must be 1 for all groups 
+ for _, _, _, l in args.problem_sizes_mnkl: + if l != 1: + parser.error("l must be 1 for all groups") + + if len(args.mma_tiler_mn) != 2: + parser.error("--mma_tiler_mn must contain exactly 2 values") + + if len(args.cluster_shape_mn) != 2: + parser.error("--cluster_shape_mn must contain exactly 2 values") + + if args.tensormap_update_mode not in ["GMEM", "SMEM"]: + parser.error("--tensormap_update_mode must be GMEM or SMEM") + + if args.tensormap_update_mode == "GMEM": + tensormap_update_mode = utils.TensorMapUpdateMode.GMEM + else: + tensormap_update_mode = utils.TensorMapUpdateMode.SMEM + + run_grouped_gemm( + args.num_groups, + args.problem_sizes_mnkl, + args.ab_dtype, + args.c_dtype, + args.acc_dtype, + args.a_major, + args.b_major, + args.c_major, + args.mma_tiler_mn, + args.cluster_shape_mn, + args.use_2cta_instrs, + tensormap_update_mode, + args.tolerance, + args.warmup_iterations, + args.iterations, + args.skip_ref_check, + ) + print("PASS") diff --git a/examples/python/CuTeDSL/notebooks/README.md b/examples/python/CuTeDSL/notebooks/README.md new file mode 100644 index 00000000..402c1cfc --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/README.md @@ -0,0 +1,31 @@ +# Copyright + +Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +SPDX-License-Identifier: BSD-3-Clause + +``` + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. 
Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +``` diff --git a/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb b/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb new file mode 100644 index 00000000..dc7c17cf --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/cuda_graphs.ipynb @@ -0,0 +1,648 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0e95f0df-4d1a-4e2e-92ff-90539bb4c517", + "metadata": {}, + "source": [ + "# Example 06: CUDA Graphs\n", + "\n", + "In this example we demonstrate how to use CUDA graphs through PyTorch with CuTe DSL.\n", + "The process of interacting with PyTorch's CUDA graph implementation requires exposing PyTorch's CUDA streams to CUTLASS.\n", + "\n", + "To use CUDA graphs with Blackwell requires a version of PyTorch that supports Blackwell.\n", + "This can be obtained through:\n", + "- The [PyTorch NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch)\n", + "- [PyTorch 2.7 with CUDA 12.8 or later](https://pytorch.org/) (e.g., `pip3 install torch torchvision torchaudio --index-url 
https://download.pytorch.org/whl/cu128`)\n", + "- Building PyTorch directly with your version of CUDA." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "46b8fb6f-9ac5-4a3d-b765-b6476f182bf7", + "metadata": {}, + "outputs": [], + "source": [ + "# import torch for CUDA graphs\n", + "import torch\n", + "import cutlass\n", + "import cutlass.cute as cute\n", + "# import CUstream type from the cuda driver bindings\n", + "from cuda.bindings.driver import CUstream\n", + "# import the current_stream function from torch\n", + "from torch.cuda import current_stream" + ] + }, + { + "cell_type": "markdown", + "id": "bcf5e06e-1f5b-4d72-ad73-9b36efb78ca0", + "metadata": {}, + "source": [ + "## Kernel Creation\n", + "\n", + "We create a kernel which prints \"Hello world\" as well as a host function to launch the kernel.\n", + "We then compile the kernel for use in our graph, by passing in a default stream.\n", + "\n", + "Kernel compilation before graph capture is required since CUDA graphs cannot JIT compile kernels during graph execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0c2a6ca8-98d7-4837-b91f-af769ca8fcd8", + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def hello_world_kernel():\n", + " \"\"\"\n", + " A kernel that prints hello world\n", + " \"\"\"\n", + " cute.printf(\"Hello world\")\n", + "\n", + "@cute.jit\n", + "def hello_world(stream : CUstream):\n", + " \"\"\"\n", + " Host function that launches our (1,1,1), (1,1,1) grid in stream\n", + " \"\"\"\n", + " hello_world_kernel().launch(grid=[1, 1, 1], block=[1, 1, 1], stream=stream)\n", + "\n", + "# Grab a stream from PyTorch, this will also initialize our context\n", + "# so we can omit cutlass.cuda.initialize_cuda_context()\n", + "stream = current_stream()\n", + "hello_world_compiled = cute.compile(hello_world, CUstream(stream.cuda_stream))" + ] + }, + { + "cell_type": "markdown", + "id": "ecc850af-09f8-4a29-9c93-ff31fbb9326f", + "metadata": {}, + "source": [ + "## Creating and replaying a CUDA Graph\n", + "\n", + "We create a stream through torch as well as a graph.\n", + "When we create the graph we can pass the stream we want to capture to torch. We similarly run the compiled kernel with the stream passed as a CUstream.\n", + "\n", + "Finally we can replay our graph and synchronize." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f673e5ae-42bb-44d0-b652-3280606181c4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello world\n", + "Hello world\n" + ] + } + ], + "source": [ + "# Create a CUDA Graph\n", + "g = torch.cuda.CUDAGraph()\n", + "# Capture our graph\n", + "with torch.cuda.graph(g):\n", + " # Turn our torch Stream into a cuStream stream.\n", + " # This is done by getting the underlying CUstream with .cuda_stream\n", + " graph_stream = CUstream(current_stream().cuda_stream)\n", + " # Run 2 iterations of our compiled kernel\n", + " for _ in range(2):\n", + " # Run our kernel in the stream\n", + " hello_world_compiled(graph_stream)\n", + "\n", + "# Replay our graph\n", + "g.replay()\n", + "# Synchronize all streams (equivalent to cudaDeviceSynchronize() in C++)\n", + "torch.cuda.synchronize()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "db76d9c3-7617-4bf2-b326-11982e6803bf", + "metadata": {}, + "source": [ + "Our run results in the following execution when viewed in NSight Systems:\n", + "\n", + "![Image of two hello world kernels run back to back in a CUDA graph](images/cuda_graphs_image.png)\n", + "\n", + "We can observe the launch of the two kernels followed by a `cudaDeviceSynchronize()`.\n", + "\n", + "Now we can confirm that this minimizes some launch overhead:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "3ebe15bf-dc97-42e9-913c-224ecfb472e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", 
+ "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + 
"Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n", + "Hello world\n" + ] + } + ], + "source": [ + "# Get our CUDA stream from PyTorch\n", + "stream = CUstream(current_stream().cuda_stream)\n", + "\n", + "# Create a larger CUDA Graph of 100 iterations\n", + "g = torch.cuda.CUDAGraph()\n", + "# Capture our graph\n", + "with torch.cuda.graph(g):\n", + " # Turn our torch Stream into a cuStream stream.\n", + " # This is done by getting the underlying CUstream with .cuda_stream\n", + " graph_stream = CUstream(current_stream().cuda_stream)\n", + " # Run 2 iterations of our compiled kernel\n", + " for _ in range(100):\n", + " # Run our kernel in the stream\n", + " hello_world_compiled(graph_stream)\n", + "\n", + "# Create CUDA events for measuring performance\n", + "start = torch.cuda.Event(enable_timing=True)\n", + "end = 
torch.cuda.Event(enable_timing=True)\n", + "\n", + "# Run our kernel to warm up the GPU\n", + "for _ in range(100):\n", + " hello_world_compiled(stream)\n", + "\n", + "# Record our start time\n", + "start.record()\n", + "# Run 100 kernels\n", + "for _ in range(100):\n", + " hello_world_compiled(stream)\n", + "# Record our end time\n", + "end.record()\n", + "# Synchronize (cudaDeviceSynchronize())\n", + "torch.cuda.synchronize()\n", + "\n", + "# Calculate the time spent when launching kernels in a stream\n", + "# Results are in ms\n", + "stream_time = start.elapsed_time(end) \n", + "\n", + "# Warmup our GPU again\n", + "g.replay()\n", + "# Record our start time\n", + "start.record()\n", + "# Run our graph\n", + "g.replay()\n", + "# Record our end time\n", + "end.record()\n", + "# Synchronize (cudaDeviceSynchronize())\n", + "torch.cuda.synchronize()\n", + "\n", + "# Calculate the time spent when launching kernels in a graph\n", + "# units are ms\n", + "graph_time = start.elapsed_time(end)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "12b8151a-46b3-4c99-9945-301f6b628131", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.94% speedup when using CUDA graphs for this kernel!\n" + ] + } + ], + "source": [ + "# Print out speedup when using CUDA graphs\n", + "percent_speedup = (stream_time - graph_time) / graph_time\n", + "print(f\"{percent_speedup * 100.0:.2f}% speedup when using CUDA graphs for this kernel!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git 
a/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb b/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb new file mode 100644 index 00000000..035776aa --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/cute_layout_algebra.ipynb @@ -0,0 +1,1001 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using CuTe Layout Algebra With Python DSL\n", + "\n", + "Referencing the [01_layout.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/01_layout.md) and [02_layout_algebra.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/02_layout_algebra.md) documentation from CuTe C++, we summarize:\n", + "\n", + "In CuTe, a `Layout`:\n", + "- is defined by a pair of `Shape` and `Stride`,\n", + "- maps coordinates space(s) to an index space,\n", + "- supports both static (compile-time) and dynamic (runtime) values.\n", + "\n", + "CuTe also provides a powerful set of operations—the *Layout Algebra*—for combining and manipulating layouts, including:\n", + "- Layout composition: Functional composition of layouts,\n", + "- Layout \"divide\": Splitting a layout into two component layouts,\n", + "- Layout \"product\": Reproducing a layout according to another layout.\n", + "\n", + "In this notebook, we will demonstrate:\n", + "1. How to use CuTe’s key layout algebra operations with the Python DSL.\n", + "2. How static and dynamic layouts behave when printed or manipulated within the Python DSL.\n", + "\n", + "We use examples from [02_layout_algebra.md](https://github.com/NVIDIA/cutlass/blob/main/media/docs/cpp/cute/02_layout_algebra.md) which we recommend to the reader for additional details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Layout Algebra Operations\n", + "\n", + "These operations form the foundation of CuTe's layout manipulation capabilities, enabling:\n", + "- Efficient data tiling and partitioning,\n", + "- Separation of thread and data layouts with a canonical type to represent both,\n", + "- Native description and manipulation of hierarchical tensors of threads and data crucial for tensor core programs,\n", + "- Mixed static/dynamic layout transformations,\n", + "- Seamless integration of layout algebra with tensor operations,\n", + "- Expression of complex MMA and copies as canonical loops." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Coalesce\n", + "\n", + "The `coalesce` operation simplifies a layout by flattening and combining modes when possible, without changing its size or behavior as a function on the integers.\n", + "\n", + "It ensures the post-conditions:\n", + "- Preserve size: cute.size(layout) == cute.size(result),\n", + "- Flattened: depth(result) <= 1,\n", + "- Preserve functional: For all i, 0 <= i < cute.size(layout), layout(i) == result(i).\n", + "\n", + "#### Examples\n", + "\n", + "- Basic Coalesce Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: (2,(1,6)):(1,(?,2))\n", + ">>> Coalesced: 12:1\n", + ">?? Original: (2,(1,6)):(1,(6,2))\n", + ">?? 
Coalesced: 12:1\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def coalesce_example():\n", + " \"\"\"\n", + " Demonstrates coalesce operation flattening and combining modes\n", + " \"\"\"\n", + " layout = cute.make_layout((2, (1, 6)), stride=(1, (cutlass.Int32(6), 2))) # Dynamic stride\n", + " result = cute.coalesce(layout)\n", + "\n", + " print(\">>> Original:\", layout)\n", + " cute.printf(\">?? Original: {}\", layout)\n", + " print(\">>> Coalesced:\", result)\n", + " cute.printf(\">?? Coalesced: {}\", result)\n", + "\n", + "coalesce_example()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: ((2,(3,4)),(3,2),1):((4,(8,24)),(2,6),12)\n", + ">>> Coalesced: (24,6):(4,2)\n", + ">>> Checking post-conditions:\n", + ">>> 1. Checking size remains the same after the coalesce operation:\n", + "Original size: 144, Coalesced size: 144\n", + ">>> 2. Checking depth of coalesced layout <= 1:\n", + "Depth of coalesced layout: 1\n", + ">>> 3. 
Checking layout functionality remains the same after the coalesce operation:\n", + "Index 0: original 0, coalesced 0\n", + "Index 1: original 4, coalesced 4\n", + "Index 2: original 8, coalesced 8\n", + "Index 3: original 12, coalesced 12\n", + "Index 4: original 16, coalesced 16\n", + "Index 5: original 20, coalesced 20\n", + "Index 6: original 24, coalesced 24\n", + "Index 7: original 28, coalesced 28\n", + "Index 8: original 32, coalesced 32\n", + "Index 9: original 36, coalesced 36\n", + "Index 10: original 40, coalesced 40\n", + "Index 11: original 44, coalesced 44\n", + "Index 12: original 48, coalesced 48\n", + "Index 13: original 52, coalesced 52\n", + "Index 14: original 56, coalesced 56\n", + "Index 15: original 60, coalesced 60\n", + "Index 16: original 64, coalesced 64\n", + "Index 17: original 68, coalesced 68\n", + "Index 18: original 72, coalesced 72\n", + "Index 19: original 76, coalesced 76\n", + "Index 20: original 80, coalesced 80\n", + "Index 21: original 84, coalesced 84\n", + "Index 22: original 88, coalesced 88\n", + "Index 23: original 92, coalesced 92\n", + "Index 24: original 2, coalesced 2\n", + "Index 25: original 6, coalesced 6\n", + "Index 26: original 10, coalesced 10\n", + "Index 27: original 14, coalesced 14\n", + "Index 28: original 18, coalesced 18\n", + "Index 29: original 22, coalesced 22\n", + "Index 30: original 26, coalesced 26\n", + "Index 31: original 30, coalesced 30\n", + "Index 32: original 34, coalesced 34\n", + "Index 33: original 38, coalesced 38\n", + "Index 34: original 42, coalesced 42\n", + "Index 35: original 46, coalesced 46\n", + "Index 36: original 50, coalesced 50\n", + "Index 37: original 54, coalesced 54\n", + "Index 38: original 58, coalesced 58\n", + "Index 39: original 62, coalesced 62\n", + "Index 40: original 66, coalesced 66\n", + "Index 41: original 70, coalesced 70\n", + "Index 42: original 74, coalesced 74\n", + "Index 43: original 78, coalesced 78\n", + "Index 44: original 82, coalesced 82\n", + 
"Index 45: original 86, coalesced 86\n", + "Index 46: original 90, coalesced 90\n", + "Index 47: original 94, coalesced 94\n", + "Index 48: original 4, coalesced 4\n", + "Index 49: original 8, coalesced 8\n", + "Index 50: original 12, coalesced 12\n", + "Index 51: original 16, coalesced 16\n", + "Index 52: original 20, coalesced 20\n", + "Index 53: original 24, coalesced 24\n", + "Index 54: original 28, coalesced 28\n", + "Index 55: original 32, coalesced 32\n", + "Index 56: original 36, coalesced 36\n", + "Index 57: original 40, coalesced 40\n", + "Index 58: original 44, coalesced 44\n", + "Index 59: original 48, coalesced 48\n", + "Index 60: original 52, coalesced 52\n", + "Index 61: original 56, coalesced 56\n", + "Index 62: original 60, coalesced 60\n", + "Index 63: original 64, coalesced 64\n", + "Index 64: original 68, coalesced 68\n", + "Index 65: original 72, coalesced 72\n", + "Index 66: original 76, coalesced 76\n", + "Index 67: original 80, coalesced 80\n", + "Index 68: original 84, coalesced 84\n", + "Index 69: original 88, coalesced 88\n", + "Index 70: original 92, coalesced 92\n", + "Index 71: original 96, coalesced 96\n", + "Index 72: original 6, coalesced 6\n", + "Index 73: original 10, coalesced 10\n", + "Index 74: original 14, coalesced 14\n", + "Index 75: original 18, coalesced 18\n", + "Index 76: original 22, coalesced 22\n", + "Index 77: original 26, coalesced 26\n", + "Index 78: original 30, coalesced 30\n", + "Index 79: original 34, coalesced 34\n", + "Index 80: original 38, coalesced 38\n", + "Index 81: original 42, coalesced 42\n", + "Index 82: original 46, coalesced 46\n", + "Index 83: original 50, coalesced 50\n", + "Index 84: original 54, coalesced 54\n", + "Index 85: original 58, coalesced 58\n", + "Index 86: original 62, coalesced 62\n", + "Index 87: original 66, coalesced 66\n", + "Index 88: original 70, coalesced 70\n", + "Index 89: original 74, coalesced 74\n", + "Index 90: original 78, coalesced 78\n", + "Index 91: original 82, 
coalesced 82\n", + "Index 92: original 86, coalesced 86\n", + "Index 93: original 90, coalesced 90\n", + "Index 94: original 94, coalesced 94\n", + "Index 95: original 98, coalesced 98\n", + "Index 96: original 8, coalesced 8\n", + "Index 97: original 12, coalesced 12\n", + "Index 98: original 16, coalesced 16\n", + "Index 99: original 20, coalesced 20\n", + "Index 100: original 24, coalesced 24\n", + "Index 101: original 28, coalesced 28\n", + "Index 102: original 32, coalesced 32\n", + "Index 103: original 36, coalesced 36\n", + "Index 104: original 40, coalesced 40\n", + "Index 105: original 44, coalesced 44\n", + "Index 106: original 48, coalesced 48\n", + "Index 107: original 52, coalesced 52\n", + "Index 108: original 56, coalesced 56\n", + "Index 109: original 60, coalesced 60\n", + "Index 110: original 64, coalesced 64\n", + "Index 111: original 68, coalesced 68\n", + "Index 112: original 72, coalesced 72\n", + "Index 113: original 76, coalesced 76\n", + "Index 114: original 80, coalesced 80\n", + "Index 115: original 84, coalesced 84\n", + "Index 116: original 88, coalesced 88\n", + "Index 117: original 92, coalesced 92\n", + "Index 118: original 96, coalesced 96\n", + "Index 119: original 100, coalesced 100\n", + "Index 120: original 10, coalesced 10\n", + "Index 121: original 14, coalesced 14\n", + "Index 122: original 18, coalesced 18\n", + "Index 123: original 22, coalesced 22\n", + "Index 124: original 26, coalesced 26\n", + "Index 125: original 30, coalesced 30\n", + "Index 126: original 34, coalesced 34\n", + "Index 127: original 38, coalesced 38\n", + "Index 128: original 42, coalesced 42\n", + "Index 129: original 46, coalesced 46\n", + "Index 130: original 50, coalesced 50\n", + "Index 131: original 54, coalesced 54\n", + "Index 132: original 58, coalesced 58\n", + "Index 133: original 62, coalesced 62\n", + "Index 134: original 66, coalesced 66\n", + "Index 135: original 70, coalesced 70\n", + "Index 136: original 74, coalesced 74\n", + "Index 
137: original 78, coalesced 78\n", + "Index 138: original 82, coalesced 82\n", + "Index 139: original 86, coalesced 86\n", + "Index 140: original 90, coalesced 90\n", + "Index 141: original 94, coalesced 94\n", + "Index 142: original 98, coalesced 98\n", + "Index 143: original 102, coalesced 102\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def coalesce_post_conditions():\n", + " \"\"\"\n", + " Demonstrates coalesce operation's 3 post-conditions:\n", + " 1. size(@a result) == size(@a layout)\n", + " 2. depth(@a result) <= 1\n", + " 3. for all i, 0 <= i < size(@a layout), @a result(i) == @a layout(i)\n", + " \"\"\"\n", + " layout = cute.make_layout(\n", + " ((2, (3, 4)), (3, 2), 1),\n", + " stride=((4, (8, 24)), (2, 6), 12)\n", + " )\n", + " result = cute.coalesce(layout)\n", + "\n", + " print(\">>> Original:\", layout)\n", + " print(\">>> Coalesced:\", result)\n", + "\n", + " print(\">>> Checking post-conditions:\")\n", + " print(\">>> 1. Checking size remains the same after the coalesce operation:\")\n", + " original_size = cute.size(layout)\n", + " coalesced_size = cute.size(result)\n", + " print(f\"Original size: {original_size}, Coalesced size: {coalesced_size}\")\n", + " assert coalesced_size == original_size, \\\n", + " f\"Size mismatch: original {original_size}, coalesced {coalesced_size}\"\n", + " \n", + " print(\">>> 2. Checking depth of coalesced layout <= 1:\")\n", + " depth = cute.depth(result)\n", + " print(f\"Depth of coalesced layout: {depth}\")\n", + " assert depth <= 1, f\"Depth of coalesced layout should be <= 1, got {depth}\"\n", + "\n", + " print(\">>> 3. 
Checking layout functionality remains the same after the coalesce operation:\")\n", + " for i in range(original_size):\n", + " original_value = layout(i)\n", + " coalesced_value = result(i)\n", + " print(f\"Index {i}: original {original_value}, coalesced {coalesced_value}\")\n", + " assert coalesced_value == original_value, \\\n", + " f\"Value mismatch at index {i}: original {original_value}, coalesced {coalesced_value}\"\n", + "\n", + "coalesce_post_conditions()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- By-mode Coalesce Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Original: (2,(1,6)):(1,(6,2))\n", + ">>> Coalesced Result: (2,6):(1,2)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bymode_coalesce_example():\n", + " \"\"\"\n", + " Demonstrates by-mode coalescing\n", + " \"\"\"\n", + " layout = cute.make_layout((2, (1, 6)), stride=(1, (6, 2)))\n", + "\n", + " # Coalesce with mode-wise profile (1,1) = coalesce both modes\n", + " result = cute.coalesce(layout, target_profile=(1, 1))\n", + " \n", + " # Print results\n", + " print(\">>> Original: \", layout)\n", + " print(\">>> Coalesced Result: \", result)\n", + "\n", + "bymode_coalesce_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. 
Composition\n", + "\n", + "`Composition` of Layout `A` with Layout `B` creates a new layout `R = A ◦ B` where:\n", + "- The shape of `B` is compatible with the shape of `R` so that all coordinates of `B` can also be used as coordinates of `R`,\n", + "- `R(c) = A(B(c))` for all coordinates `c` in `B`'s domain.\n", + "\n", + "Layout composition is very useful for reshaping and reordering layouts.\n", + "\n", + "#### Examples\n", + "\n", + "- Basic Composition Example :" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout A: (6,2):(?,2)\n", + ">>> Layout B: (4,3):(3,1)\n", + ">>> Composition R = A ◦ B: ((2,2),3):((?{div=3},2),?)\n", + ">?? Layout A: (6,2):(8,2)\n", + ">?? Layout B: (4,3):(3,1)\n", + ">?? Composition R: ((2,2),3):((24,2),8)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def composition_example():\n", + " \"\"\"\n", + " Demonstrates basic layout composition R = A ◦ B\n", + " \"\"\"\n", + " A = cute.make_layout((6, 2), stride=(cutlass.Int32(8), 2)) # Dynamic stride\n", + " B = cute.make_layout((4, 3), stride=(3, 1))\n", + " R = cute.composition(A, B)\n", + "\n", + " # Print static and dynamic information\n", + " print(\">>> Layout A:\", A)\n", + " cute.printf(\">?? Layout A: {}\", A)\n", + " print(\">>> Layout B:\", B) \n", + " cute.printf(\">?? Layout B: {}\", B)\n", + " print(\">>> Composition R = A ◦ B:\", R)\n", + " cute.printf(\">?? Composition R: {}\", R)\n", + "\n", + "composition_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Comparing Composition with static and dynamic layouts :\n", + "\n", + "In this case, the results may look different but are mathematically the same. The 1s in the shape don't affect the layout as a mathematical function on the integers. 
In the dynamic case, CuTe can not coalesce the dynamic size-1 modes to \"simplify\" the layout because it is not valid to do so for all possible dynamic values that parameter could realize at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Static composition:\n", + ">>> A_static: (10,2):(16,4)\n", + ">>> B_static: (5,4):(1,5)\n", + ">>> R_static: (5,(2,2)):(16,(80,4))\n", + ">?? Dynamic composition:\n", + ">?? A_dynamic: (10,2):(16,4)\n", + ">?? B_dynamic: (5,4):(1,5)\n", + ">?? R_dynamic: ((5,1),(2,2)):((16,4),(80,4))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def composition_static_vs_dynamic_layout():\n", + " \"\"\"\n", + " Shows difference between static and dynamic composition results\n", + " \"\"\"\n", + " # Static version - using compile-time values\n", + " A_static = cute.make_layout(\n", + " (10, 2), \n", + " stride=(16, 4)\n", + " )\n", + " B_static = cute.make_layout(\n", + " (5, 4), \n", + " stride=(1, 5)\n", + " )\n", + " R_static = cute.composition(A_static, B_static)\n", + "\n", + " # Static print shows compile-time info\n", + " print(\">>> Static composition:\")\n", + " print(\">>> A_static: \", A_static)\n", + " print(\">>> B_static: \", B_static)\n", + " print(\">>> R_static: \", R_static)\n", + "\n", + " # Dynamic version - using runtime Int32 values\n", + " A_dynamic = cute.make_layout(\n", + " (cutlass.Int32(10), cutlass.Int32(2)),\n", + " stride=(cutlass.Int32(16), cutlass.Int32(4))\n", + " )\n", + " B_dynamic = cute.make_layout(\n", + " (cutlass.Int32(5), cutlass.Int32(4)),\n", + " stride=(cutlass.Int32(1), cutlass.Int32(5))\n", + " )\n", + " R_dynamic = cute.composition(A_dynamic, B_dynamic)\n", + " \n", + " # Dynamic printf shows runtime values\n", + " cute.printf(\">?? Dynamic composition:\")\n", + " cute.printf(\">?? A_dynamic: {}\", A_dynamic)\n", + " cute.printf(\">?? 
B_dynamic: {}\", B_dynamic)\n", + " cute.printf(\">?? R_dynamic: {}\", R_dynamic)\n", + "\n", + "composition_static_vs_dynamic_layout()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- By-mode Composition Example :\n", + "\n", + "By-mode composition allows us to apply composition operations to individual modes of a layout. This is particularly useful when you want to manipulate specific modes layout independently (e.g. rows and columns).\n", + "\n", + "In the context of CuTe, by-mode composition is achieved by using a `Tiler`, which can be a layout or a tuple of layouts. The leaves of the `Tiler` tuple specify how the corresponding mode of the target layout should be composed, allowing for sublayouts to be treated independently." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout A: (?,(?,?)):(?,(?,?))\n", + ">>> Tiler: (3, 8)\n", + ">>> By-mode Composition Result: (3,(?,?)):(?,(?,?))\n", + ">?? Layout A: (12,(4,8)):(59,(13,1))\n", + ">?? Tiler: (3,8)\n", + ">?? By-mode Composition Result: (3,(4,2)):(59,(13,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bymode_composition_example():\n", + " \"\"\"\n", + " Demonstrates by-mode composition using a tiler\n", + " \"\"\"\n", + " # Define the original layout A\n", + " A = cute.make_layout(\n", + " (cutlass.Int32(12), (cutlass.Int32(4), cutlass.Int32(8))), \n", + " stride=(cutlass.Int32(59), (cutlass.Int32(13), cutlass.Int32(1)))\n", + " )\n", + "\n", + " # Define the tiler for by-mode composition\n", + " tiler = (3, 8) # Apply 3:1 to mode-0 and 8:1 to mode-1\n", + "\n", + " # Apply by-mode composition\n", + " result = cute.composition(A, tiler)\n", + "\n", + " # Print static and dynamic information\n", + " print(\">>> Layout A:\", A)\n", + " cute.printf(\">?? Layout A: {}\", A)\n", + " print(\">>> Tiler:\", tiler)\n", + " cute.printf(\">?? 
Tiler: {}\", tiler)\n", + " print(\">>> By-mode Composition Result:\", result)\n", + " cute.printf(\">?? By-mode Composition Result: {}\", result)\n", + "\n", + "bymode_composition_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Division (Splitting into Tiles)\n", + "\n", + "The Division operation in CuTe is used to split a layout into tiles, which is particularly useful for partitioning data across threads or memory hierarchies.\n", + "\n", + "#### Examples :\n", + "\n", + "- Logical divide :\n", + "\n", + "When applied to two Layouts, `logical_divide` splits a layout into two modes -- the first mode contains the elements pointed to by the tiler, and the second mode contains the remaining elements." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (4,2,3):(2,1,8)\n", + ">>> Tiler : 4:2\n", + ">>> Logical Divide Result: ((2,2),(2,3)):((4,1),(2,8))\n", + ">?? Logical Divide Result: ((2,2),(2,3)):((4,1),(2,8))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_divide_1d_example():\n", + " \"\"\"\n", + " Demonstrates 1D logical divide\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((4, 2, 3), stride=(2, 1, 8)) # (4,2,3):(2,1,8)\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout(4, stride=2) # Apply to layout 4:2\n", + " \n", + " # Apply logical divide\n", + " result = cute.logical_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Divide Result:\", result)\n", + " cute.printf(\">?? 
Logical Divide Result: {}\", result)\n", + "\n", + "logical_divide_1d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When applied to a Layout and a `Tiler` tuple, `logical_divide` applies itself to the leaves of the `Tiler`and the corresponding mode of the target Layout. This means that the sublayouts are split independently according to the layouts within the `Tiler`." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Logical Divide Result: ((3,3),((2,4),(2,2))):((177,59),((13,2),(26,1)))\n", + ">?? Logical Divide Result: ((3,3),((2,4),(2,2))):((177,59),((13,2),(26,1)))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_divide_2d_example():\n", + " \"\"\"\n", + " Demonstrates 2D logical divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,RestM), (TileN,RestN), L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply logical divide\n", + " result = cute.logical_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Divide Result:\", result)\n", + " cute.printf(\">?? 
Logical Divide Result: {}\", result)\n", + "\n", + "logical_divide_2d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Zipped, tiled, and flat divide are flavors of `logical_divide` that potentially rearrange modes into more convenient forms.\n", + "\n", + "- Zipped Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Zipped Divide Result: ((3,(2,4)),(3,(2,2))):((177,(13,2)),(59,(26,1)))\n", + ">?? Zipped Divide Result: ((3,(2,4)),(3,(2,2))):((177,(13,2)),(59,(26,1)))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def zipped_divide_example():\n", + " \"\"\"\n", + " Demonstrates zipped divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,TileN), (RestM,RestN,L,...))\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply zipped divide\n", + " result = cute.zipped_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Zipped Divide Result:\", result)\n", + " cute.printf(\">?? 
Zipped Divide Result: {}\", result)\n", + "\n", + "zipped_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Tiled Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Tiled Divide Result: ((3,(2,4)),3,(2,2)):((177,(13,2)),59,(26,1))\n", + ">?? Tiled Divide Result: ((3,(2,4)),3,(2,2)):((177,(13,2)),59,(26,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def tiled_divide_example():\n", + " \"\"\"\n", + " Demonstrates tiled divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : ((TileM,TileN), RestM, RestN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply tiled divide\n", + " result = cute.tiled_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Tiled Divide Result:\", result)\n", + " cute.printf(\">?? Tiled Divide Result: {}\", result)\n", + "\n", + "tiled_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Flat Divide :" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (9,(4,8)):(59,(13,1))\n", + ">>> Tiler : (, )\n", + ">>> Flat Divide Result: (3,(2,4),3,(2,2)):(177,(13,2),59,(26,1))\n", + ">?? 
Flat Divide Result: (3,(2,4),3,(2,2)):(177,(13,2),59,(26,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def flat_divide_example():\n", + " \"\"\"\n", + " Demonstrates flat divide :\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + " Result Shape : (TileM, TileN, RestM, RestN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((9, (4, 8)), stride=(59, (13, 1))) # (9,(4,8)):(59,(13,1))\n", + " \n", + " # Define the tiler\n", + " tiler = (cute.make_layout(3, stride=3), # Apply to mode-0 layout 3:3\n", + " cute.make_layout((2, 4), stride=(1, 8))) # Apply to mode-1 layout (2,4):(1,8)\n", + " \n", + " # Apply flat divide\n", + " result = cute.flat_divide(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Flat Divide Result:\", result)\n", + " cute.printf(\">?? Flat Divide Result: {}\", result)\n", + "\n", + "flat_divide_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Product (Reproducing a Tile)\n", + "\n", + "The Product operation in CuTe is used to reproduce one layout according to another layout. It creates a new layout where:\n", + "- The first mode is the original layout A.\n", + "- The second mode is a restrided layout B that points to the origin of a \"unique replication\" of A.\n", + "\n", + "This is particularly useful for repeating layouts of threads across a tile of data for creating \"repeat\" patterns.\n", + "\n", + "#### Examples\n", + "\n", + "- Logical Product :" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,2):(4,1)\n", + ">>> Tiler : 6:1\n", + ">>> Logical Product Result: ((2,2),(2,3)):((4,1),(2,8))\n", + ">?? 
Logical Product Result: ((2,2),(2,3)):((4,1),(2,8))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def logical_product_1d_example():\n", + " \"\"\"\n", + " Demonstrates 1D logical product\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 2), stride=(4, 1)) # (2,2):(4,1)\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout(6, stride=1) # Apply to layout 6:1\n", + " \n", + " # Apply logical product\n", + " result = cute.logical_product(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Logical Product Result:\", result)\n", + " cute.printf(\">?? Logical Product Result: {}\", result)\n", + "\n", + "logical_product_1d_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Blocked and Raked Product :\n", + " \n", + " - Blocked Product: Combines the modes of A and B in a block-like fashion, preserving the semantic meaning of the modes by reassociating them after the product.\n", + " - Raked Product: Combines the modes of A and B in an interleaved or \"raked\" fashion, creating a cyclic distribution of the tiles." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,5):(5,1)\n", + ">>> Tiler : (3,4):(1,3)\n", + ">>> Blocked Product Result: ((2,3),(5,4)):((5,10),(1,30))\n", + ">>> Raked Product Result: ((3,2),(4,5)):((10,5),(30,1))\n", + ">?? Blocked Product Result: ((2,3),(5,4)):((5,10),(1,30))\n", + ">?? 
Raked Product Result: ((3,2),(4,5)):((10,5),(30,1))\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def blocked_raked_product_example():\n", + " \"\"\"\n", + " Demonstrates blocked and raked products\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 5), stride=(5, 1))\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout((3, 4), stride=(1, 3))\n", + " \n", + " # Apply blocked product\n", + " blocked_result = cute.blocked_product(layout, tiler=tiler)\n", + "\n", + " # Apply raked product\n", + " raked_result = cute.raked_product(layout, tiler=tiler)\n", + " \n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Blocked Product Result:\", blocked_result)\n", + " print(\">>> Raked Product Result:\", raked_result)\n", + " cute.printf(\">?? Blocked Product Result: {}\", blocked_result)\n", + " cute.printf(\">?? Raked Product Result: {}\", raked_result)\n", + "\n", + "blocked_raked_product_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Zipped, tiled, and flat product :\n", + " \n", + " - Similar to divide operations, zipped, tiled, and flat product are flavors of `logical_product` that potentially rearrange modes into more convenient forms." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> Layout: (2,5):(5,1)\n", + ">>> Tiler : (3,4):(1,3)\n", + ">>> Zipped Product Result: ((2,5),(3,4)):((5,1),(10,30))\n", + ">>> Tiled Product Result: ((2,5),3,4):((5,1),10,30)\n", + ">>> Flat Product Result: (2,5,3,4):(5,1,10,30)\n", + ">?? Zipped Product Result: ((2,5),(3,4)):((5,1),(10,30))\n", + ">?? Tiled Product Result: ((2,5),3,4):((5,1),10,30)\n", + ">?? 
Flat Product Result: (2,5,3,4):(5,1,10,30)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def zipped_tiled_flat_product_example():\n", + " \"\"\"\n", + " Demonstrates zipped, tiled, and flat products\n", + " Layout Shape : (M, N, L, ...)\n", + " Tiler Shape : \n", + "\n", + " zipped_product : ((M,N), (TileM,TileN,L,...))\n", + " tiled_product : ((M,N), TileM, TileN, L, ...)\n", + " flat_product : (M, N, TileM, TileN, L, ...)\n", + " \"\"\"\n", + " # Define the original layout\n", + " layout = cute.make_layout((2, 5), stride=(5, 1))\n", + " \n", + " # Define the tiler\n", + " tiler = cute.make_layout((3, 4), stride=(1, 3))\n", + "\n", + " # Apply zipped product\n", + " zipped_result = cute.zipped_product(layout, tiler=tiler)\n", + " \n", + " # Apply tiled product\n", + " tiled_result = cute.tiled_product(layout, tiler=tiler)\n", + " \n", + " # Apply flat product\n", + " flat_result = cute.flat_product(layout, tiler=tiler)\n", + "\n", + " # Print results\n", + " print(\">>> Layout:\", layout)\n", + " print(\">>> Tiler :\", tiler)\n", + " print(\">>> Zipped Product Result:\", zipped_result)\n", + " print(\">>> Tiled Product Result:\", tiled_result)\n", + " print(\">>> Flat Product Result:\", flat_result)\n", + " cute.printf(\">?? Zipped Product Result: {}\", zipped_result)\n", + " cute.printf(\">?? Tiled Product Result: {}\", tiled_result)\n", + " cute.printf(\">?? 
Flat Product Result: {}\", flat_result)\n", + "\n", + "zipped_tiled_flat_product_example()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pythondsl_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/python/CuTeDSL/notebooks/data_types.ipynb b/examples/python/CuTeDSL/notebooks/data_types.ipynb new file mode 100644 index 00000000..e618885d --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/data_types.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "\n", + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Understanding data structure in CuTe DSL\n", + "\n", + "In most cases, data structures in CuTe DSL work the same as Python data structures with the notable difference that Python data structures in most cases are considered as static data which are interpreted by the DSL compiler embedded inside Python interpreter.\n", + "\n", + "To differentiate between compile-time and runtime values, CuTe DSL introduces primitive types that \n", + "represent dynamic values in JIT-compiled code.\n", + "\n", + "CuTe DSL provides a comprehensive set of primitive numeric types for representing dynamic values at \n", + "runtime. 
These types are formally defined within the CuTe DSL typing system:\n", + "\n", + "### Integer Types\n", + "- `Int8` - 8-bit signed integer\n", + "- `Int16` - 16-bit signed integer \n", + "- `Int32` - 32-bit signed integer\n", + "- `Int64` - 64-bit signed integer\n", + "- `Int128` - 128-bit signed integer\n", + "- `Uint8` - 8-bit unsigned integer\n", + "- `Uint16` - 16-bit unsigned integer\n", + "- `Uint32` - 32-bit unsigned integer\n", + "- `Uint64` - 64-bit unsigned integer\n", + "- `Uint128` - 128-bit unsigned integer\n", + "\n", + "### Floating Point Types\n", + "- `Float16` - 16-bit floating point\n", + "- `Float32` - 32-bit floating point \n", + "- `Float64` - 64-bit floating point\n", + "- `BFloat16` - Brain Floating Point format (16-bit)\n", + "- `TFloat32` - Tensor Float32 format (reduced precision format used in tensor operations)\n", + "- `Float8E4M3` - 8-bit floating point with 4-bit exponent and 3-bit mantissa\n", + "- `Float8E5M2` - 8-bit floating point with 5-bit exponent and 2-bit mantissa\n", + "\n", + "These specialized types are designed to represent dynamic values in CuTe DSL code that will be \n", + "evaluated at runtime, in contrast to Python's built-in numeric types which are evaluated during \n", + "compilation.\n", + "\n", + "### Example usage:\n", + "\n", + "```python\n", + "x = cutlass.Int32(5) # Creates a 32-bit integer\n", + "y = cutlass.Float32(3.14) # Creates a 32-bit float\n", + "\n", + "@cute.jit\n", + "def foo(a: cutlass.Int32): # annotate `a` as 32-bit integer passed to jit function via ABI\n", + " ...\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a(static) = ?\n", + "b(static) = ?\n", + "a(dynamic) = 3.140000\n", + "b(dynamic) = 5\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def bar():\n", + " a = cutlass.Float32(3.14)\n", + " print(\"a(static) =\", a) # prints `a(static) = ?`\n", + " 
cute.printf(\"a(dynamic) = {}\", a) # prints `a(dynamic) = 3.140000`\n", + "\n", + " b = cutlass.Int32(5)\n", + " print(\"b(static) =\", b) # prints `b(static) = ?`\n", + " cute.printf(\"b(dynamic) = {}\", b) # prints `b(dynamic) = 5`\n", + "\n", + "bar()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Type Conversion API\n", + "\n", + "CUTLASS numeric types provide type conversion through the `to()` method available on all Numeric types. This allows you to convert between different numeric data types at runtime.\n", + "\n", + "Syntax:\n", + "\n", + "```python\n", + "new_value = value.to(target_type)\n", + "```\n", + "\n", + "The `to()` method supports conversion between:\n", + "- Integer types (Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64)\n", + "- Floating point types (Float16, Float32, Float64, BFloat16)\n", + "- Mixed integer/floating point conversions\n", + "\n", + "Note that when converting from floating point to integer types, the decimal portion is truncated. When converting between types with different ranges, values may be clamped or lose precision if they exceed the target type's representable range."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Int32(42) => Float32(42.000000)\n", + "Float32(3.140000) => Int32(3)\n", + "Int32(127) => Int8(127)\n", + "Int32(300) => Int8(44) (truncated due to range limitation)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def type_conversion():\n", + " # Convert from Int32 to Float32\n", + " x = cutlass.Int32(42)\n", + " y = x.to(cutlass.Float32)\n", + " cute.printf(\"Int32({}) => Float32({})\", x, y)\n", + "\n", + " # Convert from Float32 to Int32\n", + " a = cutlass.Float32(3.14)\n", + " b = a.to(cutlass.Int32)\n", + " cute.printf(\"Float32({}) => Int32({})\", a, b)\n", + "\n", + " # Convert from Int32 to Int8\n", + " c = cutlass.Int32(127)\n", + " d = c.to(cutlass.Int8)\n", + " cute.printf(\"Int32({}) => Int8({})\", c, d)\n", + "\n", + " # Convert from Int32 to Int8 with value exceeding Int8 range\n", + " e = cutlass.Int32(300)\n", + " f = e.to(cutlass.Int8)\n", + " cute.printf(\"Int32({}) => Int8({}) (truncated due to range limitation)\", e, f)\n", + "\n", + "type_conversion()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operator Overloading\n", + "\n", + "CUTLASS numeric types support Python's built-in operators, allowing you to write natural mathematical expressions. 
The operators work with both CUTLASS numeric types and Python native numeric types.\n", + "\n", + "Supported operators include:\n", + "- Arithmetic: `+`, `-`, `*`, `/`, `//`, `%`, `**`\n", + "- Comparison: `<`, `<=`, `==`, `!=`, `>=`, `>`\n", + "- Bitwise: `&`, `|`, `^`, `<<`, `>>`\n", + "- Unary: `-` (negation), `~` (bitwise NOT)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a: Int32(10), b: Int32(3)\n", + "x: Float32(5.500000)\n", + "\n", + "a + b = 13\n", + "x * 2 = 11.000000\n", + "a + x = 15.500000 (Int32 + Float32 promotes to Float32)\n", + "a / b = 3.333333\n", + "x / 2.0 = 2.750000\n", + "a > b = 1\n", + "a & b = 2\n", + "-a = -10\n", + "~a = -11\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def operator_demo():\n", + " # Arithmetic operators\n", + " a = cutlass.Int32(10)\n", + " b = cutlass.Int32(3)\n", + " cute.printf(\"a: Int32({}), b: Int32({})\", a, b)\n", + "\n", + " x = cutlass.Float32(5.5)\n", + " cute.printf(\"x: Float32({})\", x)\n", + "\n", + " cute.printf(\"\")\n", + "\n", + " sum_result = a + b\n", + " cute.printf(\"a + b = {}\", sum_result)\n", + "\n", + " y = x * 2 # Multiplying with Python native type\n", + " cute.printf(\"x * 2 = {}\", y)\n", + "\n", + " # Mixed type arithmetic (Int32 + Float32) that integer is converted into float32\n", + " mixed_result = a + x\n", + " cute.printf(\"a + x = {} (Int32 + Float32 promotes to Float32)\", mixed_result)\n", + "\n", + " # Division with Int32 (note: integer division)\n", + " div_result = a / b\n", + " cute.printf(\"a / b = {}\", div_result)\n", + "\n", + " # Float division\n", + " float_div = x / cutlass.Float32(2.0)\n", + " cute.printf(\"x / 2.0 = {}\", float_div)\n", + "\n", + " # Comparison operators\n", + " is_greater = a > b\n", + " cute.printf(\"a > b = {}\", is_greater)\n", + "\n", + " # Bitwise operators\n", + " bit_and = a & b\n", + " cute.printf(\"a & b = {}\", 
bit_and)\n", + "\n", + " neg_a = -a\n", + " cute.printf(\"-a = {}\", neg_a)\n", + "\n", + " not_a = ~a\n", + " cute.printf(\"~a = {}\", not_a)\n", + "\n", + "operator_demo()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb b/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb new file mode 100644 index 00000000..9cebc273 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/elementwise_add.ipynb @@ -0,0 +1,838 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from functools import partial\n", + "\n", + "import cutlass\n", + "import cutlass.cute as cute\n", + "from cutlass.cute.runtime import from_dlpack" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial: Elementwise Add Kernel in CuTe DSL\n", + "\n", + "This tutorial demonstrates how to implement a simple elementwise\n", + "addition kernel using the CuTe DSL (Domain Specific Language).\n", + "\n", + "\n", + "\n", + "Elementwise Addition\n", + "---------------------\n", + "\n", + "Elementwise addition is a fundamental operation in linear algebra.\n", + "Given two tensors of the same shape, the operation performs element-wise\n", + "addition to produce a result tensor of the same shape.\n", + "\n", + "For two 2D tensors :math:`A` and :math:`B` of shape :math:`(M, N)`,\n", + "the elementwise addition operation :math:`C = A + B` is defined as:\n", + "\n", + 
"$\n", + " C_{i,j} = A_{i,j} + B_{i,j}\n", + "$\n", + "\n", + "where:\n", + "\n", + "- $i \\in [0, M-1]$ represents the row index\n", + "- $j \\in [0, N-1]$ represents the column index\n", + "- $A_{i,j}$, $B_{i,j}$, and $C_{i,j}$ are the elements at position $(i,j)$ \n", + " in tensors $A$, $B$, and $C$ respectively\n", + "\n", + "This operation is performed independently for each element position,\n", + "making it highly parallelizable and well-suited for GPU implementation.\n", + "\n", + "Naive Elementwise Add Kernel\n", + "-----------------------------\n", + "\n", + "Let's start with a naive implementation that loads each element from\n", + "$A$ and $B$, adds them, and stores the result back to $C$." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def naive_elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + " bdim, _, _ = cute.arch.block_dim()\n", + "\n", + " thread_idx = bidx * bdim + tidx\n", + "\n", + " # Map thread index to logical index of input tensor\n", + " m, n = gA.shape\n", + " ni = thread_idx % n\n", + " mi = thread_idx // n\n", + "\n", + " # Map logical index to physical address via tensor layout\n", + " a_val = gA[mi, ni]\n", + " b_val = gB[mi, ni]\n", + "\n", + " # Perform element-wise addition\n", + " gC[mi, ni] = a_val + b_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structure of the Kernel\n", + "\n", + "The naive kernel simply maps each thread to one element with a 1-to-1 mapping.\n", + "In this kernel, we don't use CuTe layout algebra but only use basic\n", + "addressing to index the tensor.\n", + "\n", + "We can launch the kernel with the following JIT function:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + 
"@cute.jit\n", + "def naive_elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " num_threads_per_block = 256\n", + "\n", + " m, n = mA.shape\n", + " kernel = naive_elementwise_add_kernel(mA, mB, mC)\n", + " kernel.launch(grid=((m * n) // num_threads_per_block, 1, 1),\n", + " block=(num_threads_per_block, 1, 1))\n", + "\n", + "M, N = 2048, 2048\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "# Compile kernel\n", + "naive_elementwise_add_ = cute.compile(naive_elementwise_add, a_, b_, c_)\n", + "naive_elementwise_add_(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Benchmark performance\n", + "\n", + "Here's a utility function to benchmark our kernel implementations:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def benchmark(callable, *, num_warmups, num_iterations):\n", + " start_event = torch.cuda.Event(enable_timing=True)\n", + " end_event = torch.cuda.Event(enable_timing=True)\n", + "\n", + " torch.cuda.synchronize()\n", + "\n", + " for _ in range(num_warmups):\n", + " callable()\n", + "\n", + " start_event.record(stream=torch.cuda.current_stream())\n", + " for _ in range(num_iterations):\n", + " callable()\n", + " end_event.record(stream=torch.cuda.current_stream())\n", + " torch.cuda.synchronize()\n", + "\n", + " elapsed_time = start_event.elapsed_time(end_event)\n", + " avg_time = elapsed_time / num_iterations\n", + "\n", + " print(f\"Average execution time: {avg_time:.4f} ms\")\n", + " 
print(f\"Throughput: {(3 * a.numel() * 2) / (avg_time / 1000) / 1e9:.2f} GB/s\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0385 ms\n", + "Throughput: 653.44 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(naive_elementwise_add_, a_, b_, c_), num_warmups=5, num_iterations=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Analysis\n", + "\n", + "While our naive implementation maps thread indices to contiguous tensor\n", + "dimensions for coalesced memory access, it doesn't have enough\n", + "in-flight load & store operations to hide memory latency.\n", + "\n", + "According to Little's Law:\n", + "\n", + "$ L = \\lambda \\times W $\n", + "\n", + "Where:\n", + "- $L$ is the average number of items in a system\n", + "- $\\lambda$ is the average arrival rate of items (bandwidth)\n", + "- $W$ is the average time an item spends in the system (latency)\n", + "\n", + "For our elementwise addition kernel:\n", + "\n", + "1. $L$: The number of load & store operations in-flight\n", + "2. $\\lambda$ (Bandwidth): Data transfer rate between memory and compute units\n", + "3. $W$ (Latency): Round-trip delay of memory requests\n", + "\n", + "For memory-bound operations like elementwise addition, performance is\n", + "limited by the number of in-flight load & store operations.\n", + "\n", + "## Vectorized Load and Store\n", + "\n", + "To improve performance according to Little's Law, we need to increase the number\n", + "of in-flight requests. 
We can do this by increasing the number of bytes handled\n", + "in each load & store operation per thread through vectorized memory access.\n", + "\n", + "Since Ampere GPUs support up to 128-bit per load/store and each element here is 16-bit (float16),\n", + "we can load 4 contiguous elements (64 bits) per thread in one vectorized operation on contiguous rows.\n", + "CuTe tiling operations make this vectorization straightforward.\n", + "\n", + "Using ``tiled_tensor = cute.zipped_divide(tensor, tiler)``, we can partition the input\n", + "``tensor`` into groups of ``tiler`` blocks. For vectorization, we specify ``tiler``\n", + "as the block of data each thread accesses (4 contiguous elements in the same row, or ``(1,4)``).\n", + "Different threads can then access different blocks by indexing into the 2nd mode of ``tiled_tensor``.\n", + "\n", + "```python\n", + "mA : cute.Tensor # (2048,2048):(2048,1)\n", + "gA = cute.zipped_divide(a, tiler=(1, 4)) # tiled/vectorized => ((1,4),(2048,512)):((0,1),(2048,4))\n", + "```\n", + "\n", + "$\n", + " \\begin{array}{ccccc}\n", + " & ((1,4) & , & (2048,512)) & : ((0,1),(2048,4)) \\\\\n", + " & \\underbrace{\\phantom{(1,4)}}_{tiler} & & \\underbrace{\\phantom{(2048,512)}}_{threads} & \\\\\n", + " & \\text{\\scriptsize per-thread} & & \\text{\\scriptsize num of tiles}\n", + " \\end{array}\n", + "$" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def vectorized_elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + " bdim, _, _ = cute.arch.block_dim()\n", + "\n", + " thread_idx = bidx * bdim + tidx\n", + "\n", + " # Map thread index to logical index of input tensor\n", + " m, n = gA.shape[1] # thread-domain\n", + " ni = thread_idx % n\n", + " mi = thread_idx // n\n", + "\n", + " # Map logical index to physical address via tensor layout\n", + " a_val = 
gA[(None, (mi, ni))].load()\n", + " b_val = gB[(None, (mi, ni))].load()\n", + " print(f\"[DSL INFO] sliced gA = {gA[(None, (mi, ni))]}\")\n", + " print(f\"[DSL INFO] sliced gB = {gB[(None, (mi, ni))]}\")\n", + "\n", + " # Perform element-wise addition\n", + " gC[(None, (mi, ni))] = a_val + b_val" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This vectorized kernel follows a similar structure to its naive non-vectorized counterpart,\n", + "with one key difference: the tensor slicing pattern. By using `(None, (mi, ni))` as the slice indices,\n", + "we can extract a `(1,4)` sub-tensor from `gA`, `gB` and `gC` like \n", + "\n", + "```python\n", + "gA[(None, (mi, ni))]\n", + "\n", + "```\n", + "\n", + "Then tensor data can be loaded into vector via the `.load()` method.\n", + "\n", + "\n", + "```\n", + " slice\n", + " ((1,4),(2048,512)):((0,1),(2048,4)) ==> ((1,4)):((0,1))\n", + " ^ ^ ^\n", + " | | |\n", + " (None, (mi, ni))\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[DSL INFO] Tiled Tensors:\n", + "[DSL INFO] gA = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] gB = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] gC = tensor> o ((1,4),(2048,512)):((0,1),(2048,4))>\n", + "[DSL INFO] sliced gA = tensor> o ((1,4)):((0,1))>\n", + "[DSL INFO] sliced gB = tensor> o ((1,4)):((0,1))>\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def vectorized_elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " threads_per_block = 256\n", + "\n", + " gA = cute.zipped_divide(mA, (1, 4))\n", + " gB = cute.zipped_divide(mB, (1, 4))\n", + " gC = cute.zipped_divide(mC, (1, 4))\n", + "\n", + " print(f\"[DSL INFO] Tiled Tensors:\")\n", + " print(f\"[DSL INFO] gA = {gA}\")\n", + " print(f\"[DSL INFO] gB = {gB}\")\n", + " print(f\"[DSL INFO] gC = 
{gC}\")\n", + "\n", + " vectorized_elementwise_add_kernel(gA, gB, gC).launch(\n", + " grid=(cute.size(gC, mode=[1]) // threads_per_block, 1, 1),\n", + " block=(threads_per_block, 1, 1),\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "compiled_func = cute.compile(vectorized_elementwise_add, a_, b_, c_)\n", + "compiled_func(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0202 ms\n", + "Throughput: 1244.98 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(compiled_func, a_, b_, c_), num_warmups=5, num_iterations=100)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TV Layout\n", + "\n", + "Both the naive and vectorized kernels follow a common pattern to map thread indices\n", + "to physical addresses:\n", + "\n", + "Step 1: Map thread index to logical M/N coordinates\n", + "\n", + "```python\n", + " mi = thread_idx // n\n", + " ni = thread_idx % n\n", + "```\n", + "\n", + "Step 2: Map logical M/N coordinates to physical addresses using the tensor layout\n", + "\n", + "```python\n", + " a[(None, (mi, ni))].load()\n", + "```\n", + "\n", + "CuTe uses TV layout to represent this mapping from thread index and value index\n", + "(i.e., the 4 elements loaded per thread) to the logical coordinate space of a tensor.\n", + "By configuring different TV layouts, we can experiment with different memory access\n", + "patterns with minimal code changes.\n", + "\n", + "The following example 
demonstrates two levels of tiling: at the thread-block level\n", + "and at the thread level.\n", + "\n", + "For thread-block level tiling, each input & output tensor is first divided\n", + "into a group of ``(TileM, TileN)`` sub-tensors at the host side.\n", + "\n", + "Inside the GPU kernel, we provide the thread-block index to the 2nd mode of the tiled tensor\n", + "(``gA[((None, None), bidx)]``), which returns a thread-block local view of\n", + "a single ``(TileM, TileN)`` sub-tensor.\n", + "\n", + "For thread level tiling, we compose the sub-tensor (which maps from logical coordinates\n", + "to physical addresses) with the TV layout (which maps from thread & value indices to\n", + "logical coordinates). This gives us a tiled sub-tensor that maps from thread & value\n", + "indices directly to physical addresses.\n", + "\n", + "We then provide the thread index to the tiled sub-tensor (``tidfrgA[(tidx, None)]``)\n", + "to get a thread-local view of the data each thread accesses. Note that the thread index\n", + "is now in the 1st mode, as the tiled sub-tensor puts the thread mode before the value mode." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def elementwise_add_kernel(\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + " tv_layout: cute.Layout\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + "\n", + " #--------------------------------\n", + " # slice for thread-block level view\n", + " #--------------------------------\n", + " blk_coord = ((None, None), bidx)\n", + "\n", + " # logical coord -> address\n", + " blkA = gA[blk_coord] # (TileM, TileN) -> physical address\n", + " blkB = gB[blk_coord] # (TileM, TileN) -> physical address\n", + " blkC = gC[blk_coord] # (TileM, TileN) -> physical address\n", + "\n", + " #--------------------------------\n", + " # compose for thread-index & value-index to physical mapping\n", + " #--------------------------------\n", + " # blockA: (TileM, TileN) -> physical address\n", + " # tv_layout: (tid, vid) -> (TileM, TileN)\n", + " # tidfrgA = blkA o tv_layout\n", + " # tidfrgA: (tid, vid) -> physical address\n", + " tidfrgA = cute.composition(blkA, tv_layout)\n", + " tidfrgB = cute.composition(blkB, tv_layout)\n", + " tidfrgC = cute.composition(blkC, tv_layout)\n", + "\n", + " print(f\"Composed with TV layout:\")\n", + " print(f\" tidfrgA: {tidfrgA.type}\")\n", + "\n", + " #--------------------------------\n", + " # slice for thread-level view\n", + " #--------------------------------\n", + " # `None` represent slice of the entire per-thread data\n", + " thr_coord = (tidx, None)\n", + "\n", + " # slice for threads: vid -> address\n", + " thrA = tidfrgA[thr_coord] # (V) -> physical address\n", + " thrB = tidfrgB[thr_coord] # (V) -> physical address\n", + " thrC = tidfrgC[thr_coord] # (V) -> physical address\n", + "\n", + " thrC[None] = thrA.load() + thrB.load()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we take a closer look 
at the layout of zipped divided input tensor `gA`:\n", + "\n", + "```\n", + "Tiled to Thread Block:\n", + "\n", + " ((16,256),(128,8)) : ((2048,1),(32768,256))\n", + " ~~~~~~~~ ~~~~~~ ~~~~~~~~\n", + " | | |\n", + " | | |\n", + " | `------------------------> Number of Thread Blocks\n", + " | |\n", + " | |\n", + " `--------------------'\n", + " |\n", + " V\n", + " Thread Block\n", + " Tile\n", + "\n", + "Sliced to Thread-Block local sub-tensor (a (16, 256) tile): gA[((None, None), bidx)]\n", + "\n", + " (16,256) : (2048,1)\n", + " ~~~~~~ ~~~~~~\n", + " | | Tiled/Composed with TV Layout\n", + " | | \n", + " | | o ((32,4),(8,4)):((128,4),(16,1))\n", + " V V \n", + "~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~ \n", + "((32,4), (8,4)) : ((8,8192),(1,2048))\n", + " | |\n", + " | `--------> per thread fragment\n", + " |\n", + "Thread Block\n", + " Shape\n", + "\n", + "Sliced to Thread local sub-tensor (a (4,8) tile): tidfrgA[(tidx, None)]\n", + "\n", + "```\n", + "\n", + "The host code below shows the construction of the TV layout. By composing\n", + "a thread layout of ``(4,32):(32,1)`` (32 threads read contiguous elements on the row dimension,\n", + "then 4 warps read different rows) with a value layout of ``(4,8):(8,1)`` (each thread reads\n", + "8 contiguous elements on the row dimension across 4 contiguous rows),\n", + "we obtain the TV layout shown in the figure above."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def elementwise_add(\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor,\n", + "):\n", + " # mA layout: (M, N):(N, 1)\n", + " # TV layout map thread & value index to (16, 256) logical tile\n", + " # - contiguous thread index maps to mode-1 because input layout is contiguous on\n", + " # mode-1 for coalesced load-store\n", + " # - each thread load 8 contiguous element each row and load 4 rows\n", + " thr_layout = cute.make_layout((4, 32), stride=(32, 1))\n", + " val_layout = cute.make_layout((4, 8), stride=(8, 1))\n", + " tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)\n", + " print(f\"Tiler: {tiler_mn}\")\n", + " print(f\"TV Layout: {tv_layout}\")\n", + "\n", + " gA = cute.zipped_divide(mA, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gB = cute.zipped_divide(mB, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gC = cute.zipped_divide(mC, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + "\n", + " print(f\"Tiled Input Tensors:\")\n", + " print(f\" gA: {gA.type}\")\n", + " print(f\" gB: {gB.type}\")\n", + " print(f\" gC: {gC.type}\")\n", + "\n", + " # Launch the kernel asynchronously\n", + " # Async token(s) can also be specified as dependencies\n", + " elementwise_add_kernel(\n", + " gA, gB, gC, tv_layout\n", + " ).launch(\n", + " grid=[cute.size(gC, mode=[1]), 1, 1],\n", + " 
block=[cute.size(tv_layout, mode=[0]), 1, 1],\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "elementwise_add_ = cute.compile(elementwise_add, a_, b_, c_)\n", + "elementwise_add_(a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, a + b)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average execution time: 0.0222 ms\n", + "Throughput: 1133.58 GB/s\n" + ] + } + ], + "source": [ + "benchmark(partial(elementwise_add_, a_, b_, c_), num_warmups=5, num_iterations=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using Lambda Function\n", + "\n", + "CuTe DSL is built on top of Python. It can leverage Python to implement meta-programming to generate flexible kernels.\n", + "E.g. 
we can write kernel template that take custom binary operations to generate kernels for arbitrary binary operations.\n", + "\n", + "\n", + "```python\n", + "@cute.jit\n", + "def elementwise_apply(\n", + " op: cutlass.Constexpr,\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor\n", + "):\n", + " ...\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "@cute.kernel\n", + "def elementwise_apply_kernel(\n", + " op: cutlass.Constexpr, # lambda function must be const expr to generate code at compile time\n", + " gA: cute.Tensor,\n", + " gB: cute.Tensor,\n", + " gC: cute.Tensor,\n", + " tv_layout: cute.Layout\n", + "):\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " bidx, _, _ = cute.arch.block_idx()\n", + "\n", + " blk_coord = ((None, None), bidx)\n", + "\n", + " # logical coord -> address\n", + " blkA = gA[blk_coord] # (TileM, TileN) -> physical address\n", + " blkB = gB[blk_coord] # (TileM, TileN) -> physical address\n", + " blkC = gC[blk_coord] # (TileM, TileN) -> physical address\n", + "\n", + " tidfrgA = cute.composition(blkA, tv_layout)\n", + " tidfrgB = cute.composition(blkB, tv_layout)\n", + " tidfrgC = cute.composition(blkC, tv_layout)\n", + "\n", + " print(f\"Composed with TV layout:\")\n", + " print(f\" tidfrgA: {tidfrgA.type}\")\n", + "\n", + " thr_coord = (tidx, None)\n", + "\n", + " # slice for threads: vid -> address\n", + " thrA = tidfrgA[thr_coord] # (V) -> 
physical address\n", + " thrB = tidfrgB[thr_coord] # (V) -> physical address\n", + " thrC = tidfrgC[thr_coord] # (V) -> physical address\n", + "\n", + " #--------------------------------\n", + " # apply custom operation\n", + " #--------------------------------\n", + " thrC[None] = op(thrA.load(), thrB.load())\n", + "\n", + "\n", + "@cute.jit\n", + "def elementwise_op(\n", + " op: cutlass.Constexpr,\n", + " mA: cute.Tensor,\n", + " mB: cute.Tensor,\n", + " mC: cute.Tensor,\n", + "):\n", + " # mA layout: (M, N):(N, 1)\n", + " # TV layout map thread & value index to (16, 256) logical tile\n", + " # - contiguous thread index maps to mode-1 because input layout is contiguous on\n", + " # mode-1 for coalesced load-store\n", + " # - each thread load 8 contiguous element each row and load 4 rows\n", + " thr_layout = cute.make_layout((4, 32), stride=(32, 1))\n", + " val_layout = cute.make_layout((4, 8), stride=(8, 1))\n", + " tiler_mn, tv_layout = cute.make_layout_tv(thr_layout, val_layout)\n", + " print(f\"Tiler: {tiler_mn}\")\n", + " print(f\"TV Layout: {tv_layout}\")\n", + "\n", + " gA = cute.zipped_divide(mA, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gB = cute.zipped_divide(mB, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + " gC = cute.zipped_divide(mC, tiler_mn) # ((TileM, TileN), (RestM, RestN))\n", + "\n", + " print(f\"Tiled Input Tensors:\")\n", + " print(f\" gA: {gA.type}\")\n", + " print(f\" gB: {gB.type}\")\n", + " print(f\" gC: {gC.type}\")\n", + "\n", + " # Launch the kernel asynchronously\n", + " # Async token(s) can also be specified as dependencies\n", + " elementwise_apply_kernel(\n", + " op, gA, gB, gC, tv_layout\n", + " ).launch(\n", + " grid=[cute.size(gC, mode=[1]), 1, 1],\n", + " block=[cute.size(tv_layout, mode=[0]), 1, 1],\n", + " )\n", + "\n", + "a = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "b = torch.randn(M, N, device=\"cuda\", dtype=torch.float16)\n", + "c = torch.zeros(M, N, device=\"cuda\", 
dtype=torch.float16)\n", + "\n", + "a_ = from_dlpack(a, assumed_align=16)\n", + "b_ = from_dlpack(b, assumed_align=16)\n", + "c_ = from_dlpack(c, assumed_align=16)\n", + "\n", + "from operator import mul\n", + "\n", + "elementwise_op(mul, a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, mul(a, b))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Custom operators can be more complex. For example, here's a function that performs\n", + "multiplication followed by ReLU:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tiler: (16, 256)\n", + "TV Layout: ((32,4),(8,4)):((128,4),(16,1))\n", + "Tiled Input Tensors:\n", + " gA: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gB: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + " gC: !cute.memref, \"((16,256),(128,8)):((2048,1),(32768,256))\">\n", + "Composed with TV layout:\n", + " tidfrgA: !cute.memref, \"((32,4),(8,4)):((8,8192),(1,2048))\">\n" + ] + } + ], + "source": [ + "def mul_relu(a, b):\n", + " tmp = a * b\n", + " return cute.where(tmp > 0, tmp, cute.full_like(tmp, 0))\n", + "\n", + "\n", + "# As we uses cute.where in customized operation, we need to create another relu function\n", + "def mul_relu_ref(a, b):\n", + " tmp = a * b\n", + " return torch.relu(tmp)\n", + "\n", + "\n", + "elementwise_op(mul_relu, a_, b_, c_)\n", + "\n", + "# verify correctness\n", + "torch.testing.assert_close(c, mul_relu_ref(a, b))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + 
"widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/hello_world.ipynb b/examples/python/CuTeDSL/notebooks/hello_world.ipynb new file mode 100644 index 00000000..47719ae6 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/hello_world.ipynb @@ -0,0 +1,173 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Your First Program with CuTe DSL\n", + "\n", + "## Introduction\n", + "\n", + "Welcome! In this tutorial, we'll write a simple \"Hello World\" program that runs on your GPU using CuTe DSL. This will help you understand the basics of GPU programming with our framework.\n", + "\n", + "### What You'll Learn\n", + "\n", + "- How to write code that runs on both CPU (host) and GPU (device),\n", + "- How to launch a GPU kernel (a function that runs on the GPU),\n", + "- Basic CUDA concepts like threads and thread blocks,\n", + "\n", + "### Step 1: Import Required Libraries\n", + "\n", + "First, let's import the libraries we need:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass \n", + "import cutlass.cute as cute " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Step 2: Write Our GPU Kernel\n", + "A GPU kernel is a function that runs on the GPU. 
Here's a simple kernel that prints \"Hello World\".\n", + "Key concepts:\n", + "- `@cute.kernel`: This decorator tells CUTLASS that this function should run on the GPU\n", + "- `cute.arch.thread_idx()`: Gets the ID of the current GPU thread (like a worker's ID number)\n", + "- We only want one thread to print the message (thread 0) to avoid multiple prints" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.kernel\n", + "def kernel():\n", + " # Get the x component of the thread index (y and z components are unused)\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + " # Only the first thread (thread 0) prints the message\n", + " if tidx == 0:\n", + " cute.printf(\"Hello world\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: Write Our Host Function\n", + "\n", + "Now we need a function that sets up the GPU and launches our kernel.\n", + "Key concepts:\n", + "- `@cute.jit`: This decorator is for functions that run on the CPU but can launch GPU code\n", + "- We need to initialize CUDA before using the GPU\n", + "- `.launch()` tells CUDA how many blocks, threads, shared memory, etc. 
to use" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def hello_world():\n", + "\n", + " # Print hello world from host code\n", + " cute.printf(\"hello world\")\n", + " \n", + " # Initialize CUDA context for launching a kernel with error checking\n", + " # We make context initialization explicit to allow users to control the context creation \n", + " # and avoid potential issues with multiple contexts\n", + " cutlass.cuda.initialize_cuda_context()\n", + "\n", + " # Launch kernel\n", + " kernel().launch(\n", + " grid=(1, 1, 1), # Single thread block\n", + " block=(32, 1, 1) # One warp (32 threads) per thread block\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: Run Our Program\n", + "\n", + "There are 2 ways we can run our program:\n", + "\n", + "1. compile and run immediately\n", + "2. separate compilation which allows us to compile the code once and run multiple times\n", + " \n", + "Please note the `Compiling...` for Method 2 prints before the \"Hello world\" of the first kernel. This shows the asynchronous behavior between CPU and GPU prints. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running hello_world()...\n", + "hello world\n", + "Compiling...\n", + "Hello world\n", + "Running compiled version...\n", + "hello world\n" + ] + } + ], + "source": [ + "# Method 1: Just-In-Time (JIT) compilation - compiles and runs the code immediately\n", + "print(\"Running hello_world()...\")\n", + "hello_world()\n", + "\n", + "# Method 2: Compile first (useful if you want to run the same code multiple times)\n", + "print(\"Compiling...\")\n", + "hello_world_compiled = cute.compile(hello_world)\n", + "# Run the pre-compiled version\n", + "print(\"Running compiled version...\")\n", + "hello_world_compiled()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png b/examples/python/CuTeDSL/notebooks/images/cuda_graphs_image.png new file mode 100644 index 0000000000000000000000000000000000000000..44fc138d319d5820377c987e3ac386913f1c0777 GIT binary patch literal 8586 zcmeHNXH=70myRIl1r!yjBB-Er>Cy#6K#<-$(wh{ecdrGcMj(LFktQwl4vC0#CA0(x zRmu${gb)awgbDi9HQ&svS?inmb${eN=j>DV-p@WOd%sT-qo<>OfrgC+003Ojc=XT! 
z0HA=7`@QF>$J!5in#@9go0RTM2zD6FLby3z<1^{Y+w8wT-zo17XCm5qXIu(OeVMk^gSJZ!GO^(N(6ibq$jP+Mx;`t)P3%IMnD+Z_tg z0OYF>>yGlQ!vbqfU0khwNSjU@lFFjO#m*zHG0nGu%ZT$6X zZcUc+j(Nr#ntzaLcvAXx?p(Uz=#1O*sC4NJ+pEh(O~KZAjs}NBru&A5E}lB(Bh4i& z4E)MW?k2iuk@j*^_9*~>&v?4~XtD?(C4xOeZa&8}FU2*Bx(?sa7>(zII?bP5pqV3qB|w6R6;){^@aH6N-gw&bRnBIa*P z054$GZT~n1fRz^GnmeoV$Qp}o;K>la5O_znCnRD1GvKYdT0lU+FvnT)DOpi+zs_{n zf}Ujk>dXJ#kzSXu%Tjb*dI-?kK!%t#SF+&>K4{$-KuAI$Os;!ZOhh}$`TM}`YS=4$ z5nV?Giac^(HsXf7rH!&%9Soc;z&B|%dQCz}C%3J1d5b|!YJ-W|(cJ!!e{ z?54+n-`!fJc$UaPM+6tU09R{mU^p8t3cvY)rOnye)E&!i8VSVP9|a;?MjtcqWUUffk3ajMzH8VvMb?-pc1#RDQ_Go$ z^6uM|c=}M^CkNTV#(!r;m;*@H#nr`ut6NN(Dk1LpALSbm<89^)5Rx1X4-+vo38L;X zb0;H0aSa`<9y9Iy_h6}MwJroiS$9Y<+`a=|zE`lhI^as~800_)l+ zP+WM`8GH~Ja@;L(du&W{&a|6gwM$AQt?8s5D-c@gk+{b9#f~;`NsdagCd~v;80m>^e%B^Qs(>hZ?nfiw0AYUKCAQawksG zWRdUnvtH6GbnVqKQ2sD|Z&wwsz~u{R`p>V!Gh5w(0Nzp~ z{OdUdN4S*ExX0v|N{iY}6OR*q!zCt*ou2GTFfs)-H80}i$EyA*%c>-BUPN+@lt`ST z$MSU1O^DN}M9Ju+Y|NLKEaI5*TAk;}Sz- zIX@&}m5`4WQ%l6=SAucojwiUR^H|@La5fi~k@-wBIn#C2>#4Cx;f_gjJWD16f*w;c zEh88C`*zYk1@>6jZOOxbONjTKFnhUi-N|iFtE3WU6ZuacHP=8;1Tqa&P@{4A$Yr@N zom*p#EyJkX`6PH37Xvlh^U{jIi~3ioEnF{X9fo!}Go)d|YoeD;vxF-o)IZIO%}u{vT%+{H)uitP7^AO(7!o2vZUF=7eH8xv)?IqZ=9HANVl&*rNe2+EUqrujb-k%zzW<=6Bh5a5#qL1NkP{)NozysFdMMpSKYFypyIDKO->qGnugn* z$(#2}LRiYLPNN2rUNx{~jMFn8v*a)!bh#10H*K9>s^l-hF$4m6u1K4vk#7=FMmPxc zc+~wI>*;PhUESU48Btc2n_I%11~b&}zI=hz8BQOCR^OPPvk_sbZ?xVdHHtm0@yeVp z?tsC@Xa_lm=N80f?+F-tb~&TdeoWUtJCN_F#(TSa;YuovRia8totNZ2CrxHsh}W0| ztm6J$tJ9EVSh5BH7Sj-M7a zHI!8fsj9qEmvB55^aveqT=$)8IT3E)+}?(o`hRT?5E2anf7_5Y4T*r>{dvK&$j(X; z&AS^gC2M=g>f@3!uX4m3ic`-{wOq$5^adfv65DwY^nz;f!7`;FKRs}*UI^kjF>gZB zQkU&^#vs(8RW;f6#7f1UDb8>;G>(L|caUebl}4`lpMZNO-=}P2fX{Z-bCV9+PU0AX zBenswF1N_Z>#bIpVnsW6>>j{$D1kNCFXVg3>y(E%^W2c3hh)@;Q9wazALZ6?N702# z^0ku(e@$ImK@~W&uoKEE?k*cDrCeHcS|(-otqOH%_1=S7B(ltwxTzmXB@KuXUi@NB zF9i2@s5mLLF%(zg$<)5v_?eM$y!z|CgD{k=syWanY6!K*AoDf$ecd7Vazq|Dg%7;7gDuyu)Q)~jhVN<9jg?gAVt!;eDA3A=)vuul_wD9gnk 
ztO_9n;JWe+COEr%-sd$#Hfdhv+{c?1u&3vIRO=fX%Ss_SMQ^;z300QH)}0}*eaYUo z(af8O^a*!m5A+QVF3p}6B1?Ey+pib8L_m^Gt+mBk6K|KtH=E=EtD%*Zm9leYRrG@a zQIQwEg}gSLgn4i5%QBNaeQC#XxMjWPdMuQJ%>-E48RKwC+~k6x^!dCO(Vr;54ZYYM zU3-a{d>-l+?bo;Dy2DxFOO04<39s*#1ljn`>$YF1Aid11-eExDtau#WpY{q_k~2g7 zm4f`>S?c%n`m-YVvm*GjBKWf+__HGTvm*GjBKUu<2!1je&`dIA@{6eed}e$UMrKSF zX#Pvm3rXsPmMmCJTomye+{!HdMj~V}0k|7UY zhKmF8I3*7$Wk*@`evp{LL%<#!_Kr?YR2@yWohLPat+Z-tTF#;XO6)bCt&Vu97C1NB^f94?)2Ooi4*m_-Y*KdK z8(TJSdP?i{xlNu_Clb8^$Z7t{^W*A>=EHYv)3!jC%4jC-QPAb=^6Uvba0dhD^gz?| zxW$G~zmbkGgUo3Mf9z8Esp!2K20KLoBd#cvtq>xRoC9WHE z)}2{7392pDaNqX<(vWz=c?gFC`(`i_>Sj?G%47uy8Yk4`gx_1-DLV84LhyFAB;16s z?;>qhFDL1FHR|hLkM}ODYJ=kT3V-P0dy3?C-#hN;TK%mw5GElxDFgZx#9fMyftJ|~ zVKZda-NWpo2Q-H?-3pfBqQSTpYeQ6)@e;#PWivNUK|D1ScB2$*p{o1HFj7+IM`Rn~ zx?#Z;P!Uuu3Jq0leCnaMN-z+Xk_)kd( zW6F#P*t&tBv7CK1R+suEW6*ABR?`N?){T?2Ard3ZzLvOtmI!PAZC8Z7jHL7-q6VzL zlWf4F1@DSWNe#)<+T(aMg6u}145Sg~+u&1JL7dQ_S6qbm zoQ&nI@Pv>?p<$W}02K%}$?g{;4j z=qS}M(l$W9rc-DS3Twg!elwQZ$t)vGwlWoc(l6pFamOw?!kHP@YHm%lt4=H0&+uJ7 zGEd&=(=FjInqGQi*!0uby4l6V3nQXZ-;fV;)wE_N#TcT z0&r9V$%{=aF$ezcWtD3uy_&jrFNkhi)=9x7LA(KC6^iU_vDOrF1)4_pc=w5L7YyQe z$0MYAr;CVp!mT26f-zq5yU(nCr;z^8>!3HIcWivP!bvxKFdJ5BDQX&+;S|#GWES zfNawQlDNq{?jj=4)Q>q?8uM@G>L0FNEdp{>e2@xmDE~NuCbL@aGYvpwckfF`Y;%-ne;`p~clU<9 z(+^Ygn;qHdd9gWRm2Fb0IkB@ACYLkX7=0z{KK8NMrV~|&s_edp=_vQvSh8bDRk*oj z`1@&bNdYw8V8-C%+B{v+Mm&Yg1mA7z)pI_Tbtm zLt!$J{}`RSb7-olr{Y;ME7-vKrt!qQs!AGhS(lR$eIk!7y#4w}`$I}zTI(?M)~y~+ z4qXX_22SmgT^O%kAQ|rOK>)yrB$bh(sJXbBMzX2+MozgZ12$E5T&jWB|3R$5Z4H~E z`beWEAXC}u$3Uiba2xGFz4+pltHy>(JH7lWM_1jATa4zf>Y-m>T~m66X4G3di?!S)VRn$-Onq^86@RiLcy;fVixyD)M2cvqLZW2 zj~{iU2AkRNki9 z^5hQ0x}GP!!|3?;0koRihNZ1O$+8YKlG9gwyU#LE?Tm&dbPh=ha!vK;v5-rDoZAqY zv`r0IrSj$##T0FcRk~}A46Gc4I2~BcPIWnbg7D>yC)vQI8Xhr}pi@^4e$+k(o3f>T zWOdhb3w{-l_-*(j2eQ`8ArU!O^CEzQ)~p{rLnmaNQB}kx(2#GHY#`Vyx)hq0t|a8k)t|wJt@tZguv9NL8Wd(N)YNT5+TzrG{r7~6TLpPLB3jbd2c!{@3 z*}AFv&eZFejPSi1R34c|=5l&VF7yT>EJk_F7D5hTnjA9~8H(_7XRLbJQ{e_mNz#V? 
zY?I=h0X%gP!H(h4*{stp)>+YfCY7rP@r??Q;k&1o?rm@Mq< z%#ibJZddM?+hsF;dZ;x<5;E^s=3}9Ab!XAe8(f_ z9bz2@Nl>Y^0^L>;&rvf;woc$q6y%Pe&8EMz4?kpx&TDk@2ZzFuU2WR7zvEkR6j8P= zDC|WyBZCTk)LUlx&lEb{l!6OPZBoY?(EIl-=oR`@Ze5BTIs(vgn8y#ss55ny204lX zzCt)S&E@?@loGOG%yLUxGFxf|I-dK{3UpsP+X{(>32AG>|o_pS-O{Jn1`K%`F7+PBc$B*K>w@bxFWp))sb zTVayiHn*<;?3FEt3qC7aHdGxxZPKW^#c=xL|L6Yr_ICNdy!QqEdQUD^AWs0KFaP>m bz>gEkEBfuj)~)@(Q?Z7s&O`8nColg4Ecelt literal 0 HcmV?d00001 diff --git a/examples/python/CuTeDSL/notebooks/print.ipynb b/examples/python/CuTeDSL/notebooks/print.ipynb new file mode 100644 index 00000000..64787bb4 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/print.ipynb @@ -0,0 +1,425 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Printing with CuTe DSL\n", + "\n", + "This notebook demonstrates the different ways to print values in CuTe and explains the important distinction between static (compile-time) and dynamic (runtime) values.\n", + "\n", + "## Key Concepts\n", + "- Static values: Known at compile time\n", + "- Dynamic values: Only known at runtime\n", + "- Different printing methods for different scenarios\n", + "- Layout representation in CuTe\n", + "- Tensor visualization and formatting" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Print Example Function\n", + "\n", + "The `print_example` function demonstrates several important concepts:\n", + "\n", + "### 1. Python's `print` vs CuTe's `cute.printf`\n", + "- `print`: Can only show static values at compile time\n", + "- `cute.printf`: Can display both static and dynamic values at runtime\n", + "\n", + "### 2. Value Types\n", + "- `a`: Dynamic `Int32` value (runtime)\n", + "- `b`: Static `Constexpr[int]` value (compile-time)\n", + "\n", + "### 3. 
Layout Printing\n", + "Shows how layouts are represented differently in static vs dynamic contexts:\n", + "- Static context: Unknown values shown as `?`\n", + "- Dynamic context: Actual values displayed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def print_example(a: cutlass.Int32, b: cutlass.Constexpr[int]):\n", + " \"\"\"\n", + " Demonstrates different printing methods in CuTe and how they handle static vs dynamic values.\n", + "\n", + " This example shows:\n", + " 1. How Python's `print` function works with static values at compile time but can't show dynamic values\n", + " 2. How `cute.printf` can display both static and dynamic values at runtime\n", + " 3. The difference between types in static vs dynamic contexts\n", + " 4. How layouts are represented in both printing methods\n", + "\n", + " Args:\n", + " a: A dynamic Int32 value that will be determined at runtime\n", + " b: A static (compile-time constant) integer value\n", + " \"\"\"\n", + " # Use Python `print` to print static information\n", + " print(\">>>\", b) # => 2\n", + " # `a` is dynamic value\n", + " print(\">>>\", a) # => ?\n", + "\n", + " # Use `cute.printf` to print dynamic information\n", + " cute.printf(\">?? {}\", a) # => 8\n", + " cute.printf(\">?? {}\", b) # => 2\n", + "\n", + " print(\">>>\", type(a)) # => \n", + " print(\">>>\", type(b)) # => \n", + "\n", + " layout = cute.make_layout((a, b))\n", + " print(\">>>\", layout) # => (?,2):(1,?)\n", + " cute.printf(\">?? 
{}\", layout) # => (8,2):(1,8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile and Run\n", + "\n", + "**Direct Compilation and Run**\n", + " - `print_example(cutlass.Int32(8), 2)`\n", + " - Compiles and runs in one step will execute both static and dynamic print\n", + " * `>>>` stands for static print\n", + " * `>??` stands for dynamic print" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> 2\n", + ">>> ?\n", + ">>> Int32\n", + ">>> \n", + ">>> (?,2):(1,?)\n", + ">?? 8\n", + ">?? 2\n", + ">?? (8,2):(1,8)\n" + ] + } + ], + "source": [ + "print_example(cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compile Function\n", + "\n", + "When compiles the function with `cute.compile(print_example, cutlass.Int32(8), 2)`, Python interpreter \n", + "traces code and only evaluate static expression and print static information." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">>> 2\n", + ">>> ?\n", + ">>> Int32\n", + ">>> \n", + ">>> (?,2):(1,?)\n" + ] + } + ], + "source": [ + "print_example_compiled = cute.compile(print_example, cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Call compiled function\n", + "\n", + "Only print out runtime information" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ">?? 8\n", + ">?? 2\n", + ">?? 
(8,2):(1,8)\n" + ] + } + ], + "source": [ + "print_example_compiled(cutlass.Int32(8))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Format String Example\n", + "\n", + "The `format_string_example` function shows an important limitation:\n", + "- F-strings in CuTe are evaluated at compile time\n", + "- This means dynamic values won't show their runtime values in f-strings\n", + "- Use `cute.printf` when you need to see runtime values" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Direct run output:\n", + "a: ?, b: 2\n", + "layout: (?,2):(1,?)\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def format_string_example(a: cutlass.Int32, b: cutlass.Constexpr[int]):\n", + " \"\"\"\n", + " Format string is evaluated at compile time.\n", + " \"\"\"\n", + " print(f\"a: {a}, b: {b}\")\n", + "\n", + " layout = cute.make_layout((a, b))\n", + " print(f\"layout: {layout}\")\n", + "\n", + "print(\"Direct run output:\")\n", + "format_string_example(cutlass.Int32(8), 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Printing Tensor Examples\n", + "\n", + "CuTe provides specialized functionality for printing tensors through the `print_tensor` operation. The `cute.print_tensor` takes the following parameter:\n", + "- `Tensor` (required): A CuTe tensor object that you want to print. The tensor must support load and store operations\n", + "- `verbose` (optional, default=False): A boolean flag that controls the level of detail in the output. When set to True, it will print indices details for each element in the tensor.\n", + "\n", + "Below example code shows the difference between verbose ON and OFF, and how to print a sub range of the given tensor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "@cute.jit\n", + "def print_tensor_basic(x : cute.Tensor):\n", + " # Print the tensor\n", + " print(\"Basic output:\")\n", + " cute.print_tensor(x)\n", + " \n", + "@cute.jit\n", + "def print_tensor_verbose(x : cute.Tensor):\n", + " # Print the tensor with verbose mode\n", + " print(\"Verbose output:\")\n", + " cute.print_tensor(x, verbose=True)\n", + "\n", + "@cute.jit\n", + "def print_tensor_slice(x : cute.Tensor, coord : tuple):\n", + " # slice a 2D tensor from the 3D tensor\n", + " sliced_data = cute.slice_(x, coord)\n", + " y = cute.make_fragment(sliced_data.layout, sliced_data.element_type)\n", + " # Convert to TensorSSA format by loading the sliced data into the fragment\n", + " y.store(sliced_data.load())\n", + " print(\"Slice output:\")\n", + " cute.print_tensor(y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default `cute.print_tensor` will output CuTe tensor with datatype, storage space, CuTe layout information, and print data in torch-style format." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Basic output:\n", + "tensor(raw_ptr(0x000000000a5f1d50: f32, generic, align<4>) o (4,3,2):(6,2,1), data=\n", + " [[[ 0.000000, 2.000000, 4.000000, ],\n", + " [ 6.000000, 8.000000, 10.000000, ],\n", + " [ 12.000000, 14.000000, 16.000000, ],\n", + " [ 18.000000, 20.000000, 22.000000, ]],\n", + "\n", + " [[ 1.000000, 3.000000, 5.000000, ],\n", + " [ 7.000000, 9.000000, 11.000000, ],\n", + " [ 13.000000, 15.000000, 17.000000, ],\n", + " [ 19.000000, 21.000000, 23.000000, ]]])\n" + ] + } + ], + "source": [ + "def tensor_print_example1():\n", + " shape = (4, 3, 2)\n", + " \n", + " # Creates [0,...,23] and reshape to (4, 3, 2)\n", + " data = np.arange(24, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_basic(from_dlpack(data))\n", + "\n", + "tensor_print_example1()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The verbosed print will show coodination details of each element in the tensor. The below example shows how we index element in a 2D 4x3 tensor space." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Verbose output:\n", + "tensor(raw_ptr(0x000000000a814cc0: f32, generic, align<4>) o (4,3):(3,1), data= (\n", + "\t(0,0)= 0.000000\n", + "\t(0,1)= 1.000000\n", + "\t(0,2)= 2.000000\n", + "\t(1,0)= 3.000000\n", + "\t(1,1)= 4.000000\n", + "\t(1,2)= 5.000000\n", + "\t(2,0)= 6.000000\n", + "\t(2,1)= 7.000000\n", + "\t(2,2)= 8.000000\n", + "\t(3,0)= 9.000000\n", + "\t(3,1)= 10.000000\n", + "\t(3,2)= 11.000000\n", + ")\n" + ] + } + ], + "source": [ + "def tensor_print_example2():\n", + " shape = (4, 3)\n", + " \n", + " # Creates [0,...,11] and reshape to (4, 3)\n", + " data = np.arange(12, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_verbose(from_dlpack(data))\n", + "\n", + "tensor_print_example2()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To print a subset elements in the given Tensor, we can use cute.slice_ to select a range of the given tensor, load them into register and then print the values with `cute.print_tensor`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slice output:\n", + "tensor(raw_ptr(0x00007ffeeae1fc60: f32, rmem, align<32>) o (4):(3), data=\n", + " [ 0.000000, ],\n", + " [ 3.000000, ],\n", + " [Slice output:\n", + " 6.000000, ],\n", + " [ 9.000000, ])\n", + "tensor(raw_ptr(0x00007ffeeae1fc60: f32, rmem, align<32>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 4.000000, ],\n", + " [ 5.000000, ])\n" + ] + } + ], + "source": [ + "def tensor_print_example3():\n", + " shape = (4, 3)\n", + " \n", + " # Creates [0,...,11] and reshape to (4, 3)\n", + " data = np.arange(12, dtype=np.float32).reshape(*shape) \n", + " \n", + " print_tensor_slice(from_dlpack(data), (None, 0))\n", + " print_tensor_slice(from_dlpack(data), (1, None))\n", + "\n", + "tensor_print_example3()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/tensor.ipynb b/examples/python/CuTeDSL/notebooks/tensor.ipynb new file mode 100644 index 00000000..80b9cff1 --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/tensor.ipynb @@ -0,0 +1,390 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensor\n", + "\n", + "A tensor in CuTe is created through the composition of two key components:\n", + "\n", + "1. 
An **Engine** (E) - A random-access, pointer-like object that supports:\n", + " - Offset operation: `e + d → e` (offset engine by elements of a layout's codomain)\n", + " - Dereference operation: `*e → v` (dereference engine to produce value)\n", + "\n", + "2. A **Layout** (L) - Defines the mapping from coordinates to offsets\n", + "\n", + "A tensor is formally defined as the composition of an engine E with a layout L, expressed as `T = E ∘ L`. When evaluating a tensor at coordinate c, it:\n", + "\n", + "1. Maps the coordinate c to the codomain using the layout\n", + "2. Offsets the engine accordingly\n", + "3. Dereferences the result to obtain the tensor's value\n", + "\n", + "This can be expressed mathematically as:\n", + "\n", + "```\n", + "T(c) = (E ∘ L)(c) = *(E + L(c))\n", + "```\n", + "\n", + "## Example Usage\n", + "\n", + "Here's a simple example of creating a tensor using pointer and layout `(8,5):(5,1)` and fill with ones:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "@cute.jit\n", + "def create_tensor_from_ptr(ptr: cute.Pointer):\n", + " layout = cute.make_layout((8, 5), stride=(5, 1))\n", + " tensor = cute.make_tensor(ptr, layout)\n", + " tensor.fill(1)\n", + " cute.print_tensor(tensor)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This creates a tensor where:\n", + "- The engine is a pointer\n", + "- The layout with shape `(8, 5)` and stride `(5, 1)`\n", + "- The resulting tensor can be evaluated using coordinates defined by the layout\n", + "\n", + "We can test this by allocating buffer with torch and run test with pointer to torch tensor" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x000000000736b0c0: f32, generic, align<4>) o (8,5):(5,1), data=\n", + " [[ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 
1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " ...\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ],\n", + " [ 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, ]])\n" + ] + } + ], + "source": [ + "import torch\n", + "\n", + "from cutlass.torch import dtype as torch_dtype\n", + "import cutlass.cute.runtime as cute_rt\n", + "\n", + "a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n", + "ptr_a = cute_rt.make_ptr(cutlass.Float32, a.data_ptr())\n", + "\n", + "create_tensor_from_ptr(ptr_a)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## DLPACK support \n", + "\n", + "CuTe DSL is designed to support dlpack protocol natively. This offers easy integration with frameworks \n", + "supporting DLPack, e.g. torch, numpy, jax, tensorflow, etc.\n", + "\n", + "For more information, please refer to DLPACK project: https://github.com/dmlc/dlpack\n", + "\n", + "Calling `from_dlpack` can convert any tensor or ndarray object supporting `__dlpack__` and `__dlpack_device__`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "@cute.jit\n", + "def print_tensor_dlpack(src: cute.Tensor):\n", + " print(src)\n", + " cute.print_tensor(src)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor o (8,5):(5,1)>\n", + "tensor(raw_ptr(0x0000000007559340: f32, generic, align<4>) o (8,5):(5,1), data=\n", + " [[-1.151769, 1.019397, -0.371175, -0.717776, 0.502176, ],\n", + " [ 0.114282, 0.900084, 0.320770, 1.564574, -0.632329, ],\n", + " [-0.570140, 0.178112, -0.423079, 1.936198, 0.003355, ],\n", + " ...\n", + " [-2.425393, -0.275528, 1.267157, -0.811101, -0.985456, ],\n", + " [ 
0.777889, -2.114074, 0.357184, -0.321312, -0.938138, ],\n", + " [ 1.959564, 1.797602, 0.116901, 0.306198, -1.837295, ]])\n" + ] + } + ], + "source": [ + "a = torch.randn(8, 5, dtype=torch_dtype(cutlass.Float32))\n", + "\n", + "print_tensor_dlpack(from_dlpack(a))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor o (8,8):(8,1)>\n", + "tensor(raw_ptr(0x0000000007979da0: f32, generic, align<4>) o (8,8):(8,1), data=\n", + " [[ 0.122739, -0.605744, -1.442022, ..., -0.356501, -0.993329, -0.091110, ],\n", + " [ 0.278448, 0.318482, -0.276867, ..., 1.542181, -1.701539, -0.309454, ],\n", + " [ 0.563565, -0.753936, 0.131214, ..., 0.437912, -0.482277, -0.051540, ],\n", + " ...\n", + " [-1.974096, -0.177881, 0.426807, ..., -1.579115, -0.304974, 0.451164, ],\n", + " [ 0.149851, -0.704689, -0.295063, ..., -0.653001, 0.008871, 0.903916, ],\n", + " [ 1.188619, 1.519662, 1.270734, ..., 0.404082, 0.173200, 0.093476, ]])\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "a = np.random.randn(8, 8).astype(np.float32)\n", + "\n", + "print_tensor_dlpack(from_dlpack(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tensor Evaluation Methods\n", + "\n", + "Tensors support two primary methods of evaluation:\n", + "\n", + "### 1. Full Evaluation\n", + "When applying the tensor evaluation with a complete coordinate c, it computes the offset, applies it to the engine, \n", + "and dereferences it to return the stored value. This is the straightforward case where you want to access \n", + "a specific element of the tensor.\n", + "\n", + "### 2. 
Partial Evaluation (Slicing)\n", + "When evaluating with an incomplete coordinate c = c' ⊕ c* (where c* represents the unspecified portion), \n", + "the result is a new tensor which is a slice of the original tensor with its engine offset to account for \n", + "the coordinates that were provided. This operation can be expressed as:\n", + "\n", + "```\n", + "T(c) = (E ∘ L)(c) = (E + L(c')) ∘ L(c*) = T'(c*)\n", + "```\n", + "\n", + "Slicing effectively reduces the dimensionality of the tensor, creating a sub-tensor that can be \n", + "further evaluated or manipulated." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a[2] = 10.000000 (equivalent to a[(2,0)])\n", + "a[9] = 6.000000 (equivalent to a[(1,1)])\n", + "a[2,0] = 10.000000\n", + "a[2,4] = 14.000000\n", + "a[(2,4)] = 14.000000\n", + "a[2,3] = 100.000000\n", + "a[(2,4)] = 101.000000\n", + "tensor([[ 0., 1., 2., 3., 4.],\n", + " [ 5., 6., 7., 8., 9.],\n", + " [ 10., 11., 12., 100., 101.],\n", + " [ 15., 16., 17., 18., 19.],\n", + " [ 20., 21., 22., 23., 24.],\n", + " [ 25., 26., 27., 28., 29.],\n", + " [ 30., 31., 32., 33., 34.],\n", + " [ 35., 36., 37., 38., 39.]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def tensor_access_item(a: cute.Tensor):\n", + " # access data using linear index\n", + " cute.printf(\"a[2] = {} (equivalent to a[{}])\", a[2],\n", + " cute.make_identity_tensor(a.layout.shape)[2])\n", + " cute.printf(\"a[9] = {} (equivalent to a[{}])\", a[9],\n", + " cute.make_identity_tensor(a.layout.shape)[9])\n", + "\n", + " # access data using n-d coordinates, following two are equivalent\n", + " cute.printf(\"a[2,0] = {}\", a[2, 0])\n", + " cute.printf(\"a[2,4] = {}\", a[2, 4])\n", + " cute.printf(\"a[(2,4)] = {}\", a[2, 4])\n", + "\n", + " # assign value to tensor@(2,4)\n", + " a[2,3] = 100.0\n", + " a[2,4] = 101.0\n", + " cute.printf(\"a[2,3] = {}\", a[2,3])\n", + " 
cute.printf(\"a[(2,4)] = {}\", a[(2,4)])\n", + "\n", + "@cute.kernel\n", + "def print_tensor_gpu(ptr: cute.Pointer):\n", + " layout = cute.make_layout((8, 5), stride=(5, 1))\n", + " tensor = cute.make_tensor(ptr, layout)\n", + "\n", + " tidx, _, _ = cute.arch.thread_idx()\n", + "\n", + " if tidx == 0:\n", + " cute.print_tensor(tensor)\n", + "\n", + "\n", + "# Create a tensor with sequential data using torch\n", + "data = torch.arange(0, 8*5, dtype=torch.float32).reshape(8, 5)\n", + "tensor_access_item(from_dlpack(data))\n", + "\n", + "print(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tensor as memory view\n", + "\n", + "In CUDA programming, different memory spaces have different characteristics in terms of access speed, scope, and lifetime:\n", + "\n", + "- **generic**: Default memory space that can refer to any other memory space.\n", + "- **global memory (gmem)**: Accessible by all threads across all blocks, but has higher latency.\n", + "- **shared memory (smem)**: Accessible by all threads within a block, with much lower latency than global memory.\n", + "- **register memory (rmem)**: Thread-private memory with the lowest latency, but limited capacity.\n", + "- **tensor memory (tmem)**: Specialized memory introduced in NVIDIA Blackwell architecture for tensor operations.\n", + "\n", + "When creating tensors in CuTe, you can specify the memory space to optimize performance based on your access patterns.\n", + "\n", + "For more information on CUDA memory spaces, see the [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#memory-hierarchy).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Coordinate Tensor\n", + "\n", + "A coordinate tensor is a special type of tensor that maps coordinates to coordinates rather than to values. 
\n", + "The key distinction is that while regular tensors map coordinates to some value type (like numbers), \n", + "coordinate tensors map coordinates to other coordinates.\n", + "\n", + "For example, given a shape (4,4), a coordinate tensor using row-major layout would appear as:\n", + "\n", + "\\begin{bmatrix} \n", + "(0,0) & (0,1) & (0,2) & (0,3) \\\\\n", + "(1,0) & (1,1) & (1,2) & (1,3) \\\\\n", + "(2,0) & (2,1) & (2,2) & (2,3) \\\\\n", + "(3,0) & (3,1) & (3,2) & (3,3)\n", + "\\end{bmatrix}\n", + "\n", + "The same shape with a column-major layout would appear as:\n", + "\n", + "\\begin{bmatrix}\n", + "(0,0) & (1,0) & (2,0) & (3,0) \\\\\n", + "(0,1) & (1,1) & (2,1) & (3,1) \\\\\n", + "(0,2) & (1,2) & (2,2) & (3,2) \\\\\n", + "(0,3) & (1,3) & (2,3) & (3,3)\n", + "\\end{bmatrix}\n", + "\n", + "The key points about coordinate tensors are:\n", + "- Each element in the tensor is itself a coordinate tuple (i,j) rather than a scalar value\n", + "- The coordinates map to themselves - so position (1,2) contains the coordinate (1,2)\n", + "- The layout (row-major vs column-major) determines how these coordinate tuples are arranged in memory\n", + "\n", + "For example, coordinate tensors can be created using the `make_identity_tensor` utility:\n", + "\n", + "```python\n", + "coord_tensor = make_identity_tensor(layout.shape())\n", + "```\n", + "\n", + "This creates a tensor that maps each coordinate to itself, providing a reference point for understanding how other layouts transform these coordinates." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor<(0,0) o (8,4):(1@0,1@1)>\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def print_tensor_coord(a: cute.Tensor):\n", + " coord_tensor = cute.make_identity_tensor(a.layout.shape)\n", + " print(coord_tensor)\n", + "\n", + "a = torch.randn(8,4, dtype=torch_dtype(cutlass.Float32))\n", + "print_tensor_coord(from_dlpack(a))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/CuTeDSL/notebooks/tensorssa.ipynb b/examples/python/CuTeDSL/notebooks/tensorssa.ipynb new file mode 100644 index 00000000..8d83e02e --- /dev/null +++ b/examples/python/CuTeDSL/notebooks/tensorssa.ipynb @@ -0,0 +1,558 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import cutlass\n", + "import cutlass.cute as cute\n", + "from cutlass.cute.runtime import from_dlpack\n", + "\n", + "import numpy as np\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to the TensorSSA in CuTe DSL\n", + "\n", + "This tutorial introduces what is the `TensorSSA` and why we need it. 
We also give some examples to show how to use `TensorSSA`.\n", + "\n", + "## What is TensorSSA\n", + "\n", + "`TensorSSA` is a Python class that represents a tensor value in Static Single Assignment (SSA) form within the CuTe DSL. You can think of it as a tensor residing in a (simulated) register.\n", + "\n", + "## Why TensorSSA\n", + "\n", + "`TensorSSA` encapsulates the underlying MLIR tensor value into an object that's easier to manipulate in Python. By overloading numerous Python operators (like `+`, `-`, `*`, `/`, `[]`, etc.), it allows users to express tensor computations (primarily element-wise operations and reductions) in a more Pythonic way. These element-wise operations are then translated into optimized vectorization instructions.\n", + "\n", + "It's part of the CuTe DSL, serving as a bridge between the user-described computational logic and the lower-level MLIR IR, particularly for representing and manipulating register-level data.\n", + "\n", + "## When to use TensorSSA\n", + "\n", + "`TensorSSA` is primarily used in the following scenarios:\n", + "\n", + "### Load from memory and store to memory" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "a_vec: tensor_value o (3, 4)>\n", + "b_vec: tensor_value o (3, 4)>\n", + "tensor(raw_ptr(0x0000000006cff170: f32, generic, align<4>) o (3,4):(4,1), data=\n", + " [[ 2.000000, 2.000000, 2.000000, 2.000000, ],\n", + " [ 2.000000, 2.000000, 2.000000, 2.000000, ],\n", + " [ 2.000000, 2.000000, 2.000000, 2.000000, ]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def load_and_store(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " \"\"\"\n", + " Load data from memory and store the result to memory.\n", + "\n", + " :param res: The destination tensor to store the result.\n", + " :param a: The source tensor to be loaded.\n", + " :param b: The source tensor to be loaded.\n", + " \"\"\"\n", + " 
a_vec = a.load()\n", + " print(f\"a_vec: {a_vec}\") # prints `a_vec: vector<12xf32> o (3, 4)`\n", + " b_vec = b.load()\n", + " print(f\"b_vec: {b_vec}\") # prints `b_vec: vector<12xf32> o (3, 4)`\n", + " res.store(a_vec + b_vec)\n", + " cute.print_tensor(res)\n", + "\n", + "a = np.ones(12).reshape((3, 4)).astype(np.float32)\n", + "b = np.ones(12).reshape((3, 4)).astype(np.float32)\n", + "c = np.zeros(12).reshape((3, 4)).astype(np.float32)\n", + "load_and_store(from_dlpack(c), from_dlpack(a), from_dlpack(b))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Register-Level Tensor Operations\n", + "\n", + "When writing kernel logic, various computations, transformations, slicing, etc., are performed on data loaded into registers." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_value o (4, 2, 3)> -> tensor_value o (4, 3)>\n", + "tensor(raw_ptr(0x00000000071acaf0: f32, generic, align<4>) o (4,3):(3,1), data=\n", + " [[ 3.000000, 4.000000, 5.000000, ],\n", + " [ 9.000000, 10.000000, 11.000000, ],\n", + " [ 15.000000, 16.000000, 17.000000, ],\n", + " [ 21.000000, 22.000000, 23.000000, ]])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def apply_slice(src: cute.Tensor, dst: cute.Tensor, indices: cutlass.Constexpr):\n", + " \"\"\"\n", + " Apply slice operation on the src tensor and store the result to the dst tensor.\n", + "\n", + " :param src: The source tensor to be sliced.\n", + " :param dst: The destination tensor to store the result.\n", + " :param indices: The indices to slice the source tensor.\n", + " \"\"\"\n", + " src_vec = src.load()\n", + " dst_vec = src_vec[indices]\n", + " print(f\"{src_vec} -> {dst_vec}\")\n", + " if isinstance(dst_vec, cute.TensorSSA):\n", + " dst.store(dst_vec)\n", + " cute.print_tensor(dst)\n", + " else:\n", + " dst[0] = dst_vec\n", + " cute.print_tensor(dst)\n", + "\n", + "def 
slice_1():\n", + " src_shape = (4, 2, 3)\n", + " dst_shape = (4, 3)\n", + " indices = (None, 1, None)\n", + "\n", + " \"\"\"\n", + " a:\n", + " [[[ 0. 1. 2.]\n", + " [ 3. 4. 5.]]\n", + "\n", + " [[ 6. 7. 8.]\n", + " [ 9. 10. 11.]]\n", + "\n", + " [[12. 13. 14.]\n", + " [15. 16. 17.]]\n", + "\n", + " [[18. 19. 20.]\n", + " [21. 22. 23.]]]\n", + " \"\"\"\n", + " a = np.arange(np.prod(src_shape)).reshape(*src_shape).astype(np.float32)\n", + " dst = np.random.randn(*dst_shape).astype(np.float32)\n", + " apply_slice(from_dlpack(a), from_dlpack(dst), indices)\n", + "\n", + "slice_1()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_value o (4, 2, 3)> -> ?\n", + "tensor(raw_ptr(0x00000000013cbbe0: f32, generic, align<4>) o (1):(1), data=\n", + " [ 10.000000, ])\n" + ] + } + ], + "source": [ + "def slice_2():\n", + " src_shape = (4, 2, 3)\n", + " dst_shape = (1,)\n", + " indices = 10\n", + " a = np.arange(np.prod(src_shape)).reshape(*src_shape).astype(np.float32)\n", + " dst = np.random.randn(*dst_shape).astype(np.float32)\n", + " apply_slice(from_dlpack(a), from_dlpack(dst), indices)\n", + "\n", + "slice_2()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Arithmetic Operations\n", + "\n", + "As we mentioned earlier, there're many tensor operations whose operands are `TensorSSA`. And they are all element-wise operations. We give some examples below.\n", + "\n", + "### Binary Operations\n", + "\n", + "For binary operations, the LHS operand is `TensorSSA` and the RHS operand can be either `TensorSSA` or `Numeric`. When the RHS is `Numeric`, it will be broadcast to a `TensorSSA`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 3.000000, ],\n", + " [ 3.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [-1.000000, ],\n", + " [-1.000000, ],\n", + " [-1.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.500000, ],\n", + " [ 0.500000, ],\n", + " [ 0.500000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.000000, ],\n", + " [ 0.000000, ],\n", + " [ 0.000000, ])\n", + "tensor(raw_ptr(0x00000000074f0e70: f32, generic, align<4>) o (3):(1), data=\n", + " [ 1.000000, ],\n", + " [ 1.000000, ],\n", + " [ 1.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_1(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " add_res = a_vec + b_vec\n", + " res.store(add_res)\n", + " cute.print_tensor(res) # prints [3.000000, 3.000000, 3.000000]\n", + "\n", + " sub_res = a_vec - b_vec\n", + " res.store(sub_res)\n", + " cute.print_tensor(res) # prints [-1.000000, -1.000000, -1.000000]\n", + "\n", + " mul_res = a_vec * b_vec\n", + " res.store(mul_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " div_res = a_vec / b_vec\n", + " res.store(div_res)\n", + " cute.print_tensor(res) # prints [0.500000, 0.500000, 0.500000]\n", + "\n", + " floor_div_res = a_vec // b_vec\n", + " res.store(floor_div_res)\n", + " cute.print_tensor(res) # prints [0.000000, 0.000000, 0.000000]\n", + "\n", + " mod_res = a_vec % b_vec\n", + " 
res.store(mod_res)\n", + " cute.print_tensor(res) # prints [1.000000, 1.000000, 1.000000]\n", + "\n", + "\n", + "a = np.empty((3,), dtype=np.float32)\n", + "a.fill(1.0)\n", + "b = np.empty((3,), dtype=np.float32)\n", + "b.fill(2.0)\n", + "res = np.empty((3,), dtype=np.float32)\n", + "binary_op_1(from_dlpack(res), from_dlpack(a), from_dlpack(b))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 3.000000, ],\n", + " [ 3.000000, ],\n", + " [ 3.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [-1.000000, ],\n", + " [-1.000000, ],\n", + " [-1.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.500000, ],\n", + " [ 0.500000, ],\n", + " [ 0.500000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 0.000000, ],\n", + " [ 0.000000, ],\n", + " [ 0.000000, ])\n", + "tensor(raw_ptr(0x0000000007828ed0: f32, generic, align<4>) o (3):(1), data=\n", + " [ 1.000000, ],\n", + " [ 1.000000, ],\n", + " [ 1.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_2(res: cute.Tensor, a: cute.Tensor, c: cutlass.Constexpr):\n", + " a_vec = a.load()\n", + "\n", + " add_res = a_vec + c\n", + " res.store(add_res)\n", + " cute.print_tensor(res) # prints [3.000000, 3.000000, 3.000000]\n", + "\n", + " sub_res = a_vec - c\n", + " res.store(sub_res)\n", + " cute.print_tensor(res) # prints [-1.000000, -1.000000, -1.000000]\n", + "\n", + " mul_res = a_vec * c\n", + " res.store(mul_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " div_res = a_vec / 
c\n", + " res.store(div_res)\n", + " cute.print_tensor(res) # prints [0.500000, 0.500000, 0.500000]\n", + "\n", + " floor_div_res = a_vec // c\n", + " res.store(floor_div_res)\n", + " cute.print_tensor(res) # prints [0.000000, 0.000000, 0.000000]\n", + "\n", + " mod_res = a_vec % c\n", + " res.store(mod_res)\n", + " cute.print_tensor(res) # prints [1.000000, 1.000000, 1.000000]\n", + "\n", + "a = np.empty((3,), dtype=np.float32)\n", + "a.fill(1.0)\n", + "c = 2.0\n", + "res = np.empty((3,), dtype=np.float32)\n", + "binary_op_2(from_dlpack(res), from_dlpack(a), c)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[False True False]\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_3(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " gt_res = a_vec > b_vec\n", + " res.store(gt_res)\n", + "\n", + " \"\"\"\n", + " ge_res = a_ >= b_ # [False, True, False]\n", + " lt_res = a_ < b_ # [True, False, True]\n", + " le_res = a_ <= b_ # [True, False, True]\n", + " eq_res = a_ == b_ # [False, False, False]\n", + " \"\"\"\n", + "\n", + "a = np.array([1, 2, 3], dtype=np.float32)\n", + "b = np.array([2, 1, 4], dtype=np.float32)\n", + "res = np.empty((3,), dtype=np.bool_)\n", + "binary_op_3(from_dlpack(res), from_dlpack(a), from_dlpack(b))\n", + "print(res) # prints [False, True, False]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[3 0 7]\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def binary_op_4(res: cute.Tensor, a: cute.Tensor, b: cute.Tensor):\n", + " a_vec = a.load()\n", + " b_vec = b.load()\n", + "\n", + " xor_res = a_vec ^ b_vec\n", + " res.store(xor_res)\n", + "\n", + " # or_res = a_vec | b_vec\n", + " # res.store(or_res) # prints [3, 2, 7]\n", + "\n", + " # and_res = 
a_vec & b_vec\n", + " # res.store(and_res) # prints [0, 2, 0]\n", + "\n", + "a = np.array([1, 2, 3], dtype=np.int32)\n", + "b = np.array([2, 2, 4], dtype=np.int32)\n", + "res = np.empty((3,), dtype=np.int32)\n", + "binary_op_4(from_dlpack(res), from_dlpack(a), from_dlpack(b))\n", + "print(res) # prints [3, 0, 7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Unary Operations" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [ 2.000000, ],\n", + " [ 2.000000, ],\n", + " [ 2.000000, ])\n", + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [-0.756802, ],\n", + " [-0.756802, ],\n", + " [-0.756802, ])\n", + "tensor(raw_ptr(0x0000000007fbd180: f32, generic, align<4>) o (3):(1), data=\n", + " [ 16.000000, ],\n", + " [ 16.000000, ],\n", + " [ 16.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def unary_op_1(res: cute.Tensor, a: cute.Tensor):\n", + " a_vec = a.load()\n", + "\n", + " sqrt_res = cute.math.sqrt(a_vec)\n", + " res.store(sqrt_res)\n", + " cute.print_tensor(res) # prints [2.000000, 2.000000, 2.000000]\n", + "\n", + " sin_res = cute.math.sin(a_vec)\n", + " res.store(sin_res)\n", + " cute.print_tensor(res) # prints [-0.756802, -0.756802, -0.756802]\n", + "\n", + " exp2_res = cute.math.exp2(a_vec)\n", + " res.store(exp2_res)\n", + " cute.print_tensor(res) # prints [16.000000, 16.000000, 16.000000]\n", + "\n", + "a = np.array([4.0, 4.0, 4.0], dtype=np.float32)\n", + "res = np.empty((3,), dtype=np.float32)\n", + "unary_op_1(from_dlpack(res), from_dlpack(a))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Reduction Operation\n", + "\n", + "The `TensorSSA`'s `reduce` method applies a specified reduction operation (`ReductionOp.ADD`, `ReductionOp.MUL`, 
`ReductionOp.MAX`, `ReductionOp.MIN`) starting with an initial value, and performs this reduction along the dimensions specified by the `reduction_profile.`. The result is typically a new `TensorSSA` with reduced dimensions or a scalar value if reduces across all axes." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "21.000000\n", + "tensor(raw_ptr(0x00007ffd1ea2bca0: f32, rmem, align<32>) o (2):(1), data=\n", + " [ 6.000000, ],\n", + " [ 15.000000, ])\n", + "tensor(raw_ptr(0x00007ffd1ea2bcc0: f32, rmem, align<32>) o (3):(1), data=\n", + " [ 6.000000, ],\n", + " [ 8.000000, ],\n", + " [ 10.000000, ])\n" + ] + } + ], + "source": [ + "@cute.jit\n", + "def reduction_op(a: cute.Tensor):\n", + " \"\"\"\n", + " Apply reduction operation on the src tensor.\n", + "\n", + " :param src: The source tensor to be reduced.\n", + " \"\"\"\n", + " a_vec = a.load()\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 0.0,\n", + " reduction_profile=0\n", + " )\n", + " cute.printf(red_res) # prints 21.000000\n", + "\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 0.0,\n", + " reduction_profile=(None, 1)\n", + " )\n", + " # We can't print the TensorSSA directly at this point, so we store it to a new Tensor and print it.\n", + " res = cute.make_fragment(red_res.shape, cutlass.Float32)\n", + " res.store(red_res)\n", + " cute.print_tensor(res) # prints [6.000000, 15.000000]\n", + "\n", + " red_res = a_vec.reduce(\n", + " cute.ReductionOp.ADD,\n", + " 1.0,\n", + " reduction_profile=(1, None)\n", + " )\n", + " res = cute.make_fragment(red_res.shape, cutlass.Float32)\n", + " res.store(red_res)\n", + " cute.print_tensor(res) # prints [6.000000, 8.000000, 10.000000]\n", + "\n", + "\n", + "a = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)\n", + "reduction_op(from_dlpack(a))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": 
"Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/examples/python/00_basic_gemm.ipynb b/examples/python/deprecated/00_basic_gemm.ipynb similarity index 100% rename from examples/python/00_basic_gemm.ipynb rename to examples/python/deprecated/00_basic_gemm.ipynb diff --git a/examples/python/01_epilogue.ipynb b/examples/python/deprecated/01_epilogue.ipynb similarity index 100% rename from examples/python/01_epilogue.ipynb rename to examples/python/deprecated/01_epilogue.ipynb diff --git a/examples/python/02_pytorch_extension_grouped_gemm.ipynb b/examples/python/deprecated/02_pytorch_extension_grouped_gemm.ipynb similarity index 100% rename from examples/python/02_pytorch_extension_grouped_gemm.ipynb rename to examples/python/deprecated/02_pytorch_extension_grouped_gemm.ipynb diff --git a/examples/python/03_basic_conv2d.ipynb b/examples/python/deprecated/03_basic_conv2d.ipynb similarity index 100% rename from examples/python/03_basic_conv2d.ipynb rename to examples/python/deprecated/03_basic_conv2d.ipynb diff --git a/examples/python/04_epilogue_visitor.ipynb b/examples/python/deprecated/04_epilogue_visitor.ipynb similarity index 100% rename from examples/python/04_epilogue_visitor.ipynb rename to examples/python/deprecated/04_epilogue_visitor.ipynb diff --git a/examples/python/README.md b/examples/python/deprecated/README.md similarity index 100% rename from examples/python/README.md rename to examples/python/deprecated/README.md diff --git a/include/cute/arch/config.hpp b/include/cute/arch/config.hpp index 2383b4e6..8ec8ffb2 100644 --- a/include/cute/arch/config.hpp +++ b/include/cute/arch/config.hpp @@ -93,6 +93,7 @@ # define 
CUTE_ARCH_STSM_SM90_ENABLED #endif + #if (defined(CUTLASS_ARCH_MMA_SM100A_ENABLED) || defined(CUTLASS_ARCH_MMA_SM101A_ENABLED)) # define CUTE_ARCH_TCGEN05_S8_MMA_ENABLED #endif diff --git a/include/cute/arch/mma_sm100_umma.hpp b/include/cute/arch/mma_sm100_umma.hpp index 4b6d7f86..f754e266 100644 --- a/include/cute/arch/mma_sm100_umma.hpp +++ b/include/cute/arch/mma_sm100_umma.hpp @@ -1394,7 +1394,11 @@ struct SM100_MMA_MXF4_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1411,7 +1415,11 @@ struct SM100_MMA_MXF4_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.block32 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1457,7 +1465,11 @@ struct SM100_MMA_MXF4NVF4_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::1.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1475,7 +1487,11 @@ struct SM100_MMA_MXF4NVF4_SS_SPARSE 
"{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.block32 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::1.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1520,7 +1536,11 @@ struct SM100_MMA_MXF4_2x1SM_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1537,7 +1557,11 @@ struct SM100_MMA_MXF4_2x1SM_SS "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.block32 [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1582,7 +1606,11 @@ struct SM100_MMA_MXF4NVF4_2x1SM_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.block16 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::2.kind::mxf4nvf4.block_scale.scale_vec::4X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), 
"l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), @@ -1600,7 +1628,11 @@ struct SM100_MMA_MXF4NVF4_2x1SM_SS_SPARSE "{\n\t" ".reg .pred p;\n\t" "setp.ne.b32 p, %4, 0;\n\t" +#if (__CUDACC_VER_MAJOR__ > 12) || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) + "tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.block32 [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#else "tcgen05.mma.sp.cta_group::2.kind::mxf4.block_scale.scale_vec::2X [%0], %1, %2, [%7], %3, [%5], [%6], p; \n\t" +#endif "}\n" : : "r"(tmem_c), "l"(desc_a), "l"(desc_b), "r"(uint32_t(idescE>>32)), "r"(scaleC), diff --git a/include/cute/atom/copy_traits_sm100_tma.hpp b/include/cute/atom/copy_traits_sm100_tma.hpp index 851db289..0212db11 100644 --- a/include/cute/atom/copy_traits_sm100_tma.hpp +++ b/include/cute/atom/copy_traits_sm100_tma.hpp @@ -104,7 +104,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM100_TMA_2SM_LOAD before calling .with() @@ -192,7 +192,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM100_TMA_2SM_LOAD_MULTICAST_OP before calling .with() diff --git a/include/cute/atom/copy_traits_sm90_tma.hpp b/include/cute/atom/copy_traits_sm90_tma.hpp index ad668cee..209a8448 100644 --- a/include/cute/atom/copy_traits_sm90_tma.hpp +++ b/include/cute/atom/copy_traits_sm90_tma.hpp @@ -146,7 +146,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return 
make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD before calling .with() @@ -276,7 +276,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Don't try to execute a copy with SM90_TMA_LOAD_MULTICAST before calling .with() @@ -350,7 +350,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } // Construct new TMA_STORE with (unsafe) swapped out TMA descriptor ptr (for grouped gemm/ptr array gemm) @@ -463,7 +463,7 @@ struct Copy_Traits auto get_tma_tensor(GShape const& g_shape) const { static_assert(is_congruent::value); - return make_counting_tensor(make_layout(g_shape, aux_params_.g_stride_)); + return make_coord_tensor(make_layout(g_shape, aux_params_.g_stride_)); } template diff --git a/include/cute/tensor_impl.hpp b/include/cute/tensor_impl.hpp index 0d914488..e65ad419 100644 --- a/include/cute/tensor_impl.hpp +++ b/include/cute/tensor_impl.hpp @@ -474,14 +474,14 @@ make_fragment_like(Tensor const& tensor) } // -// make_counting_tensor +// make_coord_tensor // Make a tensor from a layout by binding it to a counting iter with 0-offset of the same profile as the codomain. 
// template ::value)> CUTE_HOST_DEVICE constexpr auto -make_counting_tensor(Layout const& layout) +make_coord_tensor(Layout const& layout) { return make_tensor(make_inttuple_iter(coprofile(layout)), layout); } @@ -496,7 +496,7 @@ CUTE_HOST_DEVICE constexpr auto make_identity_tensor(Shape const& shape) { - return make_counting_tensor(make_identity_layout(shape)); + return make_coord_tensor(make_identity_layout(shape)); } // diff --git a/include/cutlass/arch/config.h b/include/cutlass/arch/config.h index e5daf829..60be8d72 100644 --- a/include/cutlass/arch/config.h +++ b/include/cutlass/arch/config.h @@ -105,10 +105,8 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// - - // SM101 and SM101a -#if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)) +#if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8) #define CUTLASS_ARCH_MMA_SM101_SUPPORTED 1 #if (!defined(CUTLASS_ARCH_MMA_SM101_ENABLED) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1010) #define CUTLASS_ARCH_MMA_SM101_ENABLED 1 @@ -118,7 +116,7 @@ #endif // SM101f - #if (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9)) + #if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 9) #define CUTLASS_ARCH_MMA_SM101F_SUPPORTED 1 #endif @@ -130,6 +128,8 @@ ///////////////////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////////////////// + // SM120 and SM120a #if !CUTLASS_CLANG_CUDA && (__CUDACC_VER_MAJOR__ > 12 || (__CUDACC_VER_MAJOR__ == 12 && __CUDACC_VER_MINOR__ >= 8)) #define CUTLASS_ARCH_MMA_SM120_SUPPORTED 1 diff --git a/include/cutlass/epilogue/collective/builders/sm120_builder.inl b/include/cutlass/epilogue/collective/builders/sm120_builder.inl index e1c1bff8..80e84e9a 100644 --- 
a/include/cutlass/epilogue/collective/builders/sm120_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm120_builder.inl @@ -284,6 +284,46 @@ struct CallbacksBuilder< >; }; +// Overload CallbacksBuilder to pick the correct copy atoms for PtrArray epilogue fusions +template < + int StagesC, + int StagesD, + int FragmentSize, + bool ReuseSmemC, + bool DelayTmaStore, + int NumEpilogueWarpgroups, + class FusionOp, + class TileShape_MNK, + class EpilogueTile_MN, + class AccLoadOp, + class ElementAccumulator +> +struct CallbacksBuilder< + Sm120PtrArrayTmaWarpSpecialized, + FusionOp, + TileShape_MNK, + EpilogueTile_MN, + ElementAccumulator, + AccLoadOp, + cute::enable_if_t<(FusionOp::IsAuxOutSupported ^ FusionOp::IsAuxInSupported) // only one aux tensor + && not cute::is_subbyte_v> +> { + using GmemStrideTypeAux = gemm::TagToStrideC_t; + using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); + + using CopyOpR2S = decltype(detail::sm120_get_smem_store_op_for_accumulator()); + + using CopyOpS2R = decltype(detail::sm120_get_smem_load_op_for_source()); + + using SmemCopyOpAux = cute::conditional_t; + + using Callbacks = fusion::FusionCallbacks< + Sm120PtrArrayTmaWarpSpecialized, + FusionOp, TileShape_MNK, EpilogueTile_MN, + SmemLayoutAtomAux, SmemCopyOpAux + >; +}; // Helper for building TMA warp-specialized collective epilogues, specialized by // the fusion operation performed and the dispatch policy to use. 
diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index 9cb03fdc..ef0d7c4b 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -115,12 +115,13 @@ sm90_compute_tile_shape_or_override() { if constexpr (cute::is_same_v) { auto epi_tile = [&] () { if constexpr (detail::sm90_is_cooperative_v) { + auto tile_m = cute::min(_128{}, size<0>(TileShape_MNK{})); auto tile_n = cute::gcd(cute::min(_32{}, size<1>(TileShape_MNK{})), size<1>(TileShape_MNK{})); return make_shape(tile_m, tile_n); } else if constexpr (detail::sm90_is_warp_specialized_v) { - constexpr int N_perf = sizeof_bits_v == 8 ? 64 : 32; + constexpr int N_perf = (sizeof_bits_v == 8) && (size<1>(TileShape_MNK{}) % 64 == 0) ? 64 : 32; auto tile_m = cute::min(_64{}, size<0>(TileShape_MNK{})); auto tile_n = cute::gcd(cute::min(Int{}, size<1>(TileShape_MNK{})), size<1>(TileShape_MNK{})); return make_shape(tile_m, tile_n); @@ -194,9 +195,9 @@ struct CallbacksBuilder< using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpR2S = decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using SmemCopyOpAux = cute::conditional_t; using Callbacks = fusion::FusionCallbacks< @@ -234,9 +235,9 @@ struct CallbacksBuilder< using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpR2S = 
decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using CopyOpS2R = decltype(detail::sm90_get_smem_load_op_for_source< - GmemStrideTypeAux, typename FusionOp::ElementAux>()); + GmemStrideTypeAux, typename FusionOp::ElementAux, EpilogueTile_MN>()); using SmemCopyOpAux = cute::conditional_t; using Callbacks = fusion::FusionCallbacks< @@ -316,7 +317,17 @@ struct Sm90TmaBuilderImpl { >; // Get the smallest tiled copy we can use to retile the accumulators - using CopyAtomC = Copy_Atom; + // using CopyAtomC = Copy_Atom; + using CopyAtomC = cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 16 == 0, + Copy_Atom, + cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 8 == 0, + Copy_Atom, + void + > + >; + static_assert(!cute::is_same_v, "CopyAtomC can't be void, divisiblity check for EpilogueTile_MN failed"); // Get register to register tiled copy that happen before shared memory store. // Apply void as no register transform op needed currently. 
using CopyOpR2R = void; @@ -343,10 +354,10 @@ struct Sm90TmaBuilderImpl { FusionCallbacks, CopyOpG2S, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_load_op_for_source()), + decltype(detail::sm90_get_smem_load_op_for_source()), CopyOpS2G, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_store_op_for_accumulator()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), CopyAtomC, CopyOpR2R >; @@ -404,7 +415,7 @@ struct AuxLoadDescriptor { >() ); using CopyOpS2R = - decltype(detail::sm90_get_smem_load_op_for_source()); + decltype(detail::sm90_get_smem_load_op_for_source()); }; // Get Stride, SmemLayout, and CopyOpS2R for AuxStore node @@ -425,7 +436,7 @@ struct AuxStoreDescriptor { >() ); using CopyOpR2S = - decltype(detail::sm90_get_smem_store_op_for_accumulator()); + decltype(detail::sm90_get_smem_store_op_for_accumulator()); }; } // namespace detail @@ -745,7 +756,7 @@ private: using SmemLayoutAtomAux = decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom< GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>()); using SmemCopyOpAux = decltype(detail::sm90_get_smem_store_op_for_accumulator< - GmemStrideTypeAux, typename Schedule::ElementT>()); + GmemStrideTypeAux, typename Schedule::ElementT, EpilogueTile_MN>()); using FusionOperationAux = fusion::LinCombPerRowBiasEltActAux< GmemLayoutTagD, Schedule::template ActivationFunctor, ElementD, ElementCompute, typename Schedule::ElementT, typename Schedule::ElementBias, ElementC_, ElementCompute @@ -769,7 +780,17 @@ private: using GmemStrideTypeD = gemm::TagToStrideC_t; // Get the smallest tiled copy we can use to retile the accumulators - using CopyAtomC = Copy_Atom; + using CopyAtomC = cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 16 == 0, + Copy_Atom, + cute::conditional_t< + size<1>(EpilogueTile_MN{}) % 8 == 0, + Copy_Atom, + void + > + >; + static_assert(!cute::is_same_v, "CopyAtomC can't 
be void, divisiblity check for EpilogueTile_MN failed"); + // Get register to register tiled copy that happen before shared memory store. // Apply void as no register transform op needed. using CopyOpR2R = void; @@ -788,10 +809,10 @@ public: cute::conditional_t, SM90_TMA_LOAD, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_load_op_for_source()), + decltype(detail::sm90_get_smem_load_op_for_source()), SM90_TMA_STORE, decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), - decltype(detail::sm90_get_smem_store_op_for_accumulator()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()), CopyAtomC, CopyOpR2R >; diff --git a/include/cutlass/epilogue/collective/builders/sm90_common.inl b/include/cutlass/epilogue/collective/builders/sm90_common.inl index a6affcfc..c0a90396 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_common.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_common.inl @@ -37,16 +37,26 @@ namespace cutlass::epilogue::collective::detail { /////////////////////////////////////////////////////////////////////////////// // Selects the largest vectorized smem store atom available -template +template constexpr auto sm90_get_smem_store_op_for_accumulator() { using namespace cute; if constexpr (sizeof(ElementD) == 2 && size<0>(GmemStrideTypeD{}) == 1) { - return SM90_U16x8_STSM_T{}; + if constexpr (size<1>(EpilogueTile_MN{}) % 16 == 0) { + return SM90_U16x8_STSM_T{}; + } + else if constexpr (size<1>(EpilogueTile_MN{}) % 8 == 0) { + return SM90_U16x4_STSM_T{}; + } } else if constexpr (sizeof(ElementD) == 2 && size<1>(GmemStrideTypeD{}) == 1) { - return SM90_U32x4_STSM_N{}; + if constexpr (size<1>(EpilogueTile_MN{}) % 16 == 0) { + return SM90_U32x4_STSM_N{}; + } + else if constexpr (size<1>(EpilogueTile_MN{}) % 8 == 0) { + return SM90_U32x2_STSM_N{}; + } } else { // auto-vectorizing store @@ -55,20 +65,26 @@ sm90_get_smem_store_op_for_accumulator() { } // Selects the 
largest vectorized smem load atom available -template +template constexpr auto sm90_get_smem_load_op_for_source() { using namespace cute; // Reuse the logic from smem store selector - using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator()); + using SmemStoreOp = decltype(sm90_get_smem_store_op_for_accumulator()); if constexpr (cute::is_same_v) { return SM75_U16x8_LDSM_T{}; } + else if constexpr (cute::is_same_v) { + return SM75_U16x4_LDSM_T{}; + } else if constexpr (cute::is_same_v) { return SM75_U32x4_LDSM_N{}; } + else if constexpr (cute::is_same_v) { + return SM75_U32x2_LDSM_N{}; + } else { // auto-vectorizing load return AutoVectorizingCopyWithAssumedAlignment<128>{}; diff --git a/include/cutlass/epilogue/collective/default_epilogue.hpp b/include/cutlass/epilogue/collective/default_epilogue.hpp index 0d019b1c..ed34bc10 100644 --- a/include/cutlass/epilogue/collective/default_epilogue.hpp +++ b/include/cutlass/epilogue/collective/default_epilogue.hpp @@ -215,8 +215,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(blk_shape_MNK), make_coord(m_coord, n_coord)); // (BLK_M,BLK_N) Tensor tCcD_mn = thr_mma.partition_C(cD_mn); // (VEC,THR_M,THR_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (BLK_M,BLK_N) - Tensor tCcD = make_counting_tensor(tCcD_mn.layout()); // (VEC,THR_M,THR_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (BLK_M,BLK_N) + Tensor tCcD = make_coord_tensor(tCcD_mn.layout()); // (VEC,THR_M,THR_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = shape_MN - cD_mn(_0{}); // (m,n) auto residue_tCcD = shape_MN - tCcD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp index 0ed7d6b9..1f0a915d 100644 --- 
a/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_array_tma_warpspecialized.hpp @@ -286,12 +286,8 @@ public: void* workspace) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); - // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. - // These will be replaced with correct values before the initial tma load. - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; + auto init_M = int32_t(size<0>(CtaTileShape{})); + auto init_N = int32_t(size<1>(CtaTileShape{})); auto init_L = 1; InternalStrideC stride_c; @@ -745,8 +741,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) @@ -786,8 +782,8 @@ public: [[maybe_unused]] bool reverse_epi_n = ReuseTmem && acc_pipe_consumer_state.phase() == 0; static_assert(not (ReuseTmem && AccumulatorPipeline::Stages != 1), "Tmem 
reuse requires 1 accumulator stage"); - // Predication for TMA store (one warp issues TMA store) - bool issue_tma_store = warp_idx == 0; + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = (warp_idx == 0) && cute::elect_one_sync(); // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. // The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can @@ -1118,8 +1114,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp index c2172798..e32cdfa4 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_nosmem.hpp @@ -606,8 +606,8 @@ public: // Construct the EVT consumer callbacks auto residue_cD = make_coord(M,N) - cD(_0{}); auto residue_tTR_cD = make_coord(M,N) - tTR_cD(_0{}); - Tensor cD_ = make_counting_tensor(cD.layout()); - Tensor tTR_cD_ = make_counting_tensor(tTR_cD.layout()); + Tensor cD_ = make_coord_tensor(cD.layout()); + Tensor 
tTR_cD_ = make_coord_tensor(tTR_cD.layout()); constexpr bool RefSrc = false; Tensor mC = make_tensor(make_gmem_ptr(params.ptr_C), make_shape(M,N,L), params.dC); diff --git a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp index 3f445bf5..412a4b7b 100644 --- a/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm100_epilogue_tma_warpspecialized.hpp @@ -695,8 +695,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) @@ -1065,8 +1065,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(cta_tile_mnk), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tTR_cD_mn = thread_t2r.partition_D(flat_divide(cD_mn, EpilogueTile{})); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tTR_cD = make_counting_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tTR_cD = make_coord_tensor(tTR_cD_mn.layout()); // (T2R,T2R_M,T2R_N,EPI_M,EPI_N) // Subtract the global "bottom 
right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tTR_cD = make_coord(M,N) - tTR_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp index 41c95f16..af53a1c6 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_array_tma_warpspecialized.hpp @@ -304,11 +304,9 @@ public: [[maybe_unused]] void* workspace) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(append<4>(typename ProblemShape::UnderlyingProblemShape{}, 1), int32_t(1)); - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_L = get<3>(init_shape); + auto init_M = int32_t(size<0>(CtaTileMNK{})); + auto init_N = int32_t(size<1>(CtaTileMNK{})); + auto init_L = 1; static_assert(!is_im2col_C and !is_im2col_D, "Im2Col not supported on C or D"); @@ -324,8 +322,6 @@ public: auto problem_shape_MNKL = append<4>(problem_shape.get_host_problem_shape(0), 1); init_M = get<0>(problem_shape_MNKL); init_N = get<1>(problem_shape_MNKL); - init_L = get<3>(problem_shape_MNKL); - stride_c = args.dC; stride_d = args.dD; } @@ -755,8 +751,8 @@ public: Tensor cD_mn = local_tile(mD_crd, take<0,2>(CtaTileMNK{}), make_coord(m_coord, n_coord)); // (CTA_M,CTA_N) Tensor tRS_cD_mn = thread_r2s.partition_S(flat_divide(cD_mn, EpilogueTile{})); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // 
(R2S,R2S_M,R2S_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) @@ -803,8 +799,8 @@ public: // to ensure visibility of smem reads/writes to threads or TMA unit auto synchronize = [&] () { cutlass::arch::NamedBarrier::sync(size(TiledMma{}), cutlass::arch::ReservedNamedBarriers::EpilogueBarrier); }; - // Predication for TMA store (one warp issues TMA store) - bool issue_tma_store = (thread_idx / NumThreadsPerWarp) == 0; + // Predication for TMA store (a single thread from one warp issues TMA store) + bool issue_tma_store = ((thread_idx / NumThreadsPerWarp) == 0) && cute::elect_one_sync(); // In the reuse smem configuration we have StagesC smem buffers and at most StagesD committed TMA stores in flight. 
// The TMA store pipeline producer acquire returns when at most StagesD-1 committed stores are in-flight, so we can diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index f244fafa..062b9a8b 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -662,8 +662,8 @@ public: } }(); // Relative coordinate tensors (static) - Tensor cD = make_counting_tensor(cD_mn.layout()); // (CTA_M,CTA_N) - Tensor tRS_cD = make_counting_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) + Tensor cD = make_coord_tensor(cD_mn.layout()); // (CTA_M,CTA_N) + Tensor tRS_cD = make_coord_tensor(tRS_cD_mn.layout()); // (R2S,R2S_M,R2S_N,EPI_M,EPI_N) // Subtract the global "bottom right" corner from the local "top left" corner to get the max relative coordinate auto residue_cD = make_coord(M,N) - cD_mn(_0{}); // (m,n) auto residue_tRS_cD = make_coord(M,N) - tRS_cD_mn(_0{}); // (m,n) diff --git a/include/cutlass/epilogue/thread/activation.h b/include/cutlass/epilogue/thread/activation.h index c3abfdff..265a75ee 100644 --- a/include/cutlass/epilogue/thread/activation.h +++ b/include/cutlass/epilogue/thread/activation.h @@ -496,13 +496,29 @@ template <> struct HardSwish { using T = float; static const bool kIsHeavy = false; + static constexpr float kOneSixth = 0.16666667f; CUTLASS_HOST_DEVICE T operator()(T const &x) const { minimum mn; maximum mx; T relu6 = mn(mx(x + T(3), T(0)), T(6)); - return x * relu6 * 0.16666667f; + return x * relu6 * kOneSixth; + } +}; + +template <> +struct HardSwish { + using T = cutlass::half_t; + static const bool kIsHeavy = false; + static constexpr float kOneSixth = 0.16666667f; + + CUTLASS_HOST_DEVICE + T operator()(T const &x) const { + minimum mn; + maximum mx; + T relu6 = mn(mx(x + T(3), T(0)), T(6)); + return x * relu6 * 
T(kOneSixth); } }; @@ -524,22 +540,6 @@ struct HardSwish > { } }; -template -struct HardSwish > { - using T = half_t; - static const bool kIsHeavy = false; - - CUTLASS_HOST_DEVICE - Array operator()(Array const &value) const { - minimum > mn; - maximum > mx; - multiplies > mul; - plus > add; - - return mul(mul(mn(mx(add(value, T(3)), T(0)), T(6)), value), T(0.16666667f)); - } -}; - template using ScaledHardSwish = Scale>; diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h index 5a9a9888..628a8077 100644 --- a/include/cutlass/functional.h +++ b/include/cutlass/functional.h @@ -722,7 +722,7 @@ struct has_unqualified_conj : cutlass::platform::false_type template struct has_unqualified_conj< T, - decltype(conj(cutlass::platform::declval()), void()) + decltype(static_cast(conj(cutlass::platform::declval())), void()) > : cutlass::platform::true_type {}; diff --git a/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl b/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl index dc99eb3d..8617e883 100644 --- a/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl +++ b/include/cutlass/gemm/collective/builders/sm100_blockwise_umma_builder.inl @@ -129,7 +129,10 @@ auto sm100_make_simt_gmem_tiled_copy_SFA() { using ScaleCopyTypeA = cute::uint_byte_t; using SmemScalingCopyAtomA = Copy_Atom, Element>; constexpr int ElementsPerSFACopy = static_cast(sizeof(ScaleCopyTypeA) / sizeof(Element)); - return make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>>{}); + return make_tiled_copy( + SmemScalingCopyAtomA{}, + Layout>{}, // 32 threads + Layout, Int>>, Stride>>{}); } else { using SmemScalingCopyAtomA = Copy_Atom, Element>; @@ -138,9 +141,8 @@ auto sm100_make_simt_gmem_tiled_copy_SFA() { } else { // we expect scale Ks per tile to be small - constexpr int LeadingScalesPerTileSFA = ScaleKsPerTile; using SmemScalingCopyAtomA = Copy_Atom, Element>; - return make_tiled_copy(SmemScalingCopyAtomA{}, 
Layout>>{}, Layout>{}); + return make_tiled_copy(SmemScalingCopyAtomA{}, Layout>{}, Layout>{}); } } @@ -161,7 +163,10 @@ auto sm100_make_simt_gmem_tiled_copy_SFB() { using ScaleCopyTypeB = cute::uint_byte_t; using SmemScalingCopyAtomB = Copy_Atom, Element>; constexpr int ElementsPerSFBCopy = static_cast(sizeof(ScaleCopyTypeB) / sizeof(Element)); - return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>>{}); + return make_tiled_copy( + SmemScalingCopyAtomB{}, + Layout>{}, // 32 threads + Layout, Int>>, Stride>>{}); } else { using SmemScalingCopyAtomB = Copy_Atom, Element>; @@ -170,9 +175,8 @@ auto sm100_make_simt_gmem_tiled_copy_SFB() { } else { // we expect scale Ks per tile to be small - constexpr int LeadingScalesPerTileSFB = ScaleKsPerTile; using SmemScalingCopyAtomB = Copy_Atom, Element>; - return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>>{}, Layout>{}); + return make_tiled_copy(SmemScalingCopyAtomB{}, Layout>{}, Layout>{}); } } diff --git a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl index b6c489da..c75af3ac 100644 --- a/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl +++ b/include/cutlass/gemm/collective/builders/sm90_gmma_builder.inl @@ -260,7 +260,9 @@ struct CollectiveBuilder< GmmaMajorB, ElementBMma, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? 
+ sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = detail::compute_stage_count_or_override(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? + sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = IsMixedInput ? @@ -570,7 +574,9 @@ struct CollectiveBuilder< GmmaMajorB, ElementB, decltype(cute::get<1>(TileShape_MNK{})), decltype(cute::get<2>(TileShape_MNK{}))>()); static constexpr size_t TensorMapStorage = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; - static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? + sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int Sm90ReducedSmemCapacityBytes = detail::sm90_smem_capacity_bytes - KernelSmemCarveout; static constexpr int PipelineStages = detail::compute_stage_count_or_override(TensorMapStorage) + static_cast(TileSchedulerCarveout); + static constexpr size_t SchedulerPipelineStorage = cute::is_pointer_v> ? 
+ sizeof(cutlass::PipelineDetail::PipelineAsyncSharedStorage<8>) : 0; + static constexpr int KernelSmemCarveout = static_cast(TensorMapStorage + SchedulerPipelineStorage); static constexpr int ScaleMsPerTile = size<0>(TileShape_MNK{}) / ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape_MNK{}) / ScaleGranularityN; diff --git a/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp b/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp index bdc877bd..b51d1256 100644 --- a/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm100_blockscaled_mma_array_warpspecialized.hpp @@ -427,10 +427,9 @@ struct CollectiveMma< cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) { // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. 
- constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_K = tma_alignment_bits; + auto init_M = int32_t(size<0>(TileShape{})); + auto init_N = int32_t(size<1>(TileShape{})); + auto init_K = int32_t(size<2>(TileShape{})); auto init_L = 1; // Tensor pointers will be fixed before the first access diff --git a/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp index bb05b52a..8fc171e8 100644 --- a/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_array_warpspecialized_blockwise_scaling.hpp @@ -190,8 +190,14 @@ struct CollectiveMma< using TransformB = TransformB_; using ArchTag = typename DispatchPolicy::ArchTag; - static constexpr int AlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); - static constexpr int AlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + + static constexpr int AlignmentSFA = CopyAlignmentSFA * (GmemTiledCopySFA::AtomNumVal::value > 1 ? + (size<0,1>(InternalLayoutSFA{}.stride()) == 1 ? ScaleGranularityM : ScaleGranularityK) : 1); + static constexpr int AlignmentSFB = CopyAlignmentSFB * (GmemTiledCopySFB::AtomNumVal::value > 1 ? + (size<0,1>(InternalLayoutSFB{}.stride()) == 1 ? 
ScaleGranularityN : ScaleGranularityK) : 1); + using MainloopABPipeline = cutlass::PipelineTmaUmmaAsync< DispatchPolicy::Stages, @@ -522,8 +528,8 @@ struct CollectiveMma< auto [M,N,K,L] = problem_shape_MNKL; implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), InternalStrideA{}); implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), InternalStrideB{}); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFA(problem_shape_MNKL)); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFB(problem_shape_MNKL)); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFA(problem_shape_MNKL)); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(ScaleConfig::tile_atom_to_shape_SFB(problem_shape_MNKL)); if (!implementable_sf) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for Scale Factors.\n"); } diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp index c8a5367a..d86c58be 100644 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp @@ -201,8 +201,14 @@ struct CollectiveMma< AtomThrShapeMNK>; using AccumulatorPipelineState = typename AccumulatorPipeline::PipelineState; - static constexpr int AlignmentSFA = GmemTiledCopySFA::AtomNumVal::value * sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); - static constexpr int AlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFA = GmemTiledCopySFA::AtomNumVal::value 
* sizeof(typename GmemTiledCopySFA::ValType) / sizeof(ElementAccumulator); + static constexpr int CopyAlignmentSFB = GmemTiledCopySFB::AtomNumVal::value * sizeof(typename GmemTiledCopySFB::ValType) / sizeof(ElementAccumulator); + + static constexpr int AlignmentSFA = CopyAlignmentSFA * (GmemTiledCopySFA::AtomNumVal::value > 1 ? + (size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleGranularityM : ScaleGranularityK) : 1); + static constexpr int AlignmentSFB = CopyAlignmentSFB * (GmemTiledCopySFB::AtomNumVal::value > 1 ? + (size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleGranularityN : ScaleGranularityK) : 1); + // Two arrivals per thread in the warp (1 arrival and 1 arrival through cp.async.mbarrier) static constexpr int NumMainloopSFProducerThreadEvents = 64; @@ -566,8 +572,8 @@ struct CollectiveMma< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); } - bool implementable_sf = cutlass::detail::check_alignment(args.layout_SFA); - implementable_sf = implementable_sf && cutlass::detail::check_alignment(args.layout_SFB); + bool implementable_sf = cutlass::detail::check_alignment(args.layout_SFA); + implementable_sf = implementable_sf && cutlass::detail::check_alignment(args.layout_SFB); if (!implementable_sf) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for Scale Factors.\n"); diff --git a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp deleted file mode 100644 index f8d1a00a..00000000 --- a/include/cutlass/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp +++ /dev/null @@ -1,824 +0,0 @@ -/*************************************************************************************************** - * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
- * SPDX-License-Identifier: BSD-3-Clause - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, this - * list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * - **************************************************************************************************/ - - - - -#pragma once -#include - -#include "cutlass/cutlass.h" -#include "cutlass/gemm/gemm.h" -#include "cutlass/gemm/dispatch_policy.hpp" -#include "cutlass/pipeline/pipeline.hpp" -#include "cutlass/numeric_conversion.h" -#include "cutlass/detail/sm100_tmem_helper.hpp" -#include "cutlass/detail/cluster.hpp" - -#include "cute/algorithm/functional.hpp" -#include "cute/arch/cluster_sm90.hpp" -#include "cute/atom/mma_atom.hpp" -#include "cute/atom/copy_atom.hpp" -#include "cute/algorithm/gemm.hpp" -#include "cute/tensor_predicate.hpp" -#include "cute/arch/mma_sm100.hpp" -#include "cutlass/trace.h" -#include "cutlass/kernel_hardware_info.hpp" - -///////////////////////////////////////////////////////////////////////////////////////////////// - -namespace cutlass::gemm::collective { -using namespace cute; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -// WarpSpecialized Mainloop for Mixed Input Kernels -template < - int Load2TransformPipelineStageCount_, - int Transform2MmaPipelineStageCount_, - int SchedulerPipelineStageCount_, - int AccumulatorPipelineStageCount_, - class ClusterShape, - class TileShape_, - class ElementA_, - class StrideA_, - class ElementB_, - class StrideB_, - class TiledMma_, - class GmemTiledCopyA_, - class SmemLayoutAtomsA_, - class CopyAtomsA_, - class TransformA_, - class GmemTiledCopyB_, - class SmemLayoutAtomsB_, - class CopyAtomsB_, - class TransformB_> -struct CollectiveMma< - MainloopSm100TmaUmmaWarpSpecializedMixedInput< - Load2TransformPipelineStageCount_, - Transform2MmaPipelineStageCount_, - SchedulerPipelineStageCount_, - AccumulatorPipelineStageCount_, - ClusterShape>, - TileShape_, - ElementA_, - StrideA_, - ElementB_, - StrideB_, - TiledMma_, - GmemTiledCopyA_, - SmemLayoutAtomsA_, - CopyAtomsA_, - TransformA_, - GmemTiledCopyB_, - SmemLayoutAtomsB_, - CopyAtomsB_, - 
TransformB_> -{ - // - // Type Aliases - // - - // Determine MMA type: MMA_1SM vs MMA_2SM - using AtomThrShapeMNK = Shape(typename TiledMma_::ThrLayoutVMNK{})), _1, _1>; - using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedMixedInput< - Load2TransformPipelineStageCount_, - Transform2MmaPipelineStageCount_, - SchedulerPipelineStageCount_, - AccumulatorPipelineStageCount_, - ClusterShape>; - using TileShape = TileShape_; - using TiledMma = TiledMma_; - static constexpr bool IsDynamicCluster = not cute::is_static_v; - using CtaShape_MNK = decltype(shape_div(TileShape{}, AtomThrShapeMNK{})); - - // Define A and B block shapes for reduced size TMA_LOADs - using CtaShapeA_MK = decltype(partition_shape_A(TiledMma{}, make_shape(size<0>(TileShape{}), size<2>(TileShape{})))); - using CtaShapeB_NK = decltype(partition_shape_B(TiledMma{}, make_shape(size<1>(TileShape{}), size<2>(TileShape{})))); - - using ElementA = ElementA_; - using StrideA = StrideA_; - using ElementAMma = typename TiledMma::ValTypeA; - - static constexpr int IsSubbyteA = cute::sizeof_bits_v < 8; - using TmaElementA = cute::conditional_t; - - using ElementB = ElementB_; - using StrideB = StrideB_; - using ElementBMma = typename TiledMma::ValTypeB; - - using StrideScale = cute::Stride, int64_t, int64_t>; - using NonVoidStrideScale = cute::conditional_t< - cute::is_void_v, cute::Stride<_1, int64_t, int64_t>, StrideScale>; - - using ElementAccumulator = typename TiledMma::ValTypeC; - using GmemTiledCopyA = GmemTiledCopyA_; - using GmemTiledCopyB = GmemTiledCopyB_; - using SmemLayoutAtomsA = SmemLayoutAtomsA_; - using SmemLayoutAtomsB = SmemLayoutAtomsB_; - using CopyAtomsA = CopyAtomsA_; - using CopyAtomsB = CopyAtomsB_; - using TransformA = TransformA_; - using TransformB = TransformB_; - using ArchTag = typename DispatchPolicy::ArchTag; - - static_assert(sizeof(ElementA) < 2, "Matrix to be scaled should be provided in A otherwise input is not supported"); - static_assert(cute::is_same_v || 
cute::is_same_v || cute::is_same_v, "Compute type A should be cutlass::bfloat16_t or cutlass::half_t or cutlass::float_e4m3_t"); - - using Load2TransformPipeline = cutlass::PipelineTmaTransformAsync< - DispatchPolicy::Load2TransformPipelineStageCount, - AtomThrShapeMNK>; - using Load2TransformPipelineState = typename Load2TransformPipeline::PipelineState; - - using Transform2MmaPipeline = cutlass::PipelineUmmaConsumerAsync< - DispatchPolicy::Transform2MmaPipelineStageCount, - AtomThrShapeMNK>; - using Transform2MmaPipelineState = typename Transform2MmaPipeline::PipelineState; - - using Mma2AccumPipeline = cutlass::PipelineUmmaAsync< - DispatchPolicy::Schedule::AccumulatorPipelineStageCount, - AtomThrShapeMNK>; - using Mma2AccumPipelineState = typename Mma2AccumPipeline::PipelineState; - - // Thread Counts - static constexpr uint32_t NumAccumThreads = 128; //Maintains compatibility with input_transform kernel - static constexpr uint32_t NumTransformationThreads = 128; - - // Get the Algorithm parameters - constexpr static int AccumulatorPipelineStageCount = DispatchPolicy::Schedule::AccumulatorPipelineStageCount; - constexpr static int StagesPerTile = size<2>(CtaShapeA_MK{}); - - using SmemLayoutAtomA = typename SmemLayoutAtomsA::InputLayoutAtom; - using SmemLayoutAtomACompute = typename SmemLayoutAtomsA::ComputeLayoutAtom; - using SmemLayoutAtomB = typename SmemLayoutAtomsB::InputLayoutAtom; - using SmemLayoutAtomBCompute = typename SmemLayoutAtomsB::ComputeLayoutAtom; - - using InputCopyAtomA = typename CopyAtomsA::InputCopyAtom; - using ComputeCopyAtomA = typename CopyAtomsA::ComputeCopyAtom; - using InputCopyAtomB = typename CopyAtomsB::InputCopyAtom; - using ComputeCopyAtomB = typename CopyAtomsB::ComputeCopyAtom; - - static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert(((size<0,0>(CtaShapeA_MK{}) * size<1>(CtaShapeA_MK{})) % size<0>(SmemLayoutAtomACompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile 
shape."); - static_assert(((size<0,1>(CtaShapeA_MK{}) * size<2>(CtaShapeA_MK{})) % size<1>(SmemLayoutAtomACompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - - static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); - static_assert(((size<0,0>(CtaShapeB_NK{}) * size<1>(CtaShapeB_NK{})) % size<0>(SmemLayoutAtomBCompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - static_assert(((size<0,1>(CtaShapeB_NK{}) * size<2>(CtaShapeB_NK{})) % size<1>(SmemLayoutAtomBCompute{})) == 0, "SmemLayoutAtomCompute must evenly divide tile shape."); - - // Tile along K mode first before tiling over MN. PIPE mode last as usual. - // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. - using SmemLayoutA = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomA{}, - append(CtaShapeA_MK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - using SmemLayoutACompute = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomACompute{}, - append(CtaShapeA_MK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - using SmemLayoutB = decltype(UMMA::tile_to_mma_shape( - SmemLayoutAtomB{}, - append(CtaShapeB_NK{}, Int{}), - (cute::conditional_t(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}))); - - static_assert(DispatchPolicy::Load2TransformPipelineStageCount >= 2 && DispatchPolicy::Load2TransformPipelineStageCount >= 2, - "Specialization requires Stages set to value 2 or more."); - static_assert((cute::is_base_of::value || - cute::is_base_of::value ) && - cute::is_base_of::value, - "MMA atom must A operand from SMEM or TMEM and B operand from SMEM for this mainloop."); - static_assert((cute::is_same_v || cute::is_same_v), - "GmemTiledCopyA - invalid TMA copy atom specified."); - static_assert((cute::is_same_v || cute::is_same_v), - "GmemTiledCopyB - invalid TMA copy atom specified."); - - struct PipelineStorage { - using Load2TransformPipelineStorage = 
typename Load2TransformPipeline::SharedStorage; - alignas(16) Load2TransformPipelineStorage load2transform_pipeline; - using Transform2MmaPipelineStorage = typename Transform2MmaPipeline::SharedStorage; - alignas(16) Transform2MmaPipelineStorage transform2mma_pipeline; - using Mma2AccumPipelineStorage = typename Mma2AccumPipeline::SharedStorage; - alignas(16) Mma2AccumPipelineStorage mma2accum_pipeline; - }; - - struct SharedStorage { - struct TensorStorage : cute::aligned_struct<128, _0> { - - struct TensorStorageUntransformed { - cute::ArrayEngine> smem_A; - cute::ArrayEngine> smem_B; - }; - - struct TensorStorageTransformedAinSmem { - alignas(1024) cute::ArrayEngine> smem_ACompute; - alignas(1024) cute::ArrayEngine> smem_BCompute; - }; - - union TensorStorageTransformedAinTmem { - alignas(1024) cute::ArrayEngine smem_ACompute; // No smem_ACompute - alignas(1024) cute::ArrayEngine> smem_BCompute; - }; - - using TensorStorageTransformed = cute::conditional_t< - cute::is_base_of::value, - TensorStorageTransformedAinSmem, - TensorStorageTransformedAinTmem>; - - TensorStorageUntransformed input; - TensorStorageTransformed compute; - } tensors; - - PipelineStorage pipeline; - }; - using TensorStorage = typename SharedStorage::TensorStorage; - - // Different from other GEMM kernels, both CTAs should be aware of loads. 
Both CTAs will work on - // loaded input A and B matrices to convert the data type - static constexpr uint32_t TmaTransactionBytes = - cutlass::bits_to_bytes(size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * size<2>(SmemLayoutA{}) * static_cast(sizeof_bits::value))+ - cutlass::bits_to_bytes(size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * size<2>(SmemLayoutB{}) * static_cast(sizeof_bits::value)); - - // Host side kernel arguments - struct Arguments { - ElementA const* ptr_A{nullptr}; - StrideA dA{}; - ElementB const* ptr_B{nullptr}; - StrideB dB{}; - }; - - // Device side kernel params - struct Params { - using ClusterLayout_VMNK = decltype(tiled_divide(make_layout(conditional_return(make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})), - make_tile(typename TiledMma::AtomThrID{}))); - - using TMA_A = decltype(make_tma_atom_A_sm100( - GmemTiledCopyA{}, - make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - ClusterLayout_VMNK{}) - ); - using TMA_B = decltype(make_tma_atom_B_sm100( - GmemTiledCopyB{}, - make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), - SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - ClusterLayout_VMNK{}) - ); - TMA_A tma_load_a; - TMA_B tma_load_b; - TMA_A tma_load_a_fallback; - TMA_B tma_load_b_fallback; - dim3 cluster_shape_fallback; - }; - - CUTLASS_DEVICE - CollectiveMma(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster) - : cluster_shape_(cluster_shape) - , block_rank_in_cluster_(block_rank_in_cluster) { - if constexpr (IsDynamicCluster) { - const bool is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x && - cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); - observed_tma_load_a_ = is_fallback_cluster ? ¶ms.tma_load_a_fallback : ¶ms.tma_load_a; - observed_tma_load_b_ = is_fallback_cluster ? 
¶ms.tma_load_b_fallback : ¶ms.tma_load_b; - } - else { - observed_tma_load_a_ = ¶ms.tma_load_a; - observed_tma_load_b_ = ¶ms.tma_load_b; - } - } - - template - static constexpr Params - to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) { - (void) workspace; - - // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - Tensor tensor_a = make_tensor(args.ptr_A, make_layout(make_shape(M,K,L), args.dA)); - Tensor tensor_b = make_tensor(args.ptr_B, make_layout(make_shape(N,K,L), args.dB)); - - auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape); - // Cluster layout for TMA construction - auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{})); - - auto cluster_shape_fallback = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback); - // Cluster layout for TMA construction - auto cluster_layout_vmnk_fallback = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMma::AtomThrID{})); - - typename Params::TMA_A tma_load_a = make_tma_atom_A_sm100( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk); - - typename Params::TMA_B tma_load_b = make_tma_atom_B_sm100( - GmemTiledCopyB{}, - tensor_b, - SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk); - - typename Params::TMA_A tma_load_a_fallback = make_tma_atom_A_sm100( - GmemTiledCopyA{}, - tensor_a, - SmemLayoutA{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk_fallback); - - typename Params::TMA_B tma_load_b_fallback = make_tma_atom_B_sm100( - GmemTiledCopyB{}, - tensor_b, - 
SmemLayoutB{}(_,_,_,cute::Int<0>{}), - TileShape{}, - TiledMma{}, - cluster_layout_vmnk_fallback); - - return { - tma_load_a, - tma_load_b, - tma_load_a_fallback, - tma_load_b_fallback, - hw_info.cluster_shape_fallback - }; - } - - template - static bool - can_implement( - ProblemShape const& problem_shape, - [[maybe_unused]] Arguments const& args) { - constexpr int tma_alignment_bits = 128; - auto problem_shape_MNKL = append<4>(problem_shape, 1); - auto [M,N,K,L] = problem_shape_MNKL; - - bool implementable = true; - constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); - constexpr int min_tma_aligned_elements_B = tma_alignment_bits / cutlass::sizeof_bits::value; - implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(N,K,L), StrideB{}); - - if (!implementable) { - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); - } - return implementable; - } - - /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance - CUTLASS_DEVICE static void - prefetch_tma_descriptors(Params const& params) { - if constexpr (IsDynamicCluster) { - dim3 cs = cute::cluster_shape(); - const bool is_fallback_cluster = (cs.x == params.cluster_shape_fallback.x && cs.y == params.cluster_shape_fallback.y); - if (is_fallback_cluster) { - cute::prefetch_tma_descriptor(params.tma_load_a_fallback.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b_fallback.get_tma_descriptor()); - } - else { - cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); - } - } - else { - cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); - cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); - } - } - - /// Construct A 
Single Stage's Accumulator Shape - CUTLASS_DEVICE auto - partition_accumulator_shape() { - auto acc_shape = partition_shape_C(TiledMma{}, take<0,2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N) - - return acc_shape; - } - - /// Produce the inputs to the transform threads by loading inputs from gmem -> smem - template < - class GTensorA, class GTensorB, - class GTensorPartitionedA, class GTensorPartitionedB, - class STensorA, class STensorB, - class TileCoordMNKL, - class KTileIterator - > - CUTLASS_DEVICE auto - load( - Params const& params, - Load2TransformPipeline pipeline, - Load2TransformPipelineState load2xform_pipeline_state, - cute::tuple const& load_inputs, - TileCoordMNKL const& cta_coord_mnkl, - KTileIterator k_tile_iter, int k_tile_count) { - - auto [unused_gA, unused_gB, - tAgA_mkl, tBgB_nkl, tAsA, tBsB, - mcast_mask_a, mcast_mask_b] = load_inputs; - - // slice out the work coord from tiled tensors - Tensor tAgA = tAgA_mkl(_, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _, get<3>(cta_coord_mnkl)); - Tensor tBgB = tBgB_nkl(_, get<1>(cta_coord_mnkl), _, get<3>(cta_coord_mnkl)); - - uint32_t skip_wait = (k_tile_count <= 0); - auto pipeline_flag = pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); - - // Issue the Mainloop loads - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - // LOCK mainloop_load2xform_pipeline_state for _writing_ - pipeline.producer_acquire(load2xform_pipeline_state, pipeline_flag); - int write_stage = load2xform_pipeline_state.index(); - - using BarrierType = typename Load2TransformPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(load2xform_pipeline_state); - - // Advance mainloop_pipe - ++load2xform_pipeline_state; - - skip_wait = (k_tile_count <= 1); - pipeline_flag = pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); - - copy(observed_tma_load_a_->with(*tma_barrier, mcast_mask_a), tAgA(_,*k_tile_iter), 
tAsA(_,write_stage)); - copy(observed_tma_load_b_->with(*tma_barrier, mcast_mask_b), tBgB(_,*k_tile_iter), tBsB(_,write_stage)); - - ++k_tile_iter; - } - - return cute::make_tuple(load2xform_pipeline_state, k_tile_iter); - - } - - /// Set up the data needed by this collective for load. - /// Returned tuple must contain at least two elements, with the first two elements being: - /// gA_mkl - The tiled tensor for input A - /// gB_nkl - The tiled tensor for input B - // Other inputs needed for load(): partitioned AB tensors for gmem and smem, and mcast masks - template - CUTLASS_DEVICE auto - load_init( - ProblemShape_MNKL const& problem_shape_MNKL, - Params const& params, - TensorStorage& shared_storage) const { - auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); - - ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{})); - - Tensor tCgA_mkl = cta_mma.partition_A(gA_mkl); // (MMA, MMA_M, MMA_K, m, k, l) - Tensor tCgB_nkl = cta_mma.partition_B(gB_nkl); // (MMA, MMA_N, MMA_K, n, k, l) - - Tensor sA = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE) - Tensor sB = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE) - - // Define the CTA-in-cluster Layout and Coord - Layout cta_layout_mnk = make_layout(cluster_shape_); - Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); - auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); - - // Project the cta_layout for tma_a along the n-modes - auto [tAgA_mkl, tAsA] = tma_partition(*observed_tma_load_a_, - get<2>(cta_coord_vmnk), make_layout(size<2>(cta_layout_vmnk)), - group_modes<0,3>(sA), group_modes<0,3>(tCgA_mkl)); - - // Project the cta_layout for tma_b along the m-modes - auto [tBgB_nkl, tBsB] = tma_partition(*observed_tma_load_b_, - get<1>(cta_coord_vmnk), 
make_layout(size<1>(cta_layout_vmnk)), - group_modes<0,3>(sB), group_modes<0,3>(tCgB_nkl)); - - // TMA Multicast Masks - uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); - uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk); - - - return cute::make_tuple( - gA_mkl, gB_nkl, // for scheduler - tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values - mcast_mask_a, mcast_mask_b); // multicast masks - } - - template< - class KTileIterator, class Accumulator, - class GTensorA, class DstCopyA, class SrcTensorA, class DstTensorA, - class GTensorB - > - CUTLASS_DEVICE auto - transform( - Load2TransformPipeline load2transform_pipeline, - Load2TransformPipelineState load2transform_pipeline_consumer_state, - Transform2MmaPipeline transform2mma_pipeline, - Transform2MmaPipelineState transform2mma_pipeline_producer_state, - Accumulator accumulators, - cute::tuple input_operands, - KTileIterator k_tile_iter, int k_tile_count) { - - cutlass::arch::NamedBarrier transform_bar(NumTransformationThreads, cutlass::arch::ReservedNamedBarriers::TransformBarrier); - - // tAsA : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, SmemStages (In SMEM) - // tAdA : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, NumComputeMtxs, SmemStages (In SMEM or TMEM) - // tBsB : (Copy,#Copy),MMA_Rest,MMA_N_Rest,MMA_K_Rest, SmemStages (In SMEM) - // tBsB : (Copy,#Copy),MMA_Rest,MMA_N_Rest,MMA_K_Rest, NumComputeMtxs, SmemStages (In SMEM) - auto [unused_tAgA, dst_copy_A, tAsA, tAsACompute, - unused_tBgB] = input_operands; - - // Create the tensors in registers - auto tArA = make_tensor(tAsA(_,_,_,_,0).shape()); - auto tArACompute = make_tensor(tAsA(_,_,_,_,0).shape()); - - auto tArA_x2 = recast>(tArA); - auto tArACompute_x2 = recast>(tArACompute); - - - uint32_t skip_wait = (k_tile_count <= 0); - auto load2transform_flag = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); - auto transform2mma_flag = 
transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - - load2transform_pipeline.consumer_wait(load2transform_pipeline_consumer_state, load2transform_flag); - transform2mma_pipeline.producer_acquire(transform2mma_pipeline_producer_state, transform2mma_flag); - - int load2transform_consumer_index = load2transform_pipeline_consumer_state.index(); // read stage - int transform2mma_producer_index = transform2mma_pipeline_producer_state.index(); //write stage - - auto curr_load2transform_pipeline_consumer_state = load2transform_pipeline_consumer_state; - auto curr_transform2mma_pipeline_producer_state = transform2mma_pipeline_producer_state; - - // Copy the input A matrix from SMEM - copy(AutoVectorizingCopy{}, tAsA(_,_,_,_,load2transform_consumer_index), tArA); - //Transform Input A stored in registers - cute::transform(tArA_x2, tArACompute_x2, cutlass::NumericArrayConverter::convert); - //Transformed A stored in TMEM - copy(dst_copy_A, tArACompute, tAsACompute(_,_,_,_,transform2mma_producer_index)); - - // Loads from SMEM are done. 
Signal the mainloop load as early as possible - transform_bar.sync(); - load2transform_pipeline.consumer_release(curr_load2transform_pipeline_consumer_state); - - // fence for SMEM writes - cutlass::arch::fence_view_async_shared(); - if constexpr (is_tmem::value) { - // fence for TMEM writes if A operand is coming from TMEM - cutlass::arch::fence_view_async_tmem_store(); - } - - // Let the MMA know we are done transforming - transform2mma_pipeline.producer_commit(curr_transform2mma_pipeline_producer_state); - - // Next pipeline stage - ++load2transform_pipeline_consumer_state; - ++transform2mma_pipeline_producer_state; - - skip_wait = (k_tile_count <= 1); - // Peek the next pipeline stage's barriers - load2transform_flag = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); - transform2mma_flag = transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); - } - return cute::make_tuple(load2transform_pipeline_consumer_state, transform2mma_pipeline_producer_state); - } - - template - CUTLASS_DEVICE auto - transform_init( - Params const& params, - ProblemShape_MNKL const& problem_shape_MNKL, - Accumulator accumulators, - TensorStorage& shared_storage) { - - auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); - - Tensor sA_orig = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); - Tensor sA = as_position_independent_swizzle_tensor(sA_orig); //tCsA - Tensor sACompute = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); //tCsACompute - - // Map input, compute, and fragment tensors to - // Copy strategies and partitioned tensors. These will become the input - // operands of the transform function. 
Depending on MMA atom type, the - // operands can reside in SMEM or TMEM - auto setup_copy_ops = [&] ( - auto tensor_input, - auto input_copy_atom, - auto tensor_compute, - auto make_fragment, - auto compute_copy_atom) constexpr { - - auto fragment_compute = make_fragment(tensor_compute); //tCrA(Compute) - if constexpr (cute::is_tmem>::value) { - // For M=128 with 2CTA MMA atoms, the TMEM tensor for A has a duplicated allocation. - // Instead of allocation a 64x16 TMEM tensor, we have a 128x16 allocation - // See: TmemAllocMode::Duplicated. - Tensor tensor_input2x = [&] () constexpr { - if constexpr (decltype(size<0,0>(fragment_compute) == Int<128>{} && size<0,0>(tensor_input) == Int<64>{})::value) { - return make_tensor(tensor_input.data(), - logical_product(tensor_input.layout(), - make_tile(make_tile(Layout<_2,_0>{},_),_,_,_))); // ((128,16),m,k,PIPE) - } - else { - return tensor_input; - } - }(); //tCsA_2x - - fragment_compute.data() = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); //tCrA.data() - auto reg2tmem_tiled_copy = make_tmem_copy(compute_copy_atom, fragment_compute(_,_,0,0)); - auto thr_reg2tmem_tiled_copy = reg2tmem_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); - auto partitioned_tensor_input = thr_reg2tmem_tiled_copy.partition_S(tensor_input2x); - auto partitioned_tensor_compute = thr_reg2tmem_tiled_copy.partition_D(fragment_compute); - return cute::make_tuple(reg2tmem_tiled_copy, partitioned_tensor_input, partitioned_tensor_compute); - } - else { - auto tensor_compute_ind_sw = as_position_independent_swizzle_tensor(tensor_compute); - auto reg2smem_tiled_copy = make_cotiled_copy(compute_copy_atom, Layout, Stride< _8,_1>>{}, - tensor_compute(_,_,0,0).layout()); - - auto thr_reg2smem_tiled_copy = reg2smem_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); - auto partitioned_tensor_input = thr_reg2smem_tiled_copy.partition_S(tensor_input); - auto partitioned_tensor_compute = 
thr_reg2smem_tiled_copy.partition_D(tensor_compute_ind_sw); - - return cute::make_tuple(AutoVectorizingCopy{}, partitioned_tensor_input, partitioned_tensor_compute); - } - }; - - auto [dst_copy_A, tAsA, tAsACompute] = - setup_copy_ops(sA, InputCopyAtomA{}, sACompute, [&](auto &arg) {return TiledMma::make_fragment_A(arg);}, ComputeCopyAtomA{}); - - return cute::make_tuple(gA_mkl, dst_copy_A, tAsA, tAsACompute, - gB_nkl); - } - - /// Perform a collective-scoped matrix multiply-accumulate - /// Consumer Perspective - template < - class FrgEngine, class FrgLayout, - class TensorA, class TensorB - > - CUTLASS_DEVICE auto - mma( - Transform2MmaPipeline transform2mma_pipeline, - Transform2MmaPipelineState transform2mma_pipeline_consumer_state, - Mma2AccumPipeline mma2accum_pipeline, - Mma2AccumPipelineState mma2accum_pipeline_producer_state, - cute::Tensor const& accumulators, - cute::tuple const& input_operands, - int k_tile_count - ) { - TiledMma tiled_mma; - - auto curr_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; - auto next_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; - - uint32_t skip_wait = (k_tile_count <= 0); - auto transform2mma_flag = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); - ++next_transform2mma_pipeline_consumer_state; - - - // tCrA : (MMA), MMA_M, MMA_K, SmemStage (In SMEM or TMEM) - // We use SMEM stages to match #buffers in Load <-> Convert - // tCrB : (MMA), MMA_N, MMA_K, SmemStages (In SMEM) - auto const [tCrA, tCrB] = input_operands; - - int remaining_accum_promotions = k_tile_count; - uint32_t mma2accum_skip_wait = (remaining_accum_promotions <= 0); - auto mma2accum_flag = mma2accum_pipeline.producer_try_acquire(mma2accum_pipeline_producer_state, mma2accum_skip_wait); - mma2accum_pipeline.producer_acquire(mma2accum_pipeline_producer_state, mma2accum_flag); - auto curr_mma2accum_pipeline_producer_state = 
mma2accum_pipeline_producer_state; - ++mma2accum_pipeline_producer_state; - - // No accumulator addition to the k_tile initially - tiled_mma.accumulate_ = UMMA::ScaleOut::Zero; - - CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) { - - transform2mma_pipeline.consumer_wait(curr_transform2mma_pipeline_consumer_state, transform2mma_flag); - - int transform2mma_pipeline_consumer_state_index = curr_transform2mma_pipeline_consumer_state.index(); //read_stage - int mma2accum_pipeline_producer_state_index = curr_mma2accum_pipeline_producer_state.index(); //write_stage - - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - - auto tCtC = accumulators(_,_,_,mma2accum_pipeline_producer_state_index); - - auto tCrA0 = tCrA(_,_,_,transform2mma_pipeline_consumer_state_index); - auto tCrB0 = tCrB(_,_,_,transform2mma_pipeline_consumer_state_index); - - cute::gemm(tiled_mma, tCrA0(_,_,k_block), tCrB0(_,_,k_block), tCtC); // A[0]*B[0] - tiled_mma.accumulate_ = UMMA::ScaleOut::One; - - } - - transform2mma_pipeline.consumer_release(curr_transform2mma_pipeline_consumer_state); - - skip_wait = (k_tile_count <= 1); - transform2mma_flag = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); - - curr_transform2mma_pipeline_consumer_state = next_transform2mma_pipeline_consumer_state; - ++next_transform2mma_pipeline_consumer_state; - } - - mma2accum_pipeline.producer_commit(curr_mma2accum_pipeline_producer_state); - - return cute::make_tuple(curr_transform2mma_pipeline_consumer_state, mma2accum_pipeline_producer_state); - } - - template - CUTLASS_DEVICE auto - mma_init(cute::Tensor const& accumulators, TensorStorage& shared_storage) const { - TiledMma tiled_mma; - - auto get_tCrA = [&] () constexpr { - if constexpr (cute::is_base_of::value) { - Tensor sACompute = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); - return 
tiled_mma.make_fragment_A(sACompute); - } - else { - auto tCrA = tiled_mma.make_fragment_A(shape(SmemLayoutACompute{})); - tCrA.data() = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); - return tCrA; - } - }; - - Tensor tCrA = get_tCrA(); - Tensor sB = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); - Tensor tCrB = tiled_mma.make_fragment_B(sB); - return cute::make_tuple(tCrA, tCrB); - } - - template - CUTLASS_DEVICE auto - accum_init(cute::Tensor const& accumulators, TmemCopyAtom tmem_cp_atom, EpilogueTile epilogue_tile) { - return accumulators; - } - -private: - template - CUTLASS_DEVICE - constexpr auto - tile_input_tensors(Params const& params, ProblemShape_MNKL const& problem_shape_MNKL) const { - using X = cute::Underscore; - // Separate out problem shape for convenience - auto [M,N,K,L] = problem_shape_MNKL; - - // Represent the full tensors -- get these from TMA - Tensor mA_mkl = observed_tma_load_a_->get_tma_tensor(make_shape(M,K,L)); - Tensor mB_nkl = observed_tma_load_b_->get_tma_tensor(make_shape(N,K,L)); - - // Tile the tensors and defer the slice - Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_,_,_), Step<_1, X,_1>{}); - Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_,_,_), Step< X,_1,_1>{}); - - return cute::make_tuple(gA_mkl, gB_nkl); - } - - typename Params::TMA_A const* observed_tma_load_a_ = nullptr; - typename Params::TMA_B const* observed_tma_load_b_ = nullptr; - - ClusterShape cluster_shape_; - uint32_t block_rank_in_cluster_; -}; - -///////////////////////////////////////////////////////////////////////////////////////////////// - -} // namespace cutlass::gemm::collective - -///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp b/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp index e7f43fd5..fc7bc988 
100755 --- a/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp +++ b/include/cutlass/gemm/collective/sm120_blockscaled_mma_array_tma.hpp @@ -353,13 +353,12 @@ struct CollectiveMma< (void) workspace; // These tensor shapes (only applicable for grouped gemm) and pointers are only used to create tensormap/tma desc. // These will be replaced with correct values before the initial tma load. - auto init_shape = repeat_like(typename ProblemShape::UnderlyingProblemShape{}, int32_t(1)); - constexpr int tma_alignment_bits = 128; - auto init_M = tma_alignment_bits; - auto init_N = tma_alignment_bits; - auto init_K = tma_alignment_bits; + auto init_M = int32_t(size<0>(TileShape{})); + auto init_N = int32_t(size<1>(TileShape{})); + auto init_K = int32_t(size<2>(TileShape{})); + auto init_L = 1; + // Batches/Groups are managed by using appropriate pointers to input matrices - const uint32_t init_L = 1; TmaInternalElementA const* ptr_A_first_batch = nullptr; TmaInternalElementB const* ptr_B_first_batch = nullptr; ElementSF const* ptr_SFA_first_batch = nullptr; @@ -1058,11 +1057,11 @@ struct CollectiveMma< Tensor tensor_sfb = make_tensor(ptr_SF, mainloop_params.layout_SFB[next_group]); - cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, + cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_a, tensor_a, prob_shape_A, prob_stride_A); cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_sfa, tensor_sfa, prob_shape_SFA, prob_stride_SFA); - cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, + cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_b, tensor_b, prob_shape_B, prob_stride_B); cute::detail::fill_tma_gmem_shape_stride(mainloop_params.tma_load_sfb, tensor_sfb, prob_shape_SFB, prob_stride_SFB); @@ -1091,7 +1090,7 @@ struct CollectiveMma< prob_stride_B); cute::tma_descriptor_replace_dims_strides_in_shared_mem(shared_tensormaps.smem_tensormap_SFB, prob_shape_SFB, - 
prob_stride_SFB); + prob_stride_SFB); } // The entire warp must call this function collectively (that is, the instructions are aligned) @@ -1122,6 +1121,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp index 25a68671..dc30ae56 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_rs_warpspecialized_mixed_input.hpp @@ -1330,6 +1330,10 @@ public: tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp index 8b06d58b..da16d118 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized.hpp @@ -529,10 +529,9 @@ struct CollectiveMma< // Prologue GMMAs int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - assert(k_tile_count >= 1); tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; warpgroup_fence_operand(accum); - { + if (k_tile_count > 0) { // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); pipeline.consumer_wait(smem_pipe_read, barrier_token); @@ -739,6 +738,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. 
it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp index 676382ad..53348dff 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8.hpp @@ -747,6 +747,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 27c03af4..6cec1862 100644 --- a/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_array_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -135,6 +135,10 @@ struct CollectiveMma< static constexpr int ScalePromotionInterval = ScaleGranularityK / size<2>(typename TiledMma::AtomShape_MNK{}); static_assert(ScalePromotionInterval % 4 == 0, "ScalePromotionInterval must be a multiple of 4."); + static_assert(ScalePromotionInterval >= size<2>(TileShape{}) / tile_size<2>(TiledMma{}), + "ScalePromotionInterval must be greater than or equal to the number of stages of the MMA atom."); + 
static_assert(ScalePromotionInterval % (size<2>(TileShape{}) / tile_size<2>(TiledMma{})) == 0, + "ScalePromotionInterval must be a multiple of the number of stages of the MMA atom."); static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; @@ -811,31 +815,37 @@ struct CollectiveMma< Tensor tCrSFB = make_tensor_like(tCsSFB(_, _, _, _0{})); // (MMA,MMA_M,MMA_N) // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - assert(k_tile_count >= 1); tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) + auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + pipeline.consumer_wait(smem_pipe_read, barrier_token); + // fence_operand(); GmmaFP8Accumulation accumulation(accum, ScalePromotionInterval, size<2>(tCrA)); - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - if constexpr (ScalePromotionInterval != 4) { - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - } - else { - // Always zero out the accumulator for finest granularity - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } + warpgroup_fence_operand(accumulation()); + + { int read_stage = smem_pipe_read.index(); // Load per block scale values from shared memory to registers copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), 
tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + warpgroup_fence_operand(accumulation()); + + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); } @@ -854,16 +864,9 @@ struct CollectiveMma< } } - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M) x (V,N) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - - warpgroup_commit_batch(); + warpgroup_wait<0>(); + ++smem_pipe_read; + barrier_token = pipeline.consumer_try_wait(smem_pipe_read); // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { @@ -879,90 +882,16 @@ struct CollectiveMma< if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { scale_if_needed(accumulation, tCrSFA, tCrSFB); } - - ++smem_pipe_read; } warpgroup_fence_operand(accumulation()); - CUTLASS_PRAGMA_UNROLL - for (int k_tile_prologue = prologue_mma_count - 1; k_tile_prologue > 0; --k_tile_prologue) - { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); - pipeline.consumer_wait(smem_pipe_read, barrier_token); - - if constexpr (ScalePromotionInterval != 4) { - if (accumulation.prepare_if_needed()) { - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - } - else { - // Always zero out the accumulator for finest granularity - tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; - } - - int read_stage = smem_pipe_read.index(); - // Load per block scale values from shared memory to registers - copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); - copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); - - if 
constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_b = tCrSFB(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { - filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; - } - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - ElementBlockScale scale_a = tCrSFA(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { - filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; - } - } - - warpgroup_arrive(); - // Unroll the K mode manually to set scale D to 1 - CUTLASS_PRAGMA_UNROLL - for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { - // (V,M) x (V,N) => (V,M,N) - cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); - tiled_mma.accumulate_ = GMMA::ScaleOut::One; - } - - warpgroup_commit_batch(); - - // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_ab = tCrSFA(_0{}); - scale_if_needed(accumulation, scale_ab); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - scale_if_needed(accumulation, tCrSFA); - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - scale_if_needed(accumulation, tCrSFB); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { - scale_if_needed(accumulation, tCrSFA, tCrSFB); - } - - ++smem_pipe_read; - } - - warpgroup_fence_operand(accumulation()); // Mainloop GMMAs - k_tile_count -= prologue_mma_count; + k_tile_count--; CUTLASS_PRAGMA_NO_UNROLL - for ( ; k_tile_count > 0; --k_tile_count) + for ( ; k_tile_count > 1; --k_tile_count) { - // WAIT on smem_pipe_read until its data are available (phase bit flips from rdPhaseBit value) - auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read); 
pipeline.consumer_wait(smem_pipe_read, barrier_token); // @@ -970,29 +899,10 @@ struct CollectiveMma< // int read_stage = smem_pipe_read.index(); - // fence_operand(); + // Load per block scale values from shared memory to registers (at most twice per block along M and/or N) - copy(tCsSFA(_,_,_,make_coord(_0{},read_stage)), tCrSFA); - copy(tCsSFB(_,_,_,make_coord(_0{},read_stage)), tCrSFB); - - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { - tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); - } - if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { - ElementBlockScale scale_b = tCrSFB(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { - filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; - } - } - if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { - ElementBlockScale scale_a = tCrSFA(_0{}); - CUTLASS_PRAGMA_UNROLL - for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { - filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; - } - } - + copy(tCsSFA(_,_,_,make_coord(_0{}, read_stage)), tCrSFA); + copy(tCsSFB(_,_,_,make_coord(_0{}, read_stage)), tCrSFB); if constexpr (ScalePromotionInterval != 4) { if (accumulation.prepare_if_needed()) { @@ -1005,7 +915,6 @@ struct CollectiveMma< } warpgroup_fence_operand(accumulation()); - warpgroup_arrive(); // Unroll the K mode manually to set scale D to 1 CUTLASS_PRAGMA_UNROLL @@ -1017,9 +926,31 @@ struct CollectiveMma< warpgroup_commit_batch(); /// Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed - warpgroup_wait(); warpgroup_fence_operand(accumulation()); + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrSFB(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { + filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * 
scale_b; + } + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrSFA(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { + filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; + } + } + + warpgroup_wait<0>(); + pipeline.consumer_release(smem_pipe_release); // Unlock previous tile + ++smem_pipe_read; + barrier_token = pipeline.consumer_try_wait(smem_pipe_read); + // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { ElementBlockScale scale_ab = tCrSFA(_0{}); @@ -1035,13 +966,81 @@ struct CollectiveMma< scale_if_needed(accumulation, tCrSFA, tCrSFB); } - // UNLOCK smem_pipe_release, done _computing_ on it - pipeline.consumer_release(smem_pipe_release); - // Advance smem_pipe_read and smem_pipe_release - ++smem_pipe_read; ++smem_pipe_release; } + + if (k_tile_count) { + pipeline.consumer_wait(smem_pipe_read, barrier_token); + + // + // Compute on k_tile + // + + int read_stage = smem_pipe_read.index(); + // Load per block scale values from shared memory to registers (at most twice per block along M and/or N) + copy(tCsSFA(_,_,_,make_coord(_0{}, read_stage)), tCrSFA); + copy(tCsSFB(_,_,_,make_coord(_0{}, read_stage)), tCrSFB); + + if constexpr (ScalePromotionInterval != 4) { + if (accumulation.prepare_if_needed()) { + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + } + else { + // Always zero out the accumulator for finest granularity + tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + } + + warpgroup_fence_operand(accumulation()); + warpgroup_arrive(); + // Unroll the K mode manually to set scale D to 1 + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) { + // (V,M) x (V,N) => (V,M,N) + cute::gemm(tiled_mma, tCrA(_,_,k_block,read_stage), tCrB(_,_,k_block,read_stage), accumulation()); + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + } + warpgroup_commit_batch(); + + /// 
Wait on the GMMA barrier for K_PIPE_MMAS (or fewer) outstanding to ensure smem_pipe_write is consumed + warpgroup_fence_operand(accumulation()); + + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + tCrSFA(_0{}) = tCrSFA(_0{}) * tCrSFB(_0{}); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_b = tCrSFB(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFA)); i++) { + filter_zeros(tCrSFA)(i) = filter_zeros(tCrSFA)(i) * scale_b; + } + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + ElementBlockScale scale_a = tCrSFA(_0{}); + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < size(filter_zeros(tCrSFB)); i++) { + filter_zeros(tCrSFB)(i) = filter_zeros(tCrSFB)(i) * scale_a; + } + } + warpgroup_wait<0>(); + pipeline.consumer_release(smem_pipe_release); // Unlock previous tile + + // Block scale the accumulators with reg tensor `tCrSFA` and `tCrSFB` + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { + ElementBlockScale scale_ab = tCrSFA(_0{}); + scale_if_needed(accumulation, scale_ab); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile == 1) { + scale_if_needed(accumulation, tCrSFA); + } + if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile > 1) { + scale_if_needed(accumulation, tCrSFB); + } + if constexpr (ScaleMsPerTile > 1 && ScaleNsPerTile > 1) { + scale_if_needed(accumulation, tCrSFA, tCrSFB); + } + } if constexpr (ScalePromotionInterval != 4) { // residues only exists when granularity is not the finnest if constexpr (ScaleMsPerTile == 1 && ScaleNsPerTile == 1) { @@ -1066,19 +1065,9 @@ struct CollectiveMma< /// Perform a Consumer Epilogue to release all buffers CUTLASS_DEVICE void mma_tail(MainloopPipeline pipeline, PipelineState smem_pipe_release, int k_tile_count) { - // Prologue GMMAs - int prologue_mma_count = min(K_PIPE_MMAS, k_tile_count); - k_tile_count -= prologue_mma_count; - - smem_pipe_release.advance(k_tile_count); - - // Wait on all GMMAs to complete - 
warpgroup_wait<0>(); - - for (int count = 0; count < prologue_mma_count; ++count) { - pipeline.consumer_release(smem_pipe_release); // UNLOCK smem_pipe_release, done _computing_ on it - ++smem_pipe_release; - } + // The pipeline is not released in the first iteration + smem_pipe_release.advance(k_tile_count - 1); + pipeline.consumer_release(smem_pipe_release); } // @@ -1198,6 +1187,10 @@ struct CollectiveMma< tensormaps_cp_fence_release ( TensorMapStorage& shared_tensormaps, cute::tuple const& input_tensormaps) { + if (cute::elect_one_sync()) { + cute::tma_desc_commit_group(); + cute::tma_desc_wait_group(); + } // Entire warp must do this (i.e. it's aligned) tma_descriptor_cp_fence_release(get<0>(input_tensormaps), shared_tensormaps.smem_tensormap_A); tma_descriptor_cp_fence_release(get<1>(input_tensormaps), shared_tensormaps.smem_tensormap_B); diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index 5b8f1059..19009d5d 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -128,6 +128,10 @@ struct CollectiveMma< static constexpr int ScalePromotionInterval = ScaleGranularityK / size<2>(typename TiledMma::AtomShape_MNK{}); static_assert(ScalePromotionInterval % 4 == 0, "ScalePromotionInterval must be a multiple of 4."); + static_assert(ScalePromotionInterval >= size<2>(TileShape{}) / tile_size<2>(TiledMma{}), + "ScalePromotionInterval must be greater than or equal to the number of stages of the MMA atom."); + static_assert(ScalePromotionInterval % (size<2>(TileShape{}) / tile_size<2>(TiledMma{})) == 0, + "ScalePromotionInterval must be a multiple of the number of stages of the MMA atom."); static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / 
ScaleGranularityM; static constexpr int ScaleNsPerTile = size<1>(TileShape{}) / ScaleGranularityN; @@ -213,7 +217,6 @@ struct CollectiveMma< StrideA dA; ElementB const* ptr_B; StrideB dB; - uint32_t mma_promotion_interval = 4; ElementBlockScale const* ptr_SFA; LayoutSFA layout_SFA; ElementBlockScale const* ptr_SFB; @@ -382,16 +385,6 @@ struct CollectiveMma< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem size doesn't meet the minimum alignment requirements for using TMA to load scale B.\n"); } - /* MMA promotion interval should be a multiple of 4, since each mainloop iteration would issue 4 MMA instructions. */ - constexpr int pipe_k = size<2>(TileShape{}) / tile_size<2>(TiledMma{}); - if (args.mma_promotion_interval % 4 != 0 || - args.mma_promotion_interval != ScalePromotionInterval || - args.mma_promotion_interval % pipe_k != 0 || - pipe_k > args.mma_promotion_interval) { - implementable = false; - CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Argument mma_promotion_interval is invalid.\n"); - } - // We expect full tiles in K if (K % size<2>(TileShape{}) != 0) { implementable = false; @@ -1001,7 +994,7 @@ struct CollectiveMma< // Advance smem_pipe_read and smem_pipe_release ++smem_pipe_release; } - if (k_tile_count == 1) { + if (k_tile_count) { pipeline.consumer_wait(smem_pipe_read, barrier_token); // diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp index 7f8e6f30..712fc1ba 100644 --- a/include/cutlass/gemm/dispatch_policy.hpp +++ b/include/cutlass/gemm/dispatch_policy.hpp @@ -1016,6 +1016,8 @@ struct MainloopSm100ArrayTmaUmmaWarpSpecializedFastF32 { +// Mainloop schedule for array-based TMA + template< int Stages_, int SchedulerPipelineStageCount_, diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp index 055b56e3..78401097 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp +++ 
b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized.hpp @@ -824,9 +824,6 @@ public: } else if (is_participant.sched) { - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -837,6 +834,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -872,6 +871,9 @@ public: clc_pipeline.producer_tail(clc_pipe_producer_state); } else { + + cutlass::arch::wait_on_dependent_grids(); + do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp index 57c00407..76432e1e 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_input_transform.hpp @@ -832,10 +832,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -845,6 +841,8 @@ public: // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + + cutlass::arch::wait_on_dependent_grids(); do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. 
@@ -883,6 +881,7 @@ public: clc_pipeline.producer_tail(clc_pipeline_producer_state); } else { + cutlass::arch::wait_on_dependent_grids(); do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipeline_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp index c036c0af..83eebaf5 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_array_tma_warpspecialized_mma_transform.hpp @@ -941,6 +941,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. @@ -976,6 +978,7 @@ public: clc_pipeline.producer_tail(clc_pipe_producer_state); } else { + cutlass::arch::wait_on_dependent_grids(); do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(clc_pipeline, clc_pipe_producer_state); work_tile_info = next_work_tile_info; diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp index 222a7ad1..3989ffe3 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized.hpp @@ -177,7 +177,6 @@ public: // Kernel level shared memory storage struct SharedStorage { - // Barriers should be allocated in lower 8KB of SMEM for SM100 struct PipelineStorage : cute::aligned_struct<16, _1> { using MainloopPipelineStorage = typename CollectiveMainloop::PipelineStorage; using EpiLoadPipelineStorage = typename CollectiveEpilogue::PipelineStorage; @@ -649,15 +648,14 @@ public: } else if (is_participant.sched) { - if 
constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } if constexpr (IsSchedDynamicPersistent) { // Whether a new CLC query must be performed. // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Query next clcID and update producer state diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp index ae712512..fcaae852 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_input_transform.hpp @@ -717,10 +717,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -729,6 +725,9 @@ public: // See comment below where this variable is updated for a description of // why this variable is needed. bool requires_clc_query = true; + + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. 
diff --git a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp index 1826cce9..180bda31 100644 --- a/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp +++ b/include/cutlass/gemm/kernel/sm100_gemm_tma_warpspecialized_mma_transform.hpp @@ -769,10 +769,6 @@ public: // Register reconfiguration arch::warpgroup_reg_dealloc(); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - // Signal the epilogue warps to proceed once the prologue is complete epilogue_throttle_barrier.arrive(); @@ -783,6 +779,8 @@ public: // why this variable is needed. bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. diff --git a/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp index a3494d33..0932f5c6 100644 --- a/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp +++ b/include/cutlass/gemm/kernel/sm100_sparse_gemm_tma_warpspecialized.hpp @@ -687,15 +687,14 @@ public: } else if (is_participant.sched) { - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } if constexpr (IsSchedDynamicPersistent) { // Whether a new CLC query must be performed. // See comment below where this variable is updated for a description of // why this variable is needed. 
bool requires_clc_query = true; + cutlass::arch::wait_on_dependent_grids(); + do { if (requires_clc_query) { // Query next clcID and update producer state diff --git a/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp b/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp index ac57aa6d..610dfc6e 100644 --- a/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp +++ b/include/cutlass/gemm/kernel/sm120_gemm_tma_warpspecialized_cooperative_asymmetric_dma.hpp @@ -551,13 +551,12 @@ public: if (producer_warp_role == ProducerWarpRole::Warp1) { work_tile_info = scheduler.initial_work_tile_info(ClusterShape{}); - if constexpr (IsSchedDynamicPersistent) { - cutlass::arch::wait_on_dependent_grids(); - } - if constexpr (IsSchedDynamicPersistent) { bool requires_clc_query = true; TileSchedulerPipelineState scheduler_pipe_producer_state = cutlass::make_producer_start_state(); + + cutlass::arch::wait_on_dependent_grids(); + while (work_tile_info.is_valid()) { if (requires_clc_query) { // Throttle CLC query to mitigate workload imbalance caused by skews among persistent workers. diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp index 93e9b797..4f5723da 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_cooperative.hpp @@ -153,12 +153,12 @@ public: cute::conditional_t, void, ProblemShape> // Use void for default scheduler. 
>::Scheduler; - static constexpr auto TileSchedulerStages = 8; - using TileSchedulerArguments = typename TileScheduler::Arguments; using TileSchedulerParams = typename TileScheduler::Params; using TileSchedulerResponse = typename TileSchedulerResponseGetter::Type; + static constexpr auto TileSchedulerStages = 8; + static constexpr uint32_t NumLoadWarpGroups = 1; static constexpr uint32_t NumMmaThreads = size(TiledMma{}); static constexpr uint32_t NumMmaWarpGroups = NumMmaThreads / NumThreadsPerWarpGroup; @@ -378,7 +378,6 @@ public: if (status != Status::kSuccess) { return status; } - return status; } @@ -461,6 +460,7 @@ public: return TileScheduler{params.scheduler}; } } (); + // In a warp specialized kernel, collectives expose data movement and compute operations separately CollectiveMainloop collective_mainloop; CollectiveEpilogue collective_epilogue(params.epilogue, shared_storage.tensors.epilogue); @@ -484,7 +484,7 @@ public: typename TileSchedulerPipeline::Params tile_scheduler_pipeline_params; if constexpr (cute::is_same_v) { if (warp_group_role == WarpGroupRole::Producer - && producer_warp_role == ProducerWarpRole::Scheduler) { + && producer_warp_role == ProducerWarpRole::Scheduler) { tile_scheduler_pipeline_params.role = TileSchedulerPipeline::ThreadCategory::Producer; } else { @@ -499,7 +499,6 @@ public: tile_scheduler_pipeline_params.producer_arv_count = 1; } TileSchedulerPipeline tile_scheduler_pipeline(shared_storage.pipelines.scheduler, tile_scheduler_pipeline_params); - // Mainloop Load pipeline using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline; typename MainloopPipeline::Params mainloop_pipeline_params; @@ -683,9 +682,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); - // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates - mainloop_pipe_producer_state.advance(work_k_tile_count - 1); + // Pipeline state is only advanced if 
there are K tiles to compute + mainloop_pipe_producer_state.advance(work_k_tile_count); // Signal for the epilogue load warp to begin if (do_load_order_arrive) { @@ -706,11 +704,6 @@ public: if constexpr (IsGroupedGemmKernel) { problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1); } - // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates - // Since this state is waiting for loads to finish, it must start in the inverted phase. - typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state = - {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()}; - mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state); collective_mainloop.tensormaps_perform_update( shared_storage.tensormaps.mainloop, params.mainloop, @@ -723,8 +716,6 @@ public: // Entire warp must do this (i.e. it's aligned) collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps); } - // Advance the producer state for the last remaining stage that was being waited for above - mainloop_pipe_producer_state.advance(1); } while (work_tile_info.is_valid()); // Scheduler work fetch loop // Make sure all Consumer Warp Groups have been waited upon @@ -771,8 +762,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); + // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates mainloop_pipe_producer_state.advance(work_k_tile_count); // Get next work tile @@ -790,8 +781,8 @@ public: } } } while (work_tile_info.is_valid()); // Scheduler work fetch loop - } - } + } // End of auxiliary load needed check + } // Mainloop Auxiliary Load Producer Warp End // Epilogue Producer Warp else if (producer_warp_role == ProducerWarpRole::Epilogue && collective_epilogue.is_producer_load_needed()) { int32_t const sm_idx = blockIdx.x 
+ (blockIdx.y * gridDim.x); @@ -854,6 +845,7 @@ public: wait ); } + work_tile_info = next_work_tile_info; if (increment_pipe) { ++tile_scheduler_pipe_consumer_state; @@ -917,8 +909,8 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); - collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, - epi_store_tensormap, + collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, + epi_store_tensormap, consumer_warp_group_idx); } @@ -1021,7 +1013,7 @@ public: // Converge before issuing tensormap fence release since fence is aligned __syncwarp(); - collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, + collective_epilogue.template tensormaps_cp_fence_release(shared_storage.tensormaps.epilogue, epi_store_tensormap, consumer_warp_group_idx); } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp index c720c215..f33f4685 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_array_tma_warpspecialized_pingpong.hpp @@ -69,7 +69,6 @@ class GemmUniversal< cute::enable_if_t> > { - // Get the type of the scheduler response. template struct TileSchedulerResponseGetter { @@ -145,7 +144,6 @@ public: TileScheduler_ >; - using TileScheduler = typename detail::TileSchedulerSelector< SchedulerTag, ArchTag, @@ -646,6 +644,8 @@ public: cutlass::arch::warpgroup_reg_dealloc(); if (producer_warp_role == ProducerWarpRole::Scheduler) { + // GroupScheduler requires a producer warp to iterate over the group infos and push + // the work tile infos to the downstream pipelines. 
if constexpr (cute::is_same_v) { do { auto [next_work_tile_info, increment_pipe] = scheduler.advance_to_next_work(tile_scheduler_pipeline, tile_scheduler_pipe_producer_state); @@ -684,7 +684,8 @@ public: bool did_batch_change = true; do { if (!TileScheduler::valid_warpgroup_in_work_tile(work_tile_info)) { - auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work(work_tile_info, tile_scheduler_pipeline, tile_scheduler_pipe_consumer_state); + auto [next_work_tile_info, increment_pipe] = scheduler.fetch_next_work( + work_tile_info, tile_scheduler_pipeline, tile_scheduler_pipe_consumer_state); work_tile_info = next_work_tile_info; if (increment_pipe) { ++tile_scheduler_pipe_consumer_state; @@ -719,9 +720,8 @@ public: block_rank_in_cluster, shared_storage.tensors.mainloop ); - // Update starting pipeline state for the next tile - // Wait for the last TMA stage to complete loading, before issuing tensormap updates - mainloop_pipe_producer_state.advance(work_k_tile_count - 1); + // Pipeline state is only advanced if there are K tiles to compute + mainloop_pipe_producer_state.advance(work_k_tile_count); // Signal for the epilogue load warp to begin if (do_load_order_arrive) { @@ -742,11 +742,6 @@ public: if constexpr (IsGroupedGemmKernel) { problem_shape_MNKL = append<4>(params.problem_shape.get_problem_shape(curr_batch), 1); } - // Purpose of this pipeline state is to make sure TMA loads have finished before doing descriptor updates - // Since this state is waiting for loads to finish, it must start in the inverted phase. - typename CollectiveMainloop::PipelineState mainloop_pipe_tma_consumer_state = - {mainloop_pipe_producer_state.index(), !mainloop_pipe_producer_state.phase(), mainloop_pipe_producer_state.count()}; - mainloop_pipeline.consumer_wait(mainloop_pipe_tma_consumer_state); collective_mainloop.tensormaps_perform_update( shared_storage.tensormaps.mainloop, params.mainloop, @@ -759,8 +754,6 @@ public: // Entire warp must do this (i.e. 
it's aligned) collective_mainloop.tensormaps_cp_fence_release(shared_storage.tensormaps.mainloop, input_tensormaps); } - // Advance the producer state for the last remaining stage that was being waited for above - mainloop_pipe_producer_state.advance(1); } while (work_tile_info.is_valid()); // Scheduler work fetch loop // Make sure all Consumer Warp Groups have been waited upon diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index 1264b230..dc5610fc 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -546,6 +546,8 @@ public: if constexpr (IsSchedDynamicPersistent) { bool requires_clc_query = true; TileSchedulerPipelineState scheduler_pipe_producer_state = cutlass::make_producer_start_state(); + + cutlass::arch::wait_on_dependent_grids(); while (work_tile_info.is_valid()) { if (requires_clc_query) { diff --git a/include/cutlass/pipeline/sm100_pipeline.hpp b/include/cutlass/pipeline/sm100_pipeline.hpp index 3dba8dda..53bc9199 100644 --- a/include/cutlass/pipeline/sm100_pipeline.hpp +++ b/include/cutlass/pipeline/sm100_pipeline.hpp @@ -334,9 +334,9 @@ public: static constexpr bool IsDynamicCluster = not cute::is_static_v; static_assert(IsDynamicCluster or ((cute::size<0>(cluster_shape) % cute::size<0>(atom_thr_shape) == 0) && (cute::size<1>(cluster_shape) % cute::size<1>(atom_thr_shape) == 0))); - uint32_t const multicast_consumer_arrival_count = (cute::size<0>(cluster_shape) / cute::size<0>(atom_thr_shape)) + - (cute::size<1>(cluster_shape) / cute::size<1>(atom_thr_shape)) - 1; - + uint32_t const num_consumer_per_cluster = params.num_consumers / NumThreadsPerWarpGroup; + uint32_t const multicast_consumer_arrival_count = ((cute::size<0>(cluster_shape) / cute::size<0>(atom_thr_shape)) + + (cute::size<1>(cluster_shape) / 
cute::size<1>(atom_thr_shape)) - 1) * num_consumer_per_cluster; cutlass::arch::detail::initialize_barrier_array_pair_aligned( storage.full_barrier_, storage.empty_barrier_, producer_arv_cnt, multicast_consumer_arrival_count); } diff --git a/include/cutlass/version.h b/include/cutlass/version.h index a2880049..41d78322 100644 --- a/include/cutlass/version.h +++ b/include/cutlass/version.h @@ -34,9 +34,9 @@ #include #include -#define CUTLASS_MAJOR 3 -#define CUTLASS_MINOR 9 -#define CUTLASS_PATCH 2 +#define CUTLASS_MAJOR 4 +#define CUTLASS_MINOR 0 +#define CUTLASS_PATCH 0 #ifdef CUTLASS_VERSIONS_GENERATED #include "cutlass/version_extended.h" diff --git a/media/docs/cpp/blackwell.rst b/media/docs/cpp/blackwell.rst new file mode 100644 index 00000000..ccb45239 --- /dev/null +++ b/media/docs/cpp/blackwell.rst @@ -0,0 +1,10 @@ +.. _blackwell: + +Blackwell Specific +================== + +.. toctree:: + :maxdepth: 2 + + Blackwell SM100/SM120 GEMMs + Blackwell Cluster Launch Control diff --git a/media/docs/cpp/blackwell_cluster_launch_control.md b/media/docs/cpp/blackwell_cluster_launch_control.md index a4006f20..1504c144 100644 --- a/media/docs/cpp/blackwell_cluster_launch_control.md +++ b/media/docs/cpp/blackwell_cluster_launch_control.md @@ -6,7 +6,7 @@ A GEMM workload usually consists of three phases: prologue, mainloop and epilogu Consider a GEMM that has `20x20x1` output tiles, running on a GPU with `100` SMs. There is another kernel occupying all the resources of `20` SMs so only `80` SMs can be used. Assume cluster shape is `1x1x1`. The following diagram shows how the schedule would look like for such a kernel. -

GEMM tiles are evenly divided among available SMs

+

GEMM tiles are evenly divided among available SMs

### Static Scheduler @@ -14,7 +14,7 @@ CUTLASS has adopted a software technique named **persistent kernels**. Persisten However, static scheduler is susceptible to workload imbalance if the resources of some SMs are unavailable. The following diagram illustrates this issue. -

GEMM tiles are unevenly divided among available SMs, leading to workload imbalance

+

GEMM tiles are unevenly divided among available SMs, leading to workload imbalance

### Dynamic Scheduler with Cluster Launch Control A fundamental limitation of persistent scheduling is that the number of SMs this kernel can utilize is unknown in real time. Some SMs might be occupied by another kernel and thus their resources are unavailable. This makes it challenging to load-balance work across SMs. @@ -32,7 +32,7 @@ Cluster launch control follows the below rules: The following diagram shows how the schedule would look like with cluster launch control. -

GEMM tiles are dynamically allocated among available SMs, leading to a balanced workload

+

GEMM tiles are dynamically allocated among available SMs, leading to a balanced workload

## Programming Model ### Pseudo Code @@ -120,7 +120,7 @@ The CLC pipeline has a depth of 3 to overlap the CLC operations of multiple wave -# Copyright +### Copyright Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/blackwell_functionality.md b/media/docs/cpp/blackwell_functionality.md index 582899d3..df3d7f13 100644 --- a/media/docs/cpp/blackwell_functionality.md +++ b/media/docs/cpp/blackwell_functionality.md @@ -723,7 +723,7 @@ Specialized policies must be used to generate mixed-input-datatype `mx_float4_t` |----------------|----|----|----|----|------------------------------------| 128x128x128 | Y | N | N | N | `KernelTmaWarpSpecializedPingpong` or `KernelTmaWarpSpecializedCooperative` | -# Copyright +### Copyright Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/building_in_windows_with_visual_studio.md b/media/docs/cpp/build/building_in_windows_with_visual_studio.md index ebadf321..08935db1 100644 --- a/media/docs/cpp/build/building_in_windows_with_visual_studio.md +++ b/media/docs/cpp/build/building_in_windows_with_visual_studio.md @@ -5,7 +5,7 @@ Users and developers may build either in Visual Studio's graphical integrated development environment, or on the command line with `cmake --build`. -# Software prerequisites +## Software prerequisites 1. Windows 10 or 11 @@ -22,7 +22,7 @@ or on the command line with `cmake --build`. Visual Studio must be installed *before* the CUDA Toolkit. Otherwise, Visual Studio's build system won't know about CUDA. -# Operating system settings +## Operating system settings By default, Windows restricts the maximum file path length (`MAX_PATH`) to 260 characters. CUTLASS has many files and directory paths that challenge this requirement. @@ -48,7 +48,7 @@ before attempting to clone or build CUTLASS. 
[This Microsoft help article](https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation?tabs=registry) explains different ways to change the registry setting. -# Set up build environment +## Set up build environment 1. Run "git bash" to get a familiar command-line interface @@ -62,7 +62,7 @@ explains different ways to change the registry setting. Alternate approaches may rely on the CMake GUI and/or Windows' native command line. -# Building +## Building A successful CMake run will create a `CUTLASS.sln` Visual Studio "solution" file in the build directory. One can open this in Visual Studio and build the entire solution or any subset of projects as desired. @@ -77,7 +77,7 @@ Unlike with CMake's Makefile or Ninja generators, `CMAKE_BUILD_TYPE` has no effect on the Visual Studio generator, because the Visual Studio generator creates all build configurations. -# Tips +## Tips With Windows builds, one may find that CMake reruns unnecessarily. For example, cancelling a build and starting it again may rerun CMake. @@ -86,7 +86,7 @@ One work-around is to set the CMake option `CMAKE_SUPPRESS_REGENERATION=ON`. However, this turns off CMake's ability to detect on its own when it needs to rerun. As a result, one will need to know when to rerun CMake by hand. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/building_with_clang_as_host_compiler.md b/media/docs/cpp/build/building_with_clang_as_host_compiler.md index 47b3971d..332d2006 100644 --- a/media/docs/cpp/build/building_with_clang_as_host_compiler.md +++ b/media/docs/cpp/build/building_with_clang_as_host_compiler.md @@ -5,7 +5,7 @@ Clang as host compiler, and NVCC as device compiler. This is NOT the same as building with Clang as both host and device compiler ("CUDA Clang"). -# Software prerequisites +## Software prerequisites 1. 
Clang (regularly tested with Clang 17; occasionally tested with Clang 10 and greater) @@ -29,9 +29,9 @@ A symptom of not installing all needed dependencies is the following error when attempting to use clang: `"/usr/bin/ld: cannot find -lstdc++: No such file or directory"`. -# Running CMake +## Running CMake -## Required CMake options +### Required CMake options The Clang build requires specifying the following CMake options. Replace `` with the path to your `clang++` executable. @@ -55,7 +55,7 @@ then one can set `CMAKE_CUDA_COMPILER` as follows. * `CMAKE_CUDA_COMPILER=${PATH_TO_CUDA_TOOLKIT}/bin/nvcc` -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/build/index.rst b/media/docs/cpp/build/index.rst new file mode 100644 index 00000000..ddd34d16 --- /dev/null +++ b/media/docs/cpp/build/index.rst @@ -0,0 +1,10 @@ +.. _cpp_build: + +Build +===== + +.. toctree:: + :maxdepth: 1 + + Building on Windows with Visual Studio + Building with Clang as host compiler diff --git a/media/docs/cpp/code_organization.md b/media/docs/cpp/code_organization.md index 84d9ab0f..fd0292fe 100644 --- a/media/docs/cpp/code_organization.md +++ b/media/docs/cpp/code_organization.md @@ -1,6 +1,6 @@ ![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Code Organization") -# CUTLASS Code Organization +# Code Organization This document describes the layout of the CUTLASS repository. 
The main components are: diff --git a/media/docs/cpp/cute/02_layout_algebra.md b/media/docs/cpp/cute/02_layout_algebra.md index c2accec9..e48ede48 100644 --- a/media/docs/cpp/cute/02_layout_algebra.md +++ b/media/docs/cpp/cute/02_layout_algebra.md @@ -249,9 +249,7 @@ auto same_r = make_layout(composition(layout<0>(a), get<0>(tiler)), We often use the `` notation to distinguish `Tiler`s from the concatenation-of-sublayouts notation `(LayoutA, LayoutB, ...)` that we used previously. The `result` in the above code can be depicted as the 3x8 sublayout of the original layout highlighted in the figure below. -

- composition1.png -

+![composition1.png](../../../images/cute/composition1.png) For convenience, CuTe also interprets `Shape`s as a tiler as well. A `Shape` is interpreted as tuple-of-layouts-with-stride-1: ```cpp @@ -268,9 +266,7 @@ auto tiler = make_shape(Int<3>{}, Int<8>{}); auto result = composition(a, tiler); ``` where `result` can be depicted as the 3x8 sublayout of the original layout highlighted in the figure below. -

- composition2.png -

+![composition2.png](../../../images/cute/composition2.png) ## Composition Tilers @@ -323,9 +319,7 @@ The `cotarget` parameter above is most commonly an integer -- you can see we onl * `complement((2,2):(1,6), 24)` is `(3,2):(2,12)`. Note that `((2,2),(3,2)):((1,6),(2,12))` has cosize `24` and produces unique indices. -

- complement1.png -

+![complement1.png](../../../images/cute/complement1.png) As a visualization, the above figure depicts the codomain of the last example. The image of the original layout `(2,2):(1,6)` is colored in gray. The complement effectively "repeats" the original layout (displayed in the other colors) such that the codomain size of the result is `24`. The complement `(3,2):(2,12)` can be viewed as the "layout of the repetition." ## Division (Tiling) @@ -371,9 +365,7 @@ This is computed in the three steps described in the implementation above. * Concantenation of `(B,B*) = (4,(2,3)):(2,(1,8))`. * Composition of `A = (4,2,3):(2,1,8)` with `(B,B*)` is then `((2,2),(2,3)):((4,1),(2,8))`. -

- divide1.png -

+![divide1.png](../../../images/cute/divide1.png) The above figure depicts `A` as a 1-D layout with the elements pointed to by `B` highlighted in gray. The layout `B` describes our "tile" of data, and there are six of those tiles in `A` shown by each of the colors. After the divide, the first mode of the result is the tile of data and the second mode of the result iterates over each tile. @@ -383,9 +375,7 @@ Using the `Tiler` concept defined above, this immediately generalizes to multidi Similar to the 2-D composition example above, consider a 2-D layout `A = (9,(4,8)):(59,(13,1))` and want to apply `3:3` down the columns (mode-0) and `(2,4):(1,8)` across the rows (mode-1). This means the tiler can be written as `B = <3:3, (2,4):(1,8)>`. -

- divide2.png -

+![divide2.png](../../../images/cute/divide2.png) The above figure depicts `A` as a 2-D layout with the elements pointed to by `B` highlighted in gray. The layout `B` describes our "tile" of data, and there are twelve of those tiles in `A` shown by each of the colors. After the divide, the first mode of each mode of the result is the tile of data and the second mode of each mode iterates over each tile. In that sense, this operation can be viewed as a kind of `gather` operation or as simply a permutation on the rows and cols. @@ -429,9 +419,7 @@ We note that `logical_divide` preserves the *semantics* of the modes while permu This is not the case with `zipped_divide`. The mode-0 in the `zipped_divide` result is the `Tile` itself (of whatever rank the `Tiler` was) and mode-1 is the layout of those tiles. It doesn't always make sense to plot these as 2-D layouts, because the `M`-mode is now more aptly the "tile-mode" and the `N`-mode is more aptly the "rest-mode". Regardless, we still can plot the resulting layout as 2-D as shown below. -

- divide3.png -

+![divide3.png](../../../images/cute/divide3.png) We've kept each tile as its color in the previous images for clarity. Clearly, iterating across tiles is now equivalent to iterating across a row of this layout and iterating over elements within a tile is equivalent to iterating down a column of this layout. As we'll see in the `Tensor` section, this can be used to great effect in partitioning within or across tiles of data. @@ -476,9 +464,7 @@ This is computed in the three steps described in the implementation above. * Composition of `A* = (2,3):(2,8)` with `B = 6:1` is then `(2,3):(2,8)`. * Concatenation of `(A,A* o B) = ((2,2),(2,3)):((4,1),(2,8))`. -

- product1.png -

+![product1.png](../../../images/cute/product1.png) The above figure depicts `A` and `B` as a 1-D layouts. The layout `B` describes the number and order of repetitions of `A` and they are colored for clarity. After the product, the first mode of the result is the tile of data and the second mode of the result iterates over each tile. @@ -486,9 +472,7 @@ Note that the result is identical to the result of the 1-D Logical Divide exampl Of course, we can change the number and order of the tiles in the product by changing `B`. -

- product2.png -

+![product2.png](../../../images/cute/product2.png) For example, in the above image with `B = (4,2):(2,1)`, there are 8 repeated tiles instead of 6 and the tiles are in a different order. @@ -496,9 +480,7 @@ For example, in the above image with `B = (4,2):(2,1)`, there are 8 repeated til We can use the by-mode `tiler` strategies previously developed to write multidimensional products as well. -

- product2d.png -

+![product2d.png](../../../images/cute/product2d.png) The above image demonstates the use of a `tiler` to apply `logical_product` by-mode. Despite this **not being the recommended approach**, the result is a rank-2 layout consisting of 2x5 row-major block that is tiled across a 3x4 column-major arrangement. @@ -519,17 +501,13 @@ Because `A` is always compatible with mode-0 of the result and `B` is always com This is exactly what `blocked_product` and `raked_product` do and it is why they are called rank-sensitive. Unlike other CuTe functions that take `Layout` arguments, these care about the top-level rank of the arguments so that each mode can be reassociated after the `logical_product`. -

- productblocked2d.png -

+![productblocked2d.png](../../../images/cute/productblocked2d.png) The above image shows the same result as the `tiler` approach, but with much more intuitive arguments. A 2x5 row-major layout is arranged as a tile in a 3x4 column-major arrangement. Also note that `blocked_product` went ahead and `coalesced` mode-0 for us. Similarly, `raked_product` combines the modes slightly differently. Instead of the resulting "column" mode being constructed from the `A` "column" mode then the `B` "column" mode, the resulting "column" mode is constructed from the `B` "column" mode then the `A` "column" mode. -

- productraked2d.png -

+![productraked2d.png](../../../images/cute/productraked2d.png) This results in the "tile" `A` now being interleaved or "raked" with the "layout-of-tiles" `B` instead of appearing as blocks. Other references call this a "cyclic distribution." diff --git a/media/docs/cpp/cute/03_tensor.md b/media/docs/cpp/cute/03_tensor.md index 45abb88e..aead2907 100644 --- a/media/docs/cpp/cute/03_tensor.md +++ b/media/docs/cpp/cute/03_tensor.md @@ -269,9 +269,7 @@ Tensor E = A(make_coord(_,1),make_coord(0,_,1)); Tensor F = A(make_coord(2,_),make_coord(_,3,_)); ``` -

- slice.png -

+![slice.png](../../../images/cute/slice.png) In the image above, a `Tensor` is sliced in various ways and the subtensors generated by those slices are highlighted within the original tensor. Note that tensor `C` and `D` contain the same elements, but have different ranks and shapes due to the use of `_` versus the use of `make_coord(_,_)`. In each case, the rank of the result is equal to the number of `Underscore`s in the slicing coordinate. @@ -327,9 +325,7 @@ Tensor tv = composition(A, tv_layout); // (8,4) Tensor v = tv(threadIdx.x, _); // (4) ``` -

- tv_layout.png -

+![tv_layout.png](../../../images/cute/tv_layout.png) The above image is a visual representation of the above code. An arbitrary 4x8 layout of data is composed with a specific 8x4 TV-layout that represents a partitioning pattern. The result of the composition is on the right where each thread's values are arranged across each row. The bottom layout depicts the inverse TV layout which shows the mapping of 4x8 logical coordinates to the thread id and value id they will be mapped to. diff --git a/media/docs/cpp/cute/0t_mma_atom.md b/media/docs/cpp/cute/0t_mma_atom.md index aa6da8c2..ab57c92e 100644 --- a/media/docs/cpp/cute/0t_mma_atom.md +++ b/media/docs/cpp/cute/0t_mma_atom.md @@ -208,9 +208,7 @@ Volta architecture implements an HMMA instruction where a group of 8 threads cal We first take a look at how we would take the ISA semantics of thread and data partitioning for the HMMA instruction, and encode it in a Traits struct. The HMMA NT instruction has the thread-data layout: -

- HMMA.8x8x4.NT.png -

+![HMMA.8x8x4.NT.png](../../../images/cute/HMMA.8x8x4.NT.png) ### Types @@ -250,9 +248,7 @@ Again, this layout function maps the logical thread id [0,8) of the MMA operatio Let us look at exactly how the 8 threads within a QP are mapped to the A, B and C matrices. For the C and D matrices, the above image is broken down a bit more below. On the left is shown the whole QP level view, and on the right is shown the values owned by just thread 0. -

- HMMA.8x8x4.quadpair.C.png -

+![HMMA.8x8x4.quadpair.C.png](../../../images/cute/HMMA.8x8x4.quadpair.C.png) The metainformation of this single instruction level view is what we want to encode in CuTe. Specifically, the QP level view in this diagram corresponds to the four MMA traits for [SM70_F32F16F16F32](https://github.com/NVIDIA/cutlass/tree/main/include/cute/arch/mma_sm70.hpp). These structs contain the `Element` types, the `Shape_MNK`, and the `ThrID` mapping we constructed above. Now, let us take a look at the definition of `CLayout`, the thread-data layout of accumulators. The job of `CLayout` is to construct a mapping between the `(logical_thr_id, logical_val_id)` and `(m, n)` coordinate in the C matrix which can then be used to build up more complicated layouts and operations like the 16x16x4 WMMA. @@ -320,9 +316,7 @@ In the case of F16 accumulators, the layout is way less complex. Each row of acc A and B matrix layouts depend on whether the sources are transposed or not. The diagram below shows the thread ID to data ownership map for A and B matrices in the case of NT and TN transposes. -

- HMMA.8x8x4.quadpair.AB.png -

+![HMMA.8x8x4.quadpair.AB.png](../../../images/cute/HMMA.8x8x4.quadpair.AB.png) Let's look at the TN layout for A matrix first (right side in the diagram). Again, there are the same 8 logical threads, but each thread owns only 4 elements this time. The shape of `ALayout` will then be `Shape<_8, _4>`. As for the strides, we again need a similar mapping between `(m, k) == m + k * M`. Looking down the `M` mode, we go from `(T0, V0)` to `(T1, V0)` which is a stride of 1 for all 8 threads. For the `K` mode, as we go across, we go from `(T0, V0)` to `(T0, V1)`, which makes a stride of 8 for all 4 values. Therefore, the A layout is: @@ -375,17 +369,13 @@ using ThrID = Layout<_128, _1>; Accumulators are mapped hierarchically in GMMA, starting from the concept of a core matrix and building up to a layout for the whole C matrix tile. Let's look at this core matrix first. We only consider fp16 accumulators here, but extensions of fp32 accumulators are trivial as we will see later. Each core matrix has the layout as shown in the diagram below. -

- gmma_coremat_cd_fp16.png -

+![gmma_coremat_cd_fp16.png](../../../images/cute/gmma_coremat_cd_fp16.png) As in the Volta examples, the thread IDs are logical only, and which of the four warps they belong to in the warpgroup is not important. Then GMMA tiles this core matrix first vertically along the M mode, and then repeats that column of core matrices along the N mode to construct the full MxN tile. This tiling is shown in the image below. -

- gmma_wg_n_slice.png -

+![gmma_wg_n_slice.png](../../../images/cute/gmma_wg_n_slice.png) With this image, we are again ready to start building the `CLayout` for `SM90_64x128x16_F16F16F16F16_TN` atom. Same as before, we are constructing a mapping between the `(logical_thr_id, logical_val_id) -> (m, n)` coordinate spaces. @@ -452,9 +442,7 @@ Let's start with `SM70_8x8x4_F32F16F16F32_NT`. MMA_Atom mma = MMA_Atom{}; print_latex(mma); ``` -

- HMMA.8x8x4.NT_Atom.png -

+![HMMA.8x8x4.NT_Atom.png](../../../images/cute/HMMA.8x8x4.NT_Atom.png) The above is equivalent to ```cpp @@ -472,9 +460,7 @@ We can create an object akin to a WMMA by using four of these quadpair MMAs: Stride<_2,_1>>{}); // 2x2 n-major layout of Atoms print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2.png -

+![HMMA.8x8x4.NT_2x2.png](../../../images/cute/HMMA.8x8x4.NT_2x2.png) This `TiledMMA` replicates the `MMA_Atom` across threads as we can see the `T4` and `T8` and `T12` threads in the `C`-matrix that were not used before. Each quadrant of the `C`-matrix is a replica of the atom's partitioning pattern for a new quadpair and this replication follows a `(2,2):(2,1)` layout. The above represents a 16x16x4 MMA now, but we can immediately expand this "tile size" up to 32x32x4 instead: @@ -485,9 +471,7 @@ The above represents a 16x16x4 MMA now, but we can immediately expand this "tile Tile<_32,_32,_4>{}); // 32x32x4 tiler print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2_32x32x4.png -

+![HMMA.8x8x4.NT_2x2_32x32x4.png](../../../images/cute/HMMA.8x8x4.NT_2x2_32x32x4.png) This `TiledMMA` replicates the previous `TiledMMA` across values instead of threads. We can see the `T0V8` and `T16V8` and `T8V8` values in the `C`-matrix that were not used before. Each quadrant of the `C`-matrix is a replica of the previous `TiledMMA`'s partitioning pattern for a new set of values. Continuing, we see that there are eight values that `T0` receives from the `A`-matrix. Those reads occur at coordinates @@ -513,9 +497,7 @@ which are separate, but we might prefer them to be next to each other. That is w _4>{}); // Permutation on K, size 4 identity print_latex(mma); ``` -

- HMMA.8x8x4.NT_2x2_32Mx32x4.png -

+![HMMA.8x8x4.NT_2x2_32Mx32x4.png](../../../images/cute/HMMA.8x8x4.NT_2x2_32Mx32x4.png) That layout `(4,4,2):(1,8,4)` is read like a scatter permutation, telling the m-coords of the original image where to go in the new image. ``` diff --git a/media/docs/cpp/cute/0x_gemm_tutorial.md b/media/docs/cpp/cute/0x_gemm_tutorial.md index 44ea84dc..38e57e4e 100644 --- a/media/docs/cpp/cute/0x_gemm_tutorial.md +++ b/media/docs/cpp/cute/0x_gemm_tutorial.md @@ -334,9 +334,7 @@ These thread layouts are then used to partition the tiles of data in global memo ``` where we've used the same projection-style interface to avoid applying the `N`-mode of `tC` to the `(BLK_M,BLK_K)` shape of `sA` and avoid applying the `M`-mode of `tC` to the `(BLK_N,BLK_K)` shape of `sB`. -

- tC_partitioning.png -

+![tC_partitioning.png](../../../images/cute/tC_partitioning.png) This diagram shows a `tC` layout, highlights two threads in green and blue, shows the projections of the `tC` layout, and finally highlights the subtensors within `sA`, `sB`, and `gC` that `tCsA`, `tCsB`, and `tCgC` represent. With the data partitioned across the threads, *every thread* can now participate in the compute step by writing @@ -390,9 +388,7 @@ As a first example, lets look at the `TiledCopy` that `gemm_nt` generates. print_latex(copyA); ``` The easiest way to see what this `TiledCopy` does is to look at the partition pattern in LaTeX. -

- TiledCopyA.png -

+![TiledCopyA.png](../../../images/cute/TiledCopyA.png) On the left is the source-tensor partitioning and on the right is the destination-tensor partitioning. The partition patterns are the same for this case, but there exist PTX instructions which require different patterns in the source and destination. The diagram shows that each thread reads 4x1 `TA` elements and there are 32x8 threads. The `UniversalCopy` forces the instruction to use a 128-bit copy instruction. If the partition (of `sA` or `gA` in this case) does not result in 4 `TA` elements that can be vectorized to a 128-bit load/store, then CuTe will statically fail with an error message to that effect. To use the `TiledCopy`, the kernel writes @@ -421,9 +417,7 @@ As a first example, lets look at the `TiledMMA` that `gemm_nt` generates. print_latex(mmaC); ``` The easiest way to see what this `TiledMMA` does is to look at the partition pattern in LaTeX. -

- TiledMmaC.png -

+![TiledMmaC.png](../../../images/cute/TiledMmaC.png) On the left is the A-tensor partitioning, on the top is the B-tensor partitioning, and in the middle is the C-tensor partitioning. Because the `UniversalFMA` is a 1x1x1 MMA instruction, a 16x16x1 tiling of them results in a 16x16x1 `TiledMMA`. Other MMA instructions will have different threads involved and have different instruction sizes. In this case, all threads will read a single element from `A`, `B`, and `C` each. To use the `TiledMMA`, the kernel writes diff --git a/media/docs/cpp/cute/0z_tma_tensors.md b/media/docs/cpp/cute/0z_tma_tensors.md index a7e2a012..4b9c0070 100644 --- a/media/docs/cpp/cute/0z_tma_tensors.md +++ b/media/docs/cpp/cute/0z_tma_tensors.md @@ -8,7 +8,7 @@ What is an `ArithTuple`? Are those tensor strides? What do those mean? What is t This documentation intends to answer those questions and introduce some of the more advanced features of CuTe. -# Introduction to TMA instructions +## Introduction to TMA instructions The Tensor Memory Accelerator (TMA) is a set of instructions for copying possibly multidimensional arrays between global and shared memory. TMA was introduced in the Hopper architecture. A single TMA instruction can copy an entire tile of data all at once. As a result, the hardware no longer needs to compute individual memory addresses and issue a separate copy instruction for each element of the tile. @@ -53,9 +53,9 @@ That means that an ordinary CuTe Tensor that stores a GMEM pointer and computes What do we do? -# Building a TMA Tensor +## Building a TMA Tensor -## Implicit CuTe Tensors +### Implicit CuTe Tensors All CuTe Tensors are compositions of Layouts and Iterators. An ordinary global memory tensor's iterator is its global memory pointer. However, a CuTe Tensor's iterator doesn't have to be a pointer; it can be any random-access iterator. @@ -83,7 +83,7 @@ This tensor maps logical coordinates to on-the-fly computed integers. 
Because it But the TMA doesn't consume pointers or integers, it consumes coordinates. Can we make a tensor of implicit TMA coordinates for the TMA instruction to consume? If so, then we could presumably also tile and partition and slice that tensor of coordinates so that we would always have the right TMA coordinate to give to the instruction. -## ArithTupleIterators and ArithTuples +### ArithTupleIterators and ArithTuples First, we build a `counting_iterator` equivalent for TMA coordinates. It should support @@ -110,7 +110,7 @@ In summary, one creates a TMA descriptor for the *whole global memory tensor*. T We can now track and offset TMA coordinates with this iterator, but how do we get CuTe Layouts to generate non-integer offsets? -## Strides aren't just integers +### Strides aren't just integers Ordinary tensors have a layout that maps a logical coordinate `(i,j)` into a 1-D linear index `k`. @@ -122,7 +122,7 @@ to a TMA coordinate, rather than to a 1-D linear index. To do this, we can abstract what a stride is. Strides need not be integers, but rather any algebraic object that supports inner-product with the integers (the logical coordinate). The obvious choice is the `ArithmeticTuple` we used earlier since they can be added to each other, but this time additionally equipped with an `operator*` so it can also be scaled by an integer. -### Aside: Integer-module strides +#### Aside: Integer-module strides A group of objects that support addition between elements and product between elements and integers is called an integer-module. @@ -133,7 +133,7 @@ Rank-R tuples of integers are an integer-module. In principle, layout strides may be any integer-module. -### Basis elements +#### Basis elements CuTe's basis elements live in the header file `cute/numeric/arithmetic_tuple.hpp`. To make it easy to create `ArithmeticTuple`s that can be used as strides, CuTe defines normalized basis elements using the `E` type alias. 
"Normalized" means that the scaling factor of the basis element is the compile-time integer 1. @@ -172,7 +172,7 @@ Intuitively, "compatible" means that the nested structure of the two basis elements matches well enough to add the two elements together. -### Linear combinations of strides +#### Linear combinations of strides Layouts work by taking the inner product of the natural coordinate with their strides. @@ -200,7 +200,7 @@ and can be interpreted as the coordinate `((7,4),23)`. Thus, linear combinations of these strides can be used to generate TMA coordinates. These coordinates, in turn, can be used to offset TMA coordinate iterators. -## Application to TMA Tensors +### Application to TMA Tensors Now we can build CuTe Tensors like the one seen in the introduction. @@ -230,7 +230,7 @@ ArithTuple(0,0) o (4,5):(_1@1,_1@0): (0,3) (1,3) (2,3) (3,3) (4,3) ``` -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/cute/index.rst b/media/docs/cpp/cute/index.rst index a6611dd7..4673d571 100644 --- a/media/docs/cpp/cute/index.rst +++ b/media/docs/cpp/cute/index.rst @@ -4,7 +4,7 @@ CuTe ==================== .. toctree:: - :maxdepth: 2 + :maxdepth: 1 00_quickstart<00_quickstart.md> 01_layout<01_layout.md> diff --git a/media/docs/cpp/cutlass_2x.rst b/media/docs/cpp/cutlass_2x.rst new file mode 100644 index 00000000..e3107c09 --- /dev/null +++ b/media/docs/cpp/cutlass_2x.rst @@ -0,0 +1,12 @@ +.. _cutlass_2_x: + +CUTLASS 2.x +================== + +.. toctree:: + :maxdepth: 2 + + Layouts and Tensors + GEMM API + Tile Iterator Concepts + Utilities diff --git a/media/docs/cpp/cutlass_3x.rst b/media/docs/cpp/cutlass_3x.rst new file mode 100644 index 00000000..2f4e50c7 --- /dev/null +++ b/media/docs/cpp/cutlass_3x.rst @@ -0,0 +1,11 @@ +.. _cutlass_3_x: + +CUTLASS 3.x +================== + +.. 
toctree:: + :maxdepth: 2 + + Design + GEMM Backwards Compatibility + GEMM API diff --git a/media/docs/cpp/cutlass_3x_backwards_compatibility.md b/media/docs/cpp/cutlass_3x_backwards_compatibility.md index 1dc42ef7..be9c50a1 100644 --- a/media/docs/cpp/cutlass_3x_backwards_compatibility.md +++ b/media/docs/cpp/cutlass_3x_backwards_compatibility.md @@ -438,7 +438,7 @@ obtain the kernel's configuration parameters. Users can use these to approximate for 3.0 API kernels. However, the reflective interfaces cannot always match the types exactly, as the mappings are not always bijective. -# Copyright +### Copyright Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/cutlass_3x_design.md b/media/docs/cpp/cutlass_3x_design.md index b1eed530..05e2c18d 100644 --- a/media/docs/cpp/cutlass_3x_design.md +++ b/media/docs/cpp/cutlass_3x_design.md @@ -114,7 +114,7 @@ In this way, CuTe reifies the thread-to-data-layout mapping, makes it easier to write code that is "correct by construction". If the code compiles, it's probably correct. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/functionality.md b/media/docs/cpp/functionality.md index 396db1fe..73454967 100644 --- a/media/docs/cpp/functionality.md +++ b/media/docs/cpp/functionality.md @@ -277,7 +277,7 @@ CUDA exposes warp-level matrix operations in the CUDA C++ WMMA API. The CUDA C++ | **B** | `RowMajor`, `ColumnMajor` | `RowMajor`, `ColumnMajor` | | **C** | `RowMajor`, `ColumnMajor` | `RowMajor`, `ColumnMajor` | -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/fundamental_types.md b/media/docs/cpp/fundamental_types.md index b29fb5bf..ece3de16 100644 --- a/media/docs/cpp/fundamental_types.md +++ b/media/docs/cpp/fundamental_types.md @@ -355,7 +355,7 @@ support on current and future NVIDIA GPUs. ``` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/gemm_api.md b/media/docs/cpp/gemm_api.md index fd8ecf5e..fdfc49b3 100644 --- a/media/docs/cpp/gemm_api.md +++ b/media/docs/cpp/gemm_api.md @@ -5,7 +5,7 @@ CUTLASS presents a uniform programming model for matrix multiply-accumulate operations at each level of the hierarchy. This document focuses on device-level, threadblock-level GEMMs, warp-level GEMMs, thread-level GEMMs, and instruction-level GEMMs. -# CUTLASS GEMM Model +## CUTLASS GEMM Model CUTLASS implements the basic GEMM triple loop nest with a tiled structure mirroring the execution model hierarchy. @@ -62,7 +62,7 @@ warp-synchronous matrix multiply instructions targeting Tensor Cores. Alternatively, GEMMs targeting single-thread instructions may have an additional series of nested loops corresponding to thread-level concurrency. -# CUTLASS GEMM Components +## CUTLASS GEMM Components This loop nest is expressed in CUTLASS via the following components which are specialized for data type, layout, and math instruction. @@ -71,7 +71,7 @@ math instruction. These components are described in the following sections. -## Device-wide GEMM API +### Device-wide GEMM API The device-level GEMM API is intended to streamline instantiation and execution of the standard GEMM computation across the GPU. 
This operator is intended to be used in host-side .cu code and @@ -119,7 +119,7 @@ The device-wide GEMM API is embodied by the following operators: ``` -## Threadblock-level GEMM API +### Threadblock-level GEMM API GEMMs at this scope are expected to efficiently load tiles of data from global memory into internal storage and then compute matrix products with warp-level GEMM operators. @@ -196,7 +196,7 @@ struct Mma { }; ``` -## Warp-level Matrix Multiply API +### Warp-level Matrix Multiply API Warp-level GEMM operators load tiles from shared memory into registers and then compute matrix multiplies using either Tensor Cores or CUDA Cores. The result is accumulated in a register tile. Iterators are defined for each @@ -416,7 +416,7 @@ class MmaSimt; ``` -## Thread-level GEMM API +### Thread-level GEMM API Thread-level GEMM operations perform matrix multiply-accumulate on data held in registers. These target CUDA Cores exclusively. @@ -502,7 +502,7 @@ struct Mma; } // namespace cutlass ``` -## Efficient Epilogue +### Efficient Epilogue CUTLASS GEMM operators perform mma followed by epilogue operation similar to cuBLAS. CUTLASS implements an efficient row-major epilogue. Thus, to achieve @@ -529,7 +529,7 @@ of input layouts. Thus, CUTLASS supports the following layout combinations for i - `{N,T} x {N,T} => {N,T}` - NN, TN, TN, TT GEMM for both row-major and column-major output -## Instruction-level operations +### Instruction-level operations CUTLASS defines a template-based interface to Tensor Core operations to avoid resorting to inline PTX. @@ -538,7 +538,7 @@ to inline PTX. - [mma_sm75.h](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/arch/mma_sm75.h) - Turing TensorCore operations -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/gemm_api_3x.md b/media/docs/cpp/gemm_api_3x.md index c643fafd..8f890a0d 100644 --- a/media/docs/cpp/gemm_api_3x.md +++ b/media/docs/cpp/gemm_api_3x.md @@ -19,7 +19,7 @@ Device, Kernel, and Collective. It also briefly discusses the Tiled MMA/Copy and Atom level, and then refers readers to CuTe's tutorial for more information. -# CUTLASS GEMM Model +## CUTLASS GEMM Model CUTLASS implements algorithms that express the classical "triply nested loop" GEMM algorithm @@ -80,7 +80,7 @@ and computes MMAs. These tiled copy and tiled mma iterations are generally fully static and get fully unrolled. -# CUTLASS GEMM Components +## CUTLASS GEMM Components CUTLASS expresses the above loop nest with the following components which are specialized for @@ -146,7 +146,7 @@ using GemmHandle = cutlass::gemm::device::GemmUniversalAdapter; Towards the end, we also briefly cover CuTe's tiled mma and copy as well as the atom layer APIs, before redirecting users to CuTe-specific documentation for further details. -## Collective API +### Collective API A Collective is "the largest collection of threads onto which mma atoms and copy atoms are tiled." @@ -670,7 +670,7 @@ please refer to CuTe's tutorial, e.g., the sections on * [a GEMM example](./cute/0x_gemm_tutorial.md). -# Copyright +### Copyright Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/getting_started.rst b/media/docs/cpp/getting_started.rst new file mode 100644 index 00000000..df34f3f6 --- /dev/null +++ b/media/docs/cpp/getting_started.rst @@ -0,0 +1,16 @@ +.. _getting_started: + +Getting Started +================== + +.. 
toctree:: + :maxdepth: 2 + + Quickstart + IDE Setup + Build + Functionality + Terminology + Fundamental Types + Programming Guidelines + diff --git a/media/docs/cpp/grouped_scheduler.md b/media/docs/cpp/grouped_scheduler.md index 333496f7..fab12062 100644 --- a/media/docs/cpp/grouped_scheduler.md +++ b/media/docs/cpp/grouped_scheduler.md @@ -1,6 +1,6 @@ ![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "CUTLASS Grouped Kernel Schedulers") -# CUTLASS Grouped Kernel Schedulers +# Grouped Kernel Schedulers CUTLASS's grouped kernel is a persistent kernel which launches multiple problems (e.g., GEMMs, SYR2Ks) within a single CUDA kernel launch. diff --git a/media/docs/cpp/ide_setup.md b/media/docs/cpp/ide_setup.md index 6a332b31..bad80bba 100644 --- a/media/docs/cpp/ide_setup.md +++ b/media/docs/cpp/ide_setup.md @@ -118,7 +118,7 @@ This is usually a convenient way to configure projects, but it's not as simple f clang doesn't understand many of the compiler flags used by nvcc. Hence, for now, we don't recommend using `compile_commands.json` to configure your CUDA project. -## Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/layout.md b/media/docs/cpp/layout.md index 5e1d4d29..1bc4d2c2 100644 --- a/media/docs/cpp/layout.md +++ b/media/docs/cpp/layout.md @@ -217,7 +217,7 @@ and `TensorRef` objects for each of the operands whose extents are implied as a redundant storage of extent quantities, CUTLASS minimizes capacity utilization of precious resources such as constant memory. This is consistent with BLAS conventions. 
-# Summary: +## Summary: The design patterns described in this document form a hierarchy: * `T *ptr;` is a pointer to a contiguous sequence of elements of type `T` @@ -225,7 +225,7 @@ The design patterns described in this document form a hierarchy: * `TensorRef ref(ptr, layout);` is an object pointing to an _unbounded_ tensor containing elements of type `T` and a layout of type `Layout` * `TensorView view(ref, extent);` is an object pointing to a _bounded_ tensor containing elements of type `T` and a layout of type `Layout` -# Appendix: Existing Layouts +### Appendix: Existing Layouts This section enumerates several existing Layout types defined in CUTLASS. @@ -268,7 +268,7 @@ Permuted Shared Memory Layouts: - `TensorOpCrosswise` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/overview.md b/media/docs/cpp/overview.md deleted file mode 100644 index 35d2aac1..00000000 --- a/media/docs/cpp/overview.md +++ /dev/null @@ -1,619 +0,0 @@ -![ALT](../../images/gemm-hierarchy-with-epilogue-no-labels.png "Complete CUDA GEMM decomposition") - -# Overview - -# CUTLASS 3.9.0 - -_CUTLASS 3.9.0 - March 2025_ - -CUTLASS is a collection of CUDA C++ template abstractions for implementing -high-performance matrix-matrix multiplication (GEMM) and related computations at all levels -and scales within CUDA. It incorporates strategies for hierarchical decomposition and -data movement similar to those used to implement cuBLAS and cuDNN. CUTLASS decomposes -these "moving parts" into reusable, modular software components abstracted by C++ template -classes. Primitives for different levels of a conceptual parallelization hierarchy -can be specialized and tuned via custom tiling sizes, data types, -and other algorithmic policy. The resulting flexibility simplifies their use -as building blocks within custom kernels and applications. 
- -To support a wide variety of applications, CUTLASS provides extensive support for -mixed-precision computations, providing specialized data-movement and -multiply-accumulate abstractions for FP64, FP32, TF32, FP16, BF16, -[FP32 emulation via tensor core instruction](https://github.com/NVIDIA/cutlass/tree/main/examples/27_ampere_3xtf32_fast_accurate_tensorop_gemm), - 8b floating point types (e5m2 and e4m3), - block scaled data types (NVIDIA NVFP4 and OCP standard MXFP4, MXFP6, MXFP8), - narrow integer types (4 and 8b signed and unsigned integers), - and binary 1b data types (where architectures allow for the -native support of such data types). -CUTLASS demonstrates optimal matrix multiply operations -targeting the programmable, high-throughput _Tensor Cores_ implemented by -NVIDIA's Volta, Turing, Ampere, Ada, Hopper, and Blackwell architectures. - -In addition to GEMMs, CUTLASS implements high-performance convolution via -the implicit GEMM algorithm. Implicit GEMM is the formulation of a convolution -operation as a GEMM thereby taking advantage of CUTLASS's modular GEMM pipeline. -This allows CUTLASS to build convolutions by reusing highly-optimized GEMM components. - -See the [Quick Start Guide](quickstart.md) to get started quickly. - -See the [functionality docs](functionality.md) for a more comprehensive -list of kernel level features, data types, instructions, and minimum supported by CUTLASS on each GPU -architecture. 
- -# What's New in CUTLASS 3.9 - -* Support for Blackwell SM120 kernels for GeForce GPUs in CUTLASS 3.x API: - - Collective mainloops that target for: - * [Blockscaled datatypes with support for dense GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_mma_tma.hpp) - * [Blockscaled datatypes with support for sparse GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/collective/sm120_blockscaled_sparse_mma_tma.hpp) - - New [GEMM](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/gemm/dispatch_policy.hpp) and [epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/dispatch_policy.hpp) dispatch policies for collectives, kernel layers, and builders. - - [Blackwell SM120 epilogue](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_visitor_store_tma_warpspecialized.hpp) and [full set of EVT fusions](https://github.com/NVIDIA/cutlass/tree/main/include/cutlass/epilogue/fusion/sm120_callbacks_tma_warpspecialized.hpp). -* Set of examples that demonstrate the usage of the 3.x API for targeting Blackwell SM120 architecture: - - [Blockscaled GEMM with NVFP4 input datatype and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79a_blackwell_geforce_nvfp4_bf16_gemm.cu). - - [Blockscaled GEMM with NVFP4 input datatype and NVFP4 output tensor with scale factor generation](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79b_blackwell_geforce_nvfp4_nvfp4_gemm.cu). - - [Blockscaled GEMM with mixed input datatype (MXFP8 and MXFP6) and BF16 output tensor](https://github.com/NVIDIA/cutlass/tree/main/examples/79_blackwell_geforce_gemm/79c_blackwell_geforce_mixed_mxfp8_mxfp6_bf16_gemm.cu). 
-* Set of unit tests that demonstrate the usage of both [sparse](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_sparse_tensorop_gemm/) and [dense](https://github.com/NVIDIA/cutlass/tree/main/test/unit/gemm/device/sm120_blockscaled_tensorop_gemm/) Blackwell SM120 blockscaled GEMM. -* Enhancement and new support of block-wise and group-wise GEMM for Hopper and Blackwell architectures: - - Enhancement of [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling.cu) for Hopper architecture. - - Enhancement of [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/67_hopper_fp8_warp_specialized_gemm_with_blockwise_scaling/67_hopper_fp8_warp_specialized_gemm_with_groupwise_scaling.cu) for Hopper architecture. - - Support for [grouped GEMM with blockwise and groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/68_hopper_fp8_warp_specialized_grouped_gemm_with_blockwise_scaling/) for Hopper architecture. - - Support for [blockwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_blockwise.cu) for Blackwell architecture. - - Support for [groupwise GEMM](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_gemm_groupwise.cu) for Blackwell architecture. - - Support for [grouped GEMM with blockwise](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_blockwise.cu) and [groupwise scaling](https://github.com/NVIDIA/cutlass/tree/main/examples/81_blackwell_gemm_blockwise/81_blackwell_grouped_gemm_groupwise.cu) for Blackwell architecture. 
-* Added support for enhanced kernel performance search (auto-tuning) in CUTLASS profiler: - - Sorting performance results by GFLOPs/second: Users can now sort the final performance report based on GFLOPs/second, making it easier to identify the most efficient kernels. - - Exhaustive search for best kernel performance in GFLOPs/second: The profiler now searches for the best-performing kernel across a range of problem sizes, swizzle sizes, rasterization orders, and dynamic cluster configurations to maximize performance. - - Performance search under a fixed GEMM shape: Enables exhaustive tuning within a fixed GEMM shape, exploring various kernel parameters to find the best configuration. - - More detailed introductions and examples to leverage this feature can be found in [profiler.md](./profiler.md#exhaustive-search-mode-and-top-k-output-ranking-according-to-performance-in-gflopss). - -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. -CUTLASS team is working on a fix. - -**See the [CHANGELOG](../release_notes.md) for details of all past releases and updates.** - -# Performance - -CUTLASS primitives are very efficient. When used to construct device-wide GEMM kernels, -they exhibit nearly optimal utilization of peak theoretical throughput. The figure below -shows CUTLASS 3.8's performance as a % of theoretical peak utilization -on various input and output data types when run on NVIDIA Blackwell SM100 architecture GPU. - -![ALT](../../images/cutlass-3.8-blackwell-gemm-peak-performance.svg "") - -The two figures below show the continual CUTLASS performance improvements -on an [NVIDIA H100](https://www.nvidia.com/en-us/data-center/h100/) (NVIDIA Hopper architecture) since -CUTLASS 3.1. -CUTLASS 3.5.1 was compiled with the [CUDA 12.5u1 Toolkit](https://developer.nvidia.com/cuda-downloads). 
-Tensor Core operations are implemented using CUDA's -[mma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma) and -[wgmma](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#asynchronous-warpgroup-level-matrix-instructions) instructions. - -![ALT](../../images/cutlass-3.5.1-gemm-peak-performance.png "") -![ALT](../../images/cutlass-3.5.1-gemm-peak-performance-fp8.png "") - -# CuTe - -CUTLASS 3.0 introduced a new core library, CuTe, to describe and manipulate tensors of threads and data. -CuTe is a collection of C++ CUDA template abstractions for -defining and operating on hierarchically multidimensional layouts of threads and data. -CuTe provides `Layout` and `Tensor` objects that compactly package the type, -shape, memory space, and layout of data, while performing the complicated indexing for the user. -This lets programmers focus on the logical descriptions of their algorithms while -CuTe does the mechanical bookkeeping for them. With these tools, we can quickly design, -implement, and modify all dense linear algebra operations. - -The core abstractions of CuTe are hierarchically multidimensional layouts -which can be composed with data arrays to represent tensors. -The representation of layouts is powerful enough to represent nearly -everything we need to implement efficient dense linear algebra. -Layouts can also be combined and manipulated via functional composition, on which we build a large set of common operations such as tiling and partitioning. - -CUTLASS 3.0 and beyond adopts CuTe throughout the GEMM hierarchy in its templates. -This greatly simplifies the design and improves code composability and readability. -More documentation specific to CuTe can be found in its -[dedicated documentation directory](cute/00_quickstart.md). 
- -# Compatibility - -Minimum requirements: - -- Architecture: Volta (compute capability 7.0) -- Compiler: Must support at least C++17 -- CUDA Toolkit version: 11.4 - -CUTLASS requires a C++17 host compiler and -performs best when built with the [**CUDA 12.8 Toolkit**](https://developer.nvidia.com/cuda-downloads). -It is also compatible with CUDA 11.4, CUDA 11.5, CUDA 11.6, CUDA 11.7, CUDA 11.8, and all other CUDA 12.x versions. - -## Operating Systems - -We have tested the following environments. - -|**Operating System** | **Compiler** | -|-----------------|----------| -| Ubuntu 18.04 | GCC 7.5.0 | -| Ubuntu 20.04 | GCC 10.3.0 | -| Ubuntu 22.04 | GCC 11.2.0 | - -Note: GCC 8.5.0 has known regressions regarding fold expressions and overloaded operators. Using GCC 7.5.0 or (preferred) GCC >= 9 is recommended. - -Note: CUTLASS 3.x builds are known to be down on Windows platforms for all CUDA toolkits. -CUTLASS team is working on a fix. - -## Hardware - -CUTLASS runs successfully on the following NVIDIA GPUs, and it is expected to be efficient on Volta, Turing, Ampere, Ada, and Hopper architecture based NVIDIA GPUs. - -|**GPU**|**CUDA Compute Capability**|**Minimum CUDA Toolkit Required by CUTLASS-3**| -|---|---|---| -|NVIDIA V100 Tensor Core GPU |7.0|11.4| -|NVIDIA TitanV |7.0|11.4| -|NVIDIA GeForce RTX 20x0 series |7.5|11.4| -|NVIDIA T4 |7.5|11.4| -|NVIDIA A100 Tensor Core GPU |8.0|11.4| -|NVIDIA A10 |8.6|11.4| -|NVIDIA GeForce RTX 30x0 series |8.6|11.4| -|NVIDIA GeForce RTX 40x0 series |8.9|11.8| -|NVIDIA L40 |8.9|11.8| -|NVIDIA H100 Tensor Core GPU |9.0|11.8| -|NVIDIA H200 Tensor Core GPU |9.0|11.8| -|NVIDIA B200 Tensor Core GPU |10.0|12.8| -|NVIDIA GeForce RTX 50x0 series |10.0|12.8| - -## Target Architecture - -In general, PTX code generated for one target architecture can be run on future architectures -(i.e., it is forward compatible). 
-However, CUDA 12.0 introduced the concept of "architecture-accelerated features" whose
-PTX does not have forward compatibility guarantees.
-Several Hopper and Blackwell PTX instructions fall under this category of
-architecture-accelerated features, and thus require a `sm_90a` or `sm_100a` target architecture
-(note the "a" appended). For more details on this and other architecture-accelerated instructions,
-please refer to the [CUDA Documentation](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#feature-availability).
-
-The target architecture information is passed on to CUTLASS via the cmake flag
-`CUTLASS_NVCC_ARCHS`. In order to maximize performance on Hopper GH100,
-users are required to build CUTLASS with `90a` as the target architecture.
-If a user accidentally builds a kernel which uses SM90a features
-(e.g. Hopper Tensor Core Instructions), using the SM90 target
-(note the lack of "a"), with either CUDA Toolkit 12 or 11.8,
-the kernel is expected to fail with a runtime error.
-
-```
-cmake .. -DCUTLASS_NVCC_ARCHS="90a"
-```
-Or
-
-```
-cmake .. -DCUTLASS_NVCC_ARCHS="100a"
-```
-
-Note: The NVIDIA Blackwell SM100 architecture used in the datacenter
-products has a different compute capability than the one underpinning
-NVIDIA Blackwell GeForce RTX 50 series GPUs. As a result, kernels
-compiled for Blackwell SM100 architecture with arch conditional features
-(using `sm_100a`) are not compatible with RTX 50 series GPUs.
-
-Please refer to the [functionality documentation](functionality.md)
-for details on which kernels require which target architectures.
-
-# Documentation
-
-CUTLASS is described in the following documents and the accompanying
-[Doxygen documentation](https://nvidia.github.io/cutlass).
- -- [Quick Start Guide](quickstart.md) - basics of building and running CUTLASS -- [Functionality](functionality.md) - summarizes functionality available in CUTLASS -- [Efficient GEMM in CUDA](efficient_gemm.md) - describes how GEMM kernels may be implemented efficiently in CUDA -- [CUTLASS 3.x Design](cutlass_3x_design.md) - describes the CUTLASS 3.x design, its benefits, and how CuTe enables us to write much more composable components -- [GEMM API 3.x](gemm_api_3x.md) - describes the CUTLASS 3.x GEMM model and C++ template concepts -- [GEMM API 2.x](gemm_api.md) - describes the CUTLASS 2.x GEMM model and C++ template concepts -- [Implicit GEMM Convolution](implicit_gemm_convolution.md) - describes 2-D and 3-D convolution in CUTLASS -- [Code Organization](code_organization.md) - describes the organization and contents of the CUTLASS project -- [Terminology](terminology.md) - describes terms used in the code -- [Programming Guidelines](programming_guidelines.md) - guidelines for writing efficient modern CUDA C++ -- [Fundamental types](fundamental_types.md) - describes basic C++ classes used in CUTLASS to represent numeric quantities and arrays -- [Layouts](layout.md) - describes layouts of matrices and tensors in memory -- [Tile Iterators](tile_iterator_concept.md) - describes C++ concepts for iterating over tiles of matrices in memory -- [CUTLASS Profiler](profiler.md) - command-line driven profiling application -- [CUTLASS Utilities](utilities.md) - additional templates used to facilitate rapid development -- [Dependent kernel launch](dependent_kernel_launch.md) - describes a new feature in Hopper which allows overlapping dependent -kernels in the same stream, and how it is used in CUTLASS. 
-
-# Resources
-We have also described the structure of an efficient GEMM in our talk at the
-[GPU Technology Conference 2018](http://on-demand.gputechconf.com/gtc/2018/presentation/s8854-cutlass-software-primitives-for-dense-linear-algebra-at-all-levels-and-scales-within-cuda.pdf).
-
-- [CUTLASS: Software Primitives for Dense Linear Algebra at All Levels and Scales within CUDA](https://www.nvidia.com/en-us/on-demand/session/gtcsiliconvalley2018-s8854/)
-- [Developing CUDA Kernels to Push Tensor Cores to the Absolute Limit on NVIDIA A100](https://www.nvidia.com/en-us/on-demand/session/gtcsj20-s21745/)
-- [Accelerating Convolution with Tensor Cores in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31883/)
-- [Accelerating Backward Data Gradient by Increasing Tensor Core Utilization in CUTLASS](https://www.nvidia.com/en-us/on-demand/session/gtcspring22-s41996/)
-- [CUTLASS: Python API, Enhancements, and NVIDIA Hopper](https://www.nvidia.com/en-us/on-demand/session/gtcfall22-a41131/)
-
-# Building CUTLASS
-
-CUTLASS is a header-only template library and does not need to be built to be used by other
-projects. Client applications should target CUTLASS's `include/` directory in their include
-paths.
-
-CUTLASS unit tests, examples, and utilities can be built with CMake.
-The minimum version of CMake is given in the [Quickstart guide](quickstart.md).
-Make sure the `CUDACXX` environment variable points to NVCC in the CUDA Toolkit installed
-on your system.
-
-```bash
-$ export CUDACXX=${CUDA_INSTALL_PATH}/bin/nvcc
-```
-
-Create a build directory within the CUTLASS project, then run CMake. By default CUTLASS will build kernels
-for CUDA architecture versions 5.0, 6.0, 6.1, 7.0, 7.5, 8.0, 8.6, 8.9, and 9.0.
-To reduce compile time you can specify
-the architectures to build CUTLASS for by changing the CMake configuration setting
-`CUTLASS_NVCC_ARCHS`.
-
-```bash
-$ mkdir build && cd build
-
-$ cmake ..
-DCUTLASS_NVCC_ARCHS=80 # compiles for NVIDIA's Ampere Architecture -``` - -From the `build/` directory, compile and run the CUTLASS unit tests by building the target `test_unit` with make. - -The unit tests are organized as several binaries mirroring the top-level namespaces of CUTLASS, -and they may be executed in parallel via make's `-j` command line argument. - -```bash -$ make test_unit -j -... -... -... -[----------] Global test environment tear-down -[==========] 946 tests from 57 test cases ran. (10812 ms total) -[ PASSED ] 946 tests. -``` - -All tests should pass on supported platforms, though the exact number of tests may vary over time. - - -# Project Structure - -CUTLASS is arranged as a header-only library along with Utilities, Tools, Examples, and unit tests. -[Doxygen documentation](https://nvidia.github.io/cutlass) provides a complete list of files, classes, -and template concepts defined in the CUTLASS project. - -A detailed explanation of the source code organization may be found in the -[CUTLASS documentation](code_organization.md), but several main components are summarized below. 
- -## CUTLASS Template Library - -``` -include/ # client applications should target this directory in their build's include paths - - cutlass/ # CUDA Templates for Linear Algebra Subroutines and Solvers - headers only - - arch/ # direct exposure of architecture features (including instruction-level GEMMs) - - conv/ # code specialized for convolution - - epilogue/ # code specialized for the epilogue of gemm/convolution - - gemm/ # code specialized for general matrix product computations - - layout/ # layout definitions for matrices, tensors, and other mathematical objects in memory - - platform/ # CUDA-capable Standard Library components - - reduction/ # bandwidth-limited reduction kernels that do not fit the "gemm" model - - thread/ # simt code that can be performed within a CUDA thread - - transform/ # code specialized for layout, type, and domain transformations - - * # core vocabulary types, containers, and basic numeric operations - - cute/ # CuTe Layout, layout algebra, MMA/Copy atoms, tiled MMA/Copy - - algorithm/ # Definitions of core operations such as copy, gemm, and operations on cute::tuples - - arch/ # Bare bones PTX wrapper structs for copy and math instructions - - atom/ # Meta-information either link to or built from arch/ operators - - mma_atom.hpp # cute::Mma_Atom and cute::TiledMma - - copy_atom.hpp # cute::Copy_Atom and cute::TiledCopy - - *sm*.hpp # Arch specific meta-information for copy and math operations - - * # Core library types such as Shape, Stride, Layout, Tensor, and associated operations - -``` - -### CUTLASS SDK Examples - -[CUTLASS SDK examples](https://github.com/NVIDIA/cutlass/tree/main/examples) apply CUTLASS templates to implement basic computations. 
- -### Tools - -``` -tools/ - library/ # CUTLASS Instance Library - contains instantiations of all supported CUTLASS templates - include/ - cutlass/ - library/ - - profiler/ # CUTLASS Profiler - command-line utility for executing operations in the - # CUTLASS Library - - util/ # CUTLASS Utilities - contains numerous helper classes for - include/ # manging tensors in device memory, reference - cutlass/ # implementations for GEMM, random initialization - util/ # of tensors, and I/O. -``` - -### Test - -The `test/unit/` directory consist of unit tests implemented with Google Test that demonstrate -basic usage of Core API components and complete tests of the CUTLASS GEMM computations. - -Instructions for building and running the Unit tests are described in the [Quickstart guide](quickstart.md). - -# Performance Profiling - -The `tools/profiler/` directory contains a command-line utility for launching each of the GEMM kernels. -It can be built as follows: - -```bash -$ make cutlass_profiler -j16 -``` -## Building all GEMM and Convolution kernels (_long_ build times) - -By default, only one tile size is instantiated for each data type, math instruction, and layout. -To instantiate all, set the following environment variable when running CMake from an empty `build/` directory. -Beware, this results in *tens of thousands* of kernels and long build times. -This would also result in a large binary size and on some platforms linker to fail on building the library. -Therefore, it's highly recommended to generate only a subset of kernels as demonstrated in the sub-section below. -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS=90a -DCUTLASS_LIBRARY_KERNELS=all -... -$ make cutlass_profiler -j16 -``` - -## Building a subset of GEMM and Convolution kernels (_reduced_ build times) - -To compile strictly one kernel or a small set of kernels, a comma-delimited list of kernel names with -wildcard characters may be used to reduce the set of kernels. 
The following examples show building exactly one -or a subset of kernels for NVIDIA Ampere and Turing architecture: - -### Building a subset Tensor Core GEMM kernels - -To compile a subset of Tensor Core GEMM kernels with FP32 accumulation and FP16 input targeting NVIDIA Ampere and Turing architecture, -use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*gemm_f16_*_nt_align8 -... -$ make cutlass_profiler -j16 -``` - -Example command line for profiling a subset of Tensor Core GEMM kernels is as follows: -```bash -./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*gemm_f16_*_nt_align8 --m=3456 --n=4096 --k=4096 - -... -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: gemm - Operation: cutlass_tensorop_s1688gemm_f16_256x128_32x2_nt_align8 - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - cuBLAS: Passed - - Arguments: --gemm_kind=universal --m=3456 --n=4096 --k=4096 --A=f16:column --B=f16:row --C=f32:column --alpha=1 \ - --beta=0 --split_k_slices=1 --batch_count=1 --op_class=tensorop --accum=f32 --cta_m=256 --cta_n=128 \ - --cta_k=32 --stages=2 --warps_m=4 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=8 --min_cc=75 \ - --max_cc=1024 - - Bytes: 118489088 bytes - FLOPs: 115992428544 flops - - Runtime: 1.55948 ms - Memory: 70.7616 GiB/s - - Math: 74378.8 GFLOP/s - - - -============================= -... -``` - -### Building one CUDA Core GEMM kernel - -To compile one SGEMM kernel targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sgemm_128x128_8x2_nn_align1 -... 
-$ make cutlass_profiler -j16 -``` - -Example command line for profiling single SGEMM CUDA kernel is as follows: -```bash -$ ./tools/profiler/cutlass_profiler --kernels=sgemm --m=3456 --n=4096 --k=4096 - -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: gemm - Operation: cutlass_simt_sgemm_128x128_8x2_nn_align1 - - Status: Success - Verification: ON - Disposition: Passed - - cuBLAS: Passed - - Arguments: --m=3456 --n=4096 --k=4096 --A=f32:column --B=f32:column --C=f32:column --alpha=1 --beta=0 --split_k_slices=1 \ - --batch_count=1 --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ - --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - - Bytes: 180355072 bytes - FLOPs: 115992428544 flops - - Runtime: 6.73655 ms - Memory: 24.934 GiB/s - - Math: 17218.4 GFLOP/s - -============================= -``` - -### Building a subset of Tensor Core Convolution kernels - -To compile a subset of Tensor core convolution kernels implementing forward propagation (fprop) with FP32 accumulation -and FP16 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_tensorop_s*fprop_optimized_f16 -... -$ make cutlass_profiler -j16 -``` - -Example command line for profiling a subset of Tensor Core convolution kernels is as follows: - -```bash -$ ./tools/profiler/cutlass_profiler --kernels=cutlass_tensorop_s*fprop_optimized_f16 --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 - -... 
-============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: conv2d - Operation: cutlass_tensorop_s16816fprop_optimized_f16_128x128_32x5_nhwc - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - - Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ - --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f16:nhwc --Filter=f16:nhwc --Output=f32:nhwc \ - --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ - --eq_gemm_provider=none --op_class=tensorop --accum=f32 --cta_m=128 --cta_n=128 --cta_k=32 --stages=5 \ - --warps_m=2 --warps_n=2 --warps_k=1 --inst_m=16 --inst_n=8 --inst_k=16 --min_cc=80 --max_cc=1024 - - Bytes: 1130659840 bytes - FLOPs: 118482796544 flops - - Runtime: 0.711496 ms - Memory: 1479.99 GiB/s - - Math: 166526 GFLOP/s - -============================= -... -``` - - -### Building one Convolution CUDA kernel - -To compile and run one CUDA Core convolution kernel implementing forward propagation (fprop) with F32 accumulation -and FP32 input targeting NVIDIA Ampere and Turing architecture, use the below cmake command line: -```bash -$ cmake .. -DCUTLASS_NVCC_ARCHS='75;80' -DCUTLASS_LIBRARY_KERNELS=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc -... 
-$ make cutlass_profiler -j16 -``` - -Example command line for profiling one CUDA Core convolution kernel: - -```bash -$ ./tools/profiler/cutlass_profiler --kernels=cutlass_simt_sfprop_optimized_128x128_8x2_nhwc --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 - - -============================= - Problem ID: 1 - - Provider: CUTLASS - OperationKind: conv2d - Operation: cutlass_simt_sfprop_optimized_128x128_8x2_nhwc - - Status: Success - Verification: ON - Disposition: Passed - -reference_device: Passed - - Arguments: --conv_kind=fprop --n=8 --h=224 --w=224 --c=128 --k=128 --r=3 --s=3 --p=224 --q=224 --pad_h=1 --pad_w=1 \ - --stride_h=1 --stride_w=1 --dilation_h=1 --dilation_w=1 --Activation=f32:nhwc --Filter=f32:nhwc --Output=f32:nhwc \ - --conv_mode=cross --iterator_algorithm=optimized --alpha=1 --beta=0 --split_k_mode=serial --split_k_slices=1 \ - --eq_gemm_provider=none --op_class=simt --accum=f32 --cta_m=128 --cta_n=128 --cta_k=8 --stages=2 --warps_m=4 \ - --warps_n=2 --warps_k=1 --inst_m=1 --inst_n=1 --inst_k=1 --min_cc=50 --max_cc=1024 - - Bytes: 2055798784 bytes - FLOPs: 118482796544 flops - - Runtime: 7.34266 ms - Memory: 260.752 GiB/s - - Math: 16136.2 GFLOP/s - - -============================= - -``` - -## More Details on Compiling CUTLASS Kernels and CUTLASS Profiler -- Please follow the links for more CMake examples on selectively compiling CUTLASS kernels: - - [GEMM CMake Examples](quickstart.md#gemm-cmake-examples) - - [Implicit GEMM convolution CMake Examples](quickstart.md#convolution-cmake-examples) -- [Further details about the CUTLASS Profiler are described here.](profiler.md) - - -# About - -CUTLASS is released by NVIDIA Corporation as Open Source software under the -[3-clause "New" BSD license](LICENSE.txt). - -# Contributors - -The official list of CUTLASS developers and contributors is available here: [CONTRIBUTORS](CONTRIBUTORS.md). - -# Copyright - -Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-SPDX-License-Identifier: BSD-3-Clause - -``` - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -``` diff --git a/media/docs/cpp/profiler.md b/media/docs/cpp/profiler.md index 58088dff..22f88485 100644 --- a/media/docs/cpp/profiler.md +++ b/media/docs/cpp/profiler.md @@ -45,7 +45,7 @@ compile or fail to launch at runtime. ```bash $ cmake .. 
\ -DCUTLASS_NVCC_ARCHS="90a" \ - -DCUTLASS_LIBRARY_KERNELS="cutlass3x_sm90_tensorop_s64x64x16gemm_f16_f16_f32_void_f32_*" \ + -DCUTLASS_LIBRARY_KERNELS="cutlass3x_sm90_tensorop_gemm_f16_f16_f32_void_f32_*" \ -DCUTLASS_LIBRARY_INSTANTIATION_LEVEL="max" \ -DCUTLASS_UNITY_BUILD_ENABLED=ON ``` @@ -525,7 +525,7 @@ To best illustrate this naming convention, we will walk through the meaning of e in a GEMM kernel used by the profiler: ``` -cutlass3x_sm90_tensorop_s64x128x16gemm_f16_f16_f32_f16_f32_{optional-mixed-dtype-config}_128x128x64_2x1x1_0_ntn_align8 +cutlass3x_sm90_tensorop_gemm_f16_f16_f32_f16_f32_{optional-mixed-dtype-config}_128x128x64_2x1x1_0_ntn_align8 ``` The components within this name are as follows: @@ -553,7 +553,7 @@ Note that in some special cases where the input A/B types do not match that of t instruction's, the MMA facing input type is added to the instruction string as well. ``` -cutlass3x_sm90_tensorop_s64x128x8tf32gemm_f32_f32_f32_f32_f32_128x128x32_2x1x1_0_tnn_align4 +cutlass3x_sm90_tensorop_tf32gemm_f32_f32_f32_f32_f32_128x128x32_2x1x1_0_tnn_align4 ``` * `s64x128x8tf32gemm`: indicates that the MMA consumes inputs in `tf32` format, and therefore @@ -563,7 +563,7 @@ For custom mainloop or epilogue schedules, details of the opted-in schedule are kernel name. For example, ``` -cutlass3x_sm90_tensorop_h64x128x16gemm_f16_f16_f16_void_f16_128x128x64_1x1x1_0_nnn_align8_warpspecialized_cooperative_epi_tma +cutlass3x_sm90_tensorop_gemm_f16_f16_f16_void_f16_128x128x64_1x1x1_0_nnn_align8_warpspecialized_cooperative_epi_tma ``` * `warpspecialized_cooperative`: Mainloop employs a persistent warp-specialized mainloop and kernel schedule. diff --git a/media/docs/cpp/programming_guidelines.md b/media/docs/cpp/programming_guidelines.md index b85108d9..5aa59744 100644 --- a/media/docs/cpp/programming_guidelines.md +++ b/media/docs/cpp/programming_guidelines.md @@ -1157,7 +1157,7 @@ has shape `((X, Y), K)` and stride `((1, X), X*Y)`. 
`get<0>(stride)` is the tuple `(1, X)`, not a single integer. However, A is certainly M major if interpreted as a matrix. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/quickstart.md b/media/docs/cpp/quickstart.md index b728f7ed..388d2751 100644 --- a/media/docs/cpp/quickstart.md +++ b/media/docs/cpp/quickstart.md @@ -462,7 +462,7 @@ int main(int argc, char const **args) { } ``` -# CUTLASS Library +## CUTLASS Library The [CUTLASS Library](https://github.com/NVIDIA/cutlass/tree/main/tools/library) defines an API for managing and executing collections of compiled kernel instances and launching them from host code without template instantiations in client code. @@ -585,7 +585,7 @@ int main() { } ``` -# Example CMake Commands +## Example CMake Commands To instantiate all operations supporting all tile sizes, data types, and alignment constraints, specify `-DCUTLASS_LIBRARY_KERNELS=all` when running `cmake`. @@ -750,7 +750,7 @@ are needed in the mainloop builder: We encourage a user to refer to Sm100 unit tests and the generated profiler-based kernels as more comprehensive samples. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/terminology.md b/media/docs/cpp/terminology.md index 1c5d31ea..6ec6158c 100644 --- a/media/docs/cpp/terminology.md +++ b/media/docs/cpp/terminology.md @@ -78,7 +78,10 @@ replaced by [MMA and Copy atoms from CuTe](cute/0t_mma_atom.md). **Thread Map**: abstraction for defining how threads are mapped to a given tile. Deprecated starting CUTLASS 3.0. Replaced by `cute::Layout` in equivalent usage scenarios to represent thread tensors. -# Copyright +[comment]: <> (Don't remove this. This "##" is to prevent Sphinx from throwing build WARNING.) 
+## + +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/tile_iterator_concept.md b/media/docs/cpp/tile_iterator_concept.md index 63a3eb0b..0da69d7c 100644 --- a/media/docs/cpp/tile_iterator_concept.md +++ b/media/docs/cpp/tile_iterator_concept.md @@ -469,7 +469,7 @@ struct WriteableReadableRandomAccessContiguousTileIteratorConcept { }; ``` -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/cpp/utilities.md b/media/docs/cpp/utilities.md index b6dffe05..ab45b617 100644 --- a/media/docs/cpp/utilities.md +++ b/media/docs/cpp/utilities.md @@ -431,7 +431,7 @@ Additional information may appear at the end of each line, such as shared memory Please note that `synclog` is an experimental feature, and its functionality is not always guaranteed. We encourage its use in custom kernels and CUTLASS examples, though it is known to be incompatible with profiler kernels. -# Copyright +### Copyright Copyright (c) 2017 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: BSD-3-Clause diff --git a/media/docs/pythonDSL/cute_dsl.rst b/media/docs/pythonDSL/cute_dsl.rst new file mode 100644 index 00000000..71fa4f7f --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl.rst @@ -0,0 +1,18 @@ +.. _cute_dsl: + +CuTe DSL +======== + +.. toctree:: + :maxdepth: 1 + + DSL Introduction + DSL Code Generation + DSL Control Flow + DSL JIT Argument Generation + DSL JIT Argument: Layouts + DSL JIT Caching + Integration with Frameworks + Debugging with the DSL + Autotuning with the DSL + Educational Notebooks diff --git a/media/docs/pythonDSL/cute_dsl_api.rst b/media/docs/pythonDSL/cute_dsl_api.rst new file mode 100644 index 00000000..c4726eb3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api.rst @@ -0,0 +1,12 @@ +.. 
_cute_dsl_api: + +CuTe DSL API +============ + +.. toctree:: + :maxdepth: 1 + + cute + cute_arch + cute_nvgpu + utils diff --git a/media/docs/pythonDSL/cute_dsl_api/cute.rst b/media/docs/pythonDSL/cute_dsl_api/cute.rst new file mode 100644 index 00000000..bd5d5c56 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute.rst @@ -0,0 +1,11 @@ +.. _cute: + +cutlass.cute +============ + +.. automodule:: cutlass.cute + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst b/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst new file mode 100644 index 00000000..4e2d4d0d --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_arch.rst @@ -0,0 +1,24 @@ +.. _cute_arch: + +cutlass.cute.arch +================= + +The ``cute.arch`` module contains wrappers around NVVM-level MLIR Op builders that seamlessly +inter-operate with the Python types used in CUTLASS Python. Another benefit of wrapping these Op +builders is that the source location can be tracked with the ``@dsl_user_op`` decorator. Available +functions include + +- basic API like ``thr_idx``; +- functions related to the direct management of mbarriers; +- low-level SMEM management (prefer using the ``SmemAllocator`` class); +- TMEM management. + +API documentation +----------------- + +.. automodule:: cutlass.cute.arch + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst new file mode 100644 index 00000000..4f5d18ae --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu.rst @@ -0,0 +1,18 @@ +.. _cute_nvgpu: + +cutlass.cute.nvgpu +================== + +The ``cute.nvgpu`` module contains MMA and Copy Operations as well as Operation-specific helper +functions. 
The arch-agnostic Operations are exposed at the top-level while arch-specific Operations +are grouped into submodules like ``tcgen05``. + +.. toctree:: + :maxdepth: 2 + :hidden: + + cute_nvgpu_common + cute_nvgpu_warp + cute_nvgpu_warpgroup + cute_nvgpu_cpasync + cute_nvgpu_tcgen05 diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst new file mode 100644 index 00000000..fd1013ed --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_common.rst @@ -0,0 +1,9 @@ +.. _cute_nvgpu_common: + +Common +====== + +.. automodule:: cutlass.cute.nvgpu + :members: + :undoc-members: + :show-inheritance: diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst new file mode 100644 index 00000000..84c22871 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_cpasync.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_cpasync: + +cpasync submodule +================= + +.. automodule:: cutlass.cute.nvgpu.cpasync + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst new file mode 100644 index 00000000..ee2c6f35 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_tcgen05.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_tcgen05: + +tcgen05 submodule +================= + +.. automodule:: cutlass.cute.nvgpu.tcgen05 + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst new file mode 100644 index 00000000..bda907f4 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warp.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_warp: + +warp submodule +============== + +.. 
automodule:: cutlass.cute.nvgpu.warp + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst new file mode 100644 index 00000000..441f2305 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/cute_nvgpu_warpgroup.rst @@ -0,0 +1,10 @@ +.. _cute_nvgpu_warpgroup: + +warpgroup submodule +=================== + +.. automodule:: cutlass.cute.nvgpu.warpgroup + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ diff --git a/media/docs/pythonDSL/cute_dsl_api/utils.rst b/media/docs/pythonDSL/cute_dsl_api/utils.rst new file mode 100644 index 00000000..086bef60 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_api/utils.rst @@ -0,0 +1,9 @@ +cutlass.utils +============= + +.. automodule:: cutlass.utils + :members: + :undoc-members: + :show-inheritance: + :special-members: __init__ + :private-members: diff --git a/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst b/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst new file mode 100644 index 00000000..db76c8a7 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/autotuning_gemm.rst @@ -0,0 +1,154 @@ +.. _autotuning_gemm: + +Guidance for Auto-Tuning +============================= + +.. contents:: Table of Contents + :depth: 2 + :local: + +Numerous GEMM kernel code examples are offered within our codebase. +When integrating these kernels into frameworks, auto-tuning becomes essential +for achieving optimal performance. This involves selecting the appropriate +kernel parameters based on the inputs of real applications. +Next, we'll briefly introduce some tips on how to perform auto-tuning. + +The auto-tuning process typically involves the following steps: + +1. Define search space +2. Benchmark each configuration and select the kernel with the best performance +3. 
Enable caching to reduce the tuning cost + +The search space defines the valid combinations of kernel parameters that can be used to run the kernels. +Different inputs (shapes, data types, etc.) typically require different kernel parameters to achieve optimal performance. +The search space is related to the kernel. We take the Blackwell GEMM persistent kernel as an example. +The search space is as follows: + +- ``mma_tiler_mn``: Defines the dimensions of the matrix tile that each Matrix Multiply-Accumulate (MMA) instruction processes in a single operation. +- ``cluster_shape_mn``: Specifies the number of CTAs along each dimension within a cluster. Refer `Parallel Thread Execution ISA documentation `_ for the possible mma tiler size and cluster shape for different tensor data types. +- ``use_2cta_instrs``: Whether to utilize Blackwell's 2 CTA instructions for MMA/Copy. +- ``use_tma_store``: Whether to use Tensor Memory Access (TMA) instructions to store the result back to global memory. + +After defining the search space, we could traverse all parameter combinations to find the optimal kernel. +The ``autotune_gemm`` function below demonstrates a simple exhaustive search approach - it iterates +through configurations, compiles and benchmarks each kernel, and returns the best performing one. +Since kernel compilation incurs overhead, it's important to cache and reuse compiled kernels +to minimize host launch latency. CuTe DSL facilitates this through its separate compilation +and execution workflow. More details can be found in :ref:`JIT_Caching`. +As demonstrated in the ``autotune_gemm`` function +(between the ``begin of cache the compiled GEMM kernel`` and ``end of cache the compiled GEMM kernel`` comments), +we can use ``cute.compile()`` to compile a kernel once, cache the compiled result, and reuse the cached JIT executor for multiple kernel +executions. 
We could maintain a global configuration-to-kernel dictionary (``config_kernel_dict``) to cache the compiled GEMM kernels, +where each key (``kernel_cache_key``) uniquely identifies a kernel based on its characteristics. +Usually we could use the {dtype + kernel configs} as the cached key for GEMM compilation. For example, + +.. code-block:: python + + kernel_cache_key = f"{ab_dtype}x{c_dtype}x{acc_dtype}x{use_2cta_instrs}x{mma_tiler}x{cluster_shape_mn}x{use_tma_store}" + +If the input tensor's layout is static, we should add the shape in the cached key too. +Users can customize the ``benchmark`` function to measure kernel execution time. +For stable and reliable performance measurements: + +1. Run a few warmup iterations (e.g., 5-10) to stabilize GPU temperature +2. Execute multiple timed iterations (e.g., 100-1000) for statistical significance +3. Use CUDA events and synchronization for precise timing +4. Lock GPU frequencies (SM and memory frequencies) with nvidia-smi +5. Process results by removing outliers and using min/avg statistics as measurements. + +This ensures reliable kernel selection through proper benchmarking. + +.. 
code-block:: python + + # get the best GEMM kernel for given input tensors + def autotune_gemm( + a: cute.Tensor, + b: cute.Tensor, + c: cute.Tensor, + stream: cuda.CUstream, + use_2cta_instrs_list: List[bool] = [True], + use_tma_store_list: List[bool] = [True], + mma_tiler_m_list: List[int] = [256], + mma_tiler_n_list: List[int] = [256], + cluster_shape_m_list: List[int] = [2], + cluster_shape_n_list: List[int] = [1], + ): + best_kernel = None + min_time = float("inf") + # traverse the search space + for use_2cta_instrs in use_2cta_instrs_list: + for use_tma_store in use_tma_store_list: + for mma_tiler_mn in product(mma_tiler_m_list, mma_tiler_n_list): + for cluster_shape_mn in product(cluster_shape_m_list, cluster_shape_n_list): + acc_dtype = cutlass.Float32 + hardware_info = cutlass.utils.HardwareInfo() + max_active_clusters = hardware_info.get_max_active_clusters( + cluster_shape_mn[0] * cluster_shape_mn[1] + ) + # instance a GEMM kernel + gemm = PersistentDenseGemmKernel( + acc_dtype, + use_2cta_instrs, + mma_tiler_mn, + cluster_shape_mn, + use_tma_store, + ) + # begin of cache the compiled GEMM kernel + if kernel_cache_key not in config_kernel_dict: + # compile gemm kernel + compiled_gemm = cute.compile( + gemm, + a, + b, + c, + max_active_clusters, + stream, + ) + config_kernel_dict[kernel_cache_key] = compiled_gemm + else: + compiled_gemm = config_kernel_dict[kernel_cache_key] + # end of cache the compiled GEMM kernel + try: + # define a benchmark function to measure the execution time of the compiled GEMM kernel + cur_time = benchmark( + partial(compiled_gemm, a, b, c, stream), + ) + except Exception as e: + print(f"Execution error: {e}") + cur_time = float("inf") + if cur_time < min_time: + min_time = cur_time + best_kernel = compiled_gemm + if best_kernel is None: + raise ValueError("No best kernel found") + return best_kernel + +This brute-force approach ensures we could find the optimal parameters, though at the cost of trying every possibilities. 
+ +For more advanced use cases, users can explore sophisticated optimization +techniques like search space pruning and genetic algorithms to reduce tuning overhead and discover better +configurations more efficiently. + +To further optimize tuning performance, we can utilize caching mechanisms to avoid redundant computations. +We could cache the tuning results in an input-to-kernel dictionary (e.g., ``input_kernel_dict``). +When processing inputs with matching ``config_key`` values, the cached kernel can be reused directly without re-tuning. +The ``config_key`` is related to the input tensor's characteristics, such as the shape, data type, etc. +The setup of ``config_key`` is very flexible; users can customize it based on their own application. +For instance, if the data type is fixed in the user's application, we could use the input tensor's shape as the key, i.e., ``(m, n, k)``. +To further reduce tuning overhead, we could consider using a simplified key like ``config_key = (power_of_2(m), power_of_2(n), power_of_2(k))``, +where ``m``, ``n``, and ``k`` are rounded up to the nearest power of 2. This simplification can significantly reduce the number +of unique keys while still maintaining good performance in most cases. However, it's important to validate that this +approximation doesn't negatively impact performance for your specific use case. + +.. code-block:: python + + config_key = (m, n, k) + if config_key in input_kernel_dict: + compiled_gemm = input_kernel_dict[config_key] + else: + compiled_gemm = autotune_gemm(...) + input_kernel_dict[config_key] = compiled_gemm + # launch gemm kernel + compiled_gemm(a_tensor, b_tensor, c_tensor, stream) + +By following the methods above, you can customize your own auto-tuner to find the optimal GEMM kernel configuration +for specific matrix dimensions and data types, significantly improving computational performance for models. 
diff --git a/media/docs/pythonDSL/cute_dsl_general/debugging.rst b/media/docs/pythonDSL/cute_dsl_general/debugging.rst new file mode 100644 index 00000000..649aa608 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/debugging.rst @@ -0,0 +1,133 @@ +.. _debugging: + +Debugging +========= + +.. contents:: Table of Contents + :depth: 2 + :local: + +This page provides an overview of debugging techniques and tools for CuTe DSL programs. + + +Getting Familiar with the Limitations +------------------------------------- + +Before diving into comprehensive debugging capabilities, it's important to understand the limitations of CuTe DSL. +Understanding these limitations will help you avoid potential pitfalls from the start. + +Please refer to :doc:`../limitations` for more details. + + +DSL Debugging +------------- + +CuTe DSL provides built-in logging mechanisms to help you understand the code execution flow and +some of the internal state. + +Enabling Logging +~~~~~~~~~~~~~~~~ + +CuTe DSL provides environment variables to control logging level: + +.. 
code:: bash + + # Enable console logging (default: False) + export CUTE_DSL_LOG_TO_CONSOLE=1 + + # Log to file instead of console (default: False) + export CUTE_DSL_LOG_TO_FILE=my_log.txt + + # Control log verbosity (0, 10, 20, 30, 40, 50, default: 10) + export CUTE_DSL_LOG_LEVEL=20 + + +Log Categories and Levels +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Similar to standard Python logging, different log levels provide varying degrees of detail: + ++--------+-------------+ +| Level | Description | ++========+=============+ +| 0 | Disabled | ++--------+-------------+ +| 10 | Debug | ++--------+-------------+ +| 20 | Info | ++--------+-------------+ +| 30 | Warning | ++--------+-------------+ +| 40 | Error | ++--------+-------------+ +| 50 | Critical | ++--------+-------------+ + + +Dump the generated IR +~~~~~~~~~~~~~~~~~~~~~ + +For users familiar with MLIR and compilers, CuTe DSL supports dumping the Intermediate Representation (IR). +This helps you verify whether the IR is generated as expected. + +.. code:: bash + + # Dump Generated CuTe IR (default: False) + export CUTE_DSL_PRINT_IR=1 + + # Keep Generated CuTe IR in a file (default: False) + export CUTE_DSL_KEEP_IR=1 + + + +Kernel Functional Debugging +---------------------------- + +Using Python's ``print`` and CuTe's ``cute.printf`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CuTe DSL programs can use both Python's native ``print()`` as well as our own ``cute.printf()`` to +print debug information during kernel generation and execution. They differ in a few key ways: + +- Python's ``print()`` executes during compile-time only (no effect on the generated kernel) and is + typically used for printing static values (e.g. a fully static layouts). +- ``cute.printf()`` executes at runtime on the GPU itself and changes the PTX being generated. This + can be used for printing values of tensors at runtime for diagnostics, but comes at a performance + overhead similar to that of `printf()` in CUDA C. 
+ +For detailed examples of using these functions for debugging, please refer to the associated +notebook referenced in :doc:`notebooks`. + +Handling Unresponsive/Hung Kernels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When a kernel becomes unresponsive and ``SIGINT`` (``CTRL+C``) fails to terminate it, +you can follow these steps to forcefully terminate the process: + +1. Use ``CTRL+Z`` to suspend the unresponsive kernel +2. Execute the following command to terminate the suspended process: + +.. code:: bash + + # Terminate the most recently suspended process + kill -9 $(jobs -p | tail -1) + + +CuTe DSL can also be debugged using standard NVIDIA CUDA tools. + +Using Compute-Sanitizer +~~~~~~~~~~~~~~~~~~~~~~~ + +For detecting memory errors and race conditions: + +.. code:: bash + + compute-sanitizer --some_options python your_dsl_code.py + +Please refer to the `compute-sanitizer documentation `_ for more details. + +Conclusion +---------- + +This page covered several key methods for debugging CuTe DSL programs. Effective debugging typically requires a combination of these approaches. +If you encounter issues with DSL, you can enable logging and share the logs with the CUTLASS team as a GitHub issue to report a bug. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst new file mode 100644 index 00000000..b4b463d4 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_code_generation.rst @@ -0,0 +1,90 @@ +.. _dsl_code_generation: +.. |DC| replace:: dynamic compilation +.. |DSL| replace:: CuTe DSL +.. |IR| replace:: intermediate representation (IR) + +End-to-End Code Generation +========================== + +.. contents:: + :depth: 2 + :local: + + +1. Techniques for Turning Python into |IR| +------------------------------------------ + +1.1 AST rewrite +^^^^^^^^^^^^^^^^ +The function’s abstract-syntax tree is analysed **before** execution. 
+Python control-flow (``for``/``while``, ``if``/``else``) and built-ins are converted to structured |IR| +constructs. Computation inside each region is left untouched at this stage. + +*Advantages* + +* Sees the entire program, so every branch and loop is preserved. +* Keeps loop structure intact for optimization such as tiling, vectorisation + or GPU thread mapping. + +*Disadvantages* + +* Requires a well-defined Python subset that the rewriter understands. + + +1.2 Tracing +^^^^^^^^^^^ +The decorated function is executed once with *proxy* arguments; overloaded +operators record every tensor operation that actually runs and produce a flat +trace that is lowered to |IR|. + +*Advantages* + +* Near-zero compile latency, ideal for straight-line arithmetic. +* No need to parse Python source, so it supports many dynamic Python + features, and Python has many features. + +*Disadvantages* + +* Untaken branches vanish, so the generated kernel may be wrong for other + inputs. +* Loops are flattened to the iteration count observed during tracing. +* Data-dependent control-flow freezes to a single execution path. + + +2. |DSL| Code-Generation Modes +------------------------------ + +CuTe’s Python front-end combines the techniques above into **two mutually +exclusive modes**, selectable with the ``preprocessor`` flag of the +``@jit`` decorator: + +1. Tracing mode ``@jit(preprocess=False)`` – tracing only. +This results in the fastest compilation path and is recommended only for kernels that are guaranteed to be +straight-line arithmetic. It suffers from all tracing limitations listed in the previous section. + +2. Preprocessor mode (**default**) ``@jit(preprocess=True)`` – **AST rewrite + tracing**. +The AST pass captures every loop and branch, eliminating the correctness and +optimisation problems of pure tracing; tracing then fills in the arithmetic. 
+This hybrid “preprocessor” pipeline is unique to |DSL| and was designed +specifically to overcome the disadvantages identified above. + +.. figure:: dsl_modes.png + :width: 400 + :align: center + + *Left*: tracing mode records only the path that executed. + *Right*: preprocessor mode emits structured |IR| for every branch and loop + before tracing the arithmetic. + + +Why Tracing-Only Is Insufficient for Control-Flow +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* **Branch loss** – The untaken side of an ``if``/``else`` is never lowered. +* **Loop unrolling** – Loops are flattened to the iteration count observed, + destroying structure needed for parallel mapping and tiling. +* **Data-dependent paths** – Control-flow that depends on tensor values freezes + to a single execution path at trace time. + +The preprocessor mode fixes all of these by lowering control-flow first and delegating +only the arithmetic to the tracer. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst new file mode 100644 index 00000000..a16c79c3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_control_flow.rst @@ -0,0 +1,140 @@ +.. _dsl_control_flow: +.. |DC| replace:: dynamic compilation +.. |IR| replace:: intermediate representation (IR) +.. |DSL| replace:: CuTe DSL +.. |Constexpr| replace:: **Constexpr** (compile-time Python value) + +|DSL| Control Flow +================== +.. contents:: + :depth: 2 + :local: + + +Overview +-------- +|DSL| walks Python’s AST and converts each control-flow construct it finds into +structured |IR|. You can therefore write ordinary Python loops and branches +while the compiler decides—statement by statement—whether to + +* **evaluate at compile time** if the controlling value is a |Constexpr|, or +* **emit intermediate representation (IR)** when the value is dynamic. 
+ + +For a high-level discussion of the overall pipeline, see +:doc:`the code-generation overview `. + +For Loops +--------- +|DSL| recognises three kinds of ranges for ``for`` loops: + +* ``range`` – the Python built-in +* ``cutlass.range_dynamic`` – always lowers to |IR| +* ``cutlass.range_constexpr`` – always unrolls at compile time + + +range(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The AST rewriter inserts a small helper stub. At runtime the loop bounds are +inspected: + +* **Constant bounds** → the loop is unrolled at compile time. +* **Dynamic bounds** → the loop is emitted as structured |IR|. + + +cutlass.range_dynamic(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use when you *always* want a loop in the generated |IR|, even if the bounds +look constant. + + +cutlass.range_constexpr(...) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Runs in the Python interpreter and is fully unrolled before code generation. +All loop indices must be |Constexpr|. + + +Limitations of Dynamic For Loops +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Early-exit ``break``, ``continue``, or raising exception are not yet supported. +* Operations in the loop body are traced only when tracing is active in that + region. + + +**Example:** + +.. code-block:: python + + @cute.jit + def loop_example(): + n = 10 + + # ❌ This loop is dynamic, early-exit isn't allowed. + for i in cutlass.range_dynamic(n): + if i == 5: + break # Early-exit + cute.printf("%d\\n", i) + + # ✅ This loop is constexpr, early-exit is allowed. + for i in cutlass.range_constexpr(n): + if i == 5: + break # Early-exit + cute.printf("%d\\n", i) + +If-Else Statements +------------------ + +Standard Python ``if``/``else`` is supported. + +* **Predicate is Constexpr (compile-time Python value)** → evaluated at compile time. +* **Predicate is dynamic** → lowered to |IR|. + +**Example:** + +.. 
code-block:: python + + @cute.jit + def main(const_var: cutlass.Constexpr, dynamic_var: cutlass.Int32): + if const_var: # compile-time branch + cute.printf("Const branch\\n") + else: + cute.printf("Const else\\n") + + if dynamic_var == 10: # dynamic branch + cute.printf("Dynamic True\\n") + else: + cute.printf("Dynamic False\\n") + +Similarly to for-loops, the ``if cutlass.const_expr`` and ``if cutlass.dynamic_expr`` constructs can +be used to force the evaluation at compile-time or the generation of IR, respectively. Unstructured +control flow is only supported when using ``if cutlass.const_expr``. + +While Loops +----------- + +Python ``while`` loops are always treated as **dynamic** because the loop condition may become +dynamic after the first iteration. Similarly to for-loops and ``if``/``else``, the +``while cutlass.const_expr`` and ``while cutlass.dynamic_expr`` constructs are available. + +Compile-Time Metaprogramming +---------------------------- + +Mix compile-time constructs with normal |DSL| code to generate specialised +kernels without runtime overhead. A compile-time flag can, for example, toggle +an optional **ReLU** epilogue: + +.. code-block:: python + + @cute.kernel + def gemm(..., do_relu: cutlass.Constexpr): + # main GEMM work + ... + if const_expr(do_relu): # compile-time guard + # ReLU code is emitted only when do_relu is True + ... + +.. code-block:: text + + gemm(..., False) # ReLU is omitted from the generated |IR| + gemm(..., True) # ReLU is included diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst new file mode 100644 index 00000000..9c5cca7d --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_dynamic_layout.rst @@ -0,0 +1,198 @@ +.. _dsl_dynamic_layout: +.. |DSL| replace:: CuTe DSL +.. |SLAY| replace:: static layout +.. |DLAY| replace:: dynamic layout + +.. 
contents:: Table of Contents + :depth: 2 + :local: + +Static vs Dynamic layouts +========================= + +Static Layout +------------- + +When integrating with popular deep learning frameworks, one question is how to deal with the layout of the converted ``cute.Tensor``. +For example, when converting a ``torch.Tensor`` to a ``cute.Tensor``, the shape of the ``torch.Tensor`` is honored for the layout of +``cute.Tensor``. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor): + print(f"tensor.layout: {tensor.layout}") # Prints tensor layout at compile time + cute.printf("tensor: {}", tensor) # Prints tensor values at runtime + +In this example, we define a JIT function ``foo`` that takes a ``cute.Tensor`` as input and prints its layout. Note +that Python print is used to print the layout at compile time. This works fine for |SLAY| whose value is known at +compile time. + +Now let's try to run the JIT function ``foo`` with different shapes of the input ``torch.Tensor``. + +.. code-block:: python + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + a_pack = from_dlpack(a) + compiled_func = cute.compile(foo, a_pack) + compiled_func(a_pack) + +Here we first convert a 1D ``torch.Tensor`` with 3 elements to a ``cute.Tensor`` using ``from_dlpack``. Then we compile +the JIT function ``foo`` with the converted ``cute.Tensor`` and call the compiled function. + +:: + + tensor.layout: (3):(1) + tensor: raw_ptr(0x00000000079e5100: i16, generic, align<2>) o (3):(1) = + ( 1, 2, 3 ) + +It prints ``(3):(1)`` for the layout because the converted ``cute.Tensor`` has a |SLAY| with shape ``(3)`` which +is the shape of the ``a``. 
+ +Now if we call the compiled function with a different shape of the input ``torch.Tensor``, it would result in an unexpected +result at runtime due to the mismatch of the type since ``compiled_func`` expects a ``cute.Tensor`` with layout ``(3):(1)`` +while ``b`` has shape ``(5)``. + +.. code-block:: python + + b = torch.tensor([11, 12, 13, 14, 15], dtype=torch.uint16) + b_pack = from_dlpack(b) + compiled_func(b_pack) # ❌ This results in an unexpected result at runtime due to type mismatch + +Following is the output which is unexpected due to the type mismatch. + +:: + + tensor: raw_ptr(0x00000000344804c0: i16, generic, align<2>) o (3):(1) = + ( 11, 12, 13 ) + +To fix that, we would have to trigger another code generation and compilation for the new shape for ``b``. + +.. code-block:: python + + compiled_func_2 = cute.compile(foo, b_pack) # This would trigger another compilation + compiled_func_2(b_pack) # ✅ Now this works fine + +As shown in the example above, with the newly compiled ``compiled_func_2``, we can pass in ``b_pack`` to the compiled +JIT function ``compiled_func_2``. + +:: + + tensor.layout: (5):(1) + tensor: raw_ptr(0x0000000034bb2840: i16, generic, align<2>) o (5):(1) = + ( 11, 12, 13, 14, 15 ) + +Now it recompiles and prints the values of ``b`` correctly. + +It's obvious that we need distinct code generated and compiled for different static layouts. In this case, one for layout +``(3):(1)`` and the other for layout ``(5):(1)``. + +Dynamic Layout +-------------- + +In order to avoid generating and compiling multiple times for different shapes of the input ``torch.Tensor``, |DSL| provides a way to +generate and compile a JIT function with a |DLAY|. + +To get a dynamic layout for the ``cute.Tensor``, a ``torch.Tensor`` object can be passed into the JIT function directly, which instructs +|DSL| to call ``cute.mark_layout_dynamic`` automatically on the converted ``cute.Tensor`` per the leading dimension of the layout. + +.. 
code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor): + print(tensor.layout) # Prints (?,?):(?,1) for dynamic layout + + a = torch.tensor([[1, 2], [3, 4]], dtype=torch.uint16) + compiled_func = cute.compile(foo, a) + compiled_func(a) + + b = torch.tensor([[11, 12], [13, 14], [15, 16]], dtype=torch.uint16) + compiled_func(b) # Reuse the same compiled function for different shape + +In the example above, a single compilation of the JIT function ``foo`` is reused for different shapes of the input ``torch.Tensor``. +This is possible because the converted ``cute.Tensor`` has a |DLAY| ``(?,?):(?,1)`` which is compatible with the shape of the +input ``torch.Tensor`` of both calls. + +Alternatively, for compact layout, ``cute.mark_compact_shape_dynamic`` can be called for a finer-grained control to specify the mode +of the layout for dynamic and the divisibility constraint for the dynamic dimension. + +Refer to :doc:`framework_integration` for more details on ``from_dlpack``, ``mark_layout_dynamic``, +and ``mark_compact_shape_dynamic``. + +Static Layout vs. Dynamic Layout +-------------------------------- + +Per the previous sections, we have seen that |SLAY| leads to distinct JIT code generations while |DLAY| leads to a single +compilation for different shapes. + +That said, creating JIT function with |SLAY| is useful when the use cases targeting input data with fixed shapes. +Since more information is available at compile time, the compiler would be able to kick in optimizations that otherwise would not +be possible for the code generated for |DLAY|. + +On the other hand, |DLAY| would be more flexible for the cases where the input data has varying shapes. This provides more +scalability of the generated code to deal with varying input data of different shapes. 
+ +Programming with Static and Dynamic Layout +------------------------------------------ + +|DSL| provides intuitive way to program with static and |DLAY| in the codes. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def foo(tensor, x: cutlass.Constexpr[int]): + print(cute.size(tensor)) # Prints 3 for the 1st call + # Prints ? for the 2nd call + if cute.size(tensor) > x: + cute.printf("tensor[2]: {}", tensor[2]) + else: + cute.printf("tensor size <= {}", x) + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + foo(from_dlpack(a), 3) # First call with static layout + + b = torch.tensor([1, 2, 3, 4, 5], dtype=torch.uint16) + foo(b, 3) # Second call with dynamic layout + +In this example, the JIT function ``foo`` is compiled with a |SLAY| ``(3):(1)`` for the first call, which means the +size of the tensor is known at compile time. |DSL| makes good use of this and automatically handles the if condition at the +compile time. Hence the generated codes are efficient without the if condition at all. + +For the second call, the JIT function ``foo`` is compiled with a |DLAY| ``(?):(1)`` hence the tensor size is only +evaluated at runtime. |DSL| automatically generates the code to handle the |DLAY| and the if condition at runtime. + +The same applies to loop as well: + +.. code-block:: python + + @cute.jit + def foo(tensor, x: cutlass.Constexpr[int]): + for i in range(cute.size(tensor)): + cute.printf("tensor[{}]: {}", i, tensor[i]) + + a = torch.tensor([1, 2, 3], dtype=torch.uint16) + foo(from_dlpack(a), 3) # First call with static layout + + b = torch.tensor([1, 2, 3, 4, 5], dtype=torch.uint16) + foo(b, 3) # Second call with dynamic layout + +With the static layout in the first call, |DSL| is able to fully unroll the loop at compile time. While in the second call, +the generated codes will have the loop executed at runtime based on the |DLAY|. 
+ +With the single JIT function implementation, |DSL| is able to handle control-flow constructs and automatically generate +the optimized codes for different cases. This is all possible because |DSL| is able to walk the Python AST and convert +each control-flow construct it finds accordingly. + +Please refer to :doc:`dsl_control_flow` for more details. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst new file mode 100644 index 00000000..ca409771 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_introduction.rst @@ -0,0 +1,128 @@ +.. _dsl_introduction: +.. |DC| replace:: dynamic compilation +.. |IR| replace:: IR +.. |DSL| replace:: CuTe DSL + + +|DSL| +====================== + +.. contents:: Table of Contents + :depth: 2 + :local: + +Overview +-------- + +|DSL| is a Python-based domain-specific language (DSL) designed for |DC| of numeric and GPU-oriented code. Its primary goals are: + +- **Consistent with CuTe C++**, allowing users to express GPU kernels with full control of the hardware. +- **JIT compilation** for both host and GPU execution. +- `DLPack `_ **integration**, enabling seamless interop with frameworks (e.g., PyTorch, JAX). +- **JIT caching**, so that repeated calls to the same function benefit from cached |IR| modules. +- **Native types and type inference** to reduce boilerplate and improve performance. +- **Optional lower-level control**, offering direct access to GPU backends or specialized |IR| dialects. + +Decorators +---------- + +|DSL| provides two main Python decorators for generating optimized code via |DC|: + +1. ``@jit`` — Host-side JIT-compiled functions +2. ``@kernel`` — GPU kernel functions + +Both decorators can optionally use a **preprocessor** that automatically expands Python control flow (loops, conditionals) into operations consumable by the underlying |IR|. 
+ +``@jit`` +~~~~~~~~~~~~~ + +Declares JIT-compiled functions that can be invoked from Python or from other |DSL| functions. + +**Decorator Parameters**: + +* ``preprocessor``: + + * ``True`` (default) — Automatically translate Python flow control (e.g., loops, if-statements) into |IR| operations. + * ``False`` — No automatic expansion; Python flow control must be handled manually or avoided. + +**Call-site Parameters**: + +- ``no_cache``: + + - ``True`` — Disables JIT caching, forcing a fresh compilation each call. + - ``False`` (default) — Enables caching for faster subsequent calls. + +``@kernel`` +~~~~~~~~~~~~~~~~ + +Defines GPU kernel functions, compiled as specialized GPU symbols through |DC|. + +**Decorator Parameters**: + +- ``preprocessor``: + + - ``True`` (default) — Automatically expands Python loops/ifs into GPU-compatible |IR| operations. + - ``False`` — Expects manual or simplified kernel implementations. + +**Kernel Launch Parameters**: + +- ``grid`` + Specifies the grid size as a list of integers. +- ``block`` + Specifies the block size as a list of integers. +- ``cluster`` + Specifies the cluster size as a list of integers. +- ``smem`` + Specifies the size of shared memory in bytes (integer). + +Calling Conventions +------------------- + +.. 
list-table:: + :header-rows: 1 + :widths: 20 20 15 25 + + * - **Caller** + - **Callee** + - **Allowed** + - **Compilation/Runtime** + + * - Python function + - ``@jit`` + - ✅ + - DSL runtime + + * - Python function + - ``@kernel`` + - ❌ + - N/A (error raised) + + * - ``@jit`` + - ``@jit`` + - ✅ + - Compile-time call, inlined + + * - ``@jit`` + - Python function + - ✅ + - Compile-time call, inlined + + * - ``@jit`` + - ``@kernel`` + - ✅ + - Dynamic call via GPU driver or runtime + + * - ``@kernel`` + - ``@jit`` + - ✅ + - Compile-time call, inlined + + * - ``@kernel`` + - Python function + - ✅ + - Compile-time call, inlined + + * - ``@kernel`` + - ``@kernel`` + - ❌ + - N/A (error raised) diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst new file mode 100644 index 00000000..a7c46003 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_arg_generation.rst @@ -0,0 +1,196 @@ +.. _dsl_jit_arg_generation: +.. |DSL| replace:: CuTe DSL +.. |CUSTOM_TYPES| replace:: customized types + +|DSL| JIT Function Argument Generation +======================================= + +.. contents:: Table of Contents + :depth: 2 + :local: + +In a nutshell +-------------- +When using the ``@jit`` or ``@kernel`` decorators to define a JIT-compiled function, the arguments to the function are traced to determine the JIT function's signature. +|DSL| provides a Pythonic way to write the arguments for JIT function as one normally would in Python, and the |DSL| will take care of the rest for you. + +Specifically, |DSL| honors following when generating the JIT function's arguments: + +- JIT function arguments are assumed to be **dynamic arguments** by default. +- If an argument is explicitly type annotated with ``cutlass.Constexpr``, it is treated as a **compile-time constant**. +- If type annotation is provided, |DSL| validates the argument type at compile time for **type safety**. 
+- |DSL| provides **runtime checkable protocols** (``JitArgument`` and ``DynamicExpression``) for generating JIT function arguments for |CUSTOM_TYPES|.
+
+More details below for each of the above.
+
+Static argument vs. Dynamic argument
+------------------------------------
+
+|DSL| supports both static and dynamic arguments for JIT functions.
+
+1. **Static arguments** hold values that are known at compile time. They are not included in the generated JIT function signature.
+2. **Dynamic arguments** hold values that are only known at runtime.
+
+By default, |DSL| assumes dynamic arguments and tries to infer the argument types from the call-site argument types. An explicit type annotation ``cutlass.Constexpr`` can be used to specify a static argument.
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+
+    @cute.jit
+    def foo(x: cutlass.Int32, y: cute.Constexpr):
+        print("x = ", x)  # Prints x = ?
+        print("y = ", y)  # Prints y = 2
+        cute.printf("x: {}", x)  # Prints x: 2
+        cute.printf("y: {}", y)  # Prints y: 2
+
+    foo(2, 2)
+
+In the example above, ``x`` is a dynamic argument with type ``cutlass.Int32`` and ``y`` is a static argument.
+
+With the ``cutlass.Constexpr`` annotation, a more sophisticated use case of a static argument in JIT functions can be something like:
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+
+    @cute.kernel
+    def kernel(
+        self,
+        tiled_mma: cute.TiledMma,
+        tma_atom_a: cute.CopyAtom,
+        mA_mkl: cute.Tensor,
+        tma_atom_b: cute.CopyAtom,
+        mB_nkl: cute.Tensor,
+        tma_atom_c: Optional[cute.CopyAtom],
+        mC_mnl: cute.Tensor,
+        cluster_layout_vmnk: cute.Layout,
+        a_smem_layout_staged: cute.ComposedLayout,
+        b_smem_layout_staged: cute.ComposedLayout,
+        c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
+        epi_tile: cute.Tile,
+        epilogue_op: cutlass.Constexpr,
+    ):
+        ...
+
+        # Perform epilogue op on accumulator and convert to C type
+        acc_vec = tTR_rAcc.load()
+        acc_vec = epilogue_op(acc_vec.to(self.c_dtype))
+        tTR_rC.store(acc_vec)
+
+In this example, ``epilogue_op`` is a static argument in the JIT kernel where the argument is used for the epilogue fusion. Upon calling the kernel,
+an elementwise lambda function can be passed in as the ``epilogue_op`` argument. For example, a ReLU can be applied for epilogue fusion by simply setting the
+``epilogue_op`` to ``lambda x: cute.where(x > 0, x, cute.full_like(x, 0))``
+
+Refer to the `Blackwell dense GEMM example `__ for a complete example.
+
+Type safety
+-----------
+
+|DSL| makes good use of type annotations in the JIT function signature and validates the JIT function argument types at compile time for **type safety**.
+
+.. code-block:: python
+
+    import cutlass
+    import cutlass.cute as cute
+    import numpy as np
+
+    @cute.jit
+    def foo(x: cute.Tensor, y: cutlass.Float16):
+        ...
+
+    a = np.random.randn(10, 10).astype(np.float16)
+    b = 32
+
+    foo(a, b)
+    foo(b, a)  # This will fail at compile time due to type mismatch
+
+The type safety check helps catch type mismatch issues early at compile time with a clear error message, avoiding tricky runtime errors which are usually more expensive to debug.
+In the example above, the second call to ``foo`` will fail at compile time due to the type mismatch with a clear error message:
+
+::
+
+    cutlass.base_dsl.common.DSLRuntimeError: DSLRuntimeError: expects argument #1 (a) to be , but got
+
+JIT function arguments with |CUSTOM_TYPES|
+--------------------------------------------
+|DSL| supports |CUSTOM_TYPES| for JIT function arguments by providing two runtime checkable protocols:
+
+* ``JitArgument`` which is used for host JIT functions to be called from Python.
+  - ``__c_pointers__``: Generate a list of ctypes pointers for the current object.
+  - ``__get_mlir_types__``: Generate a list of MLIR types for the current object.
+ - ``__new_from_mlir_values__``: Create a new object from MLIR values. + +* ``DynamicExpression`` which is used for device JIT functions to be called from the host JIT functions. + - ``__extract_mlir_values__``: Generate a dynamic expression for the current object. + - ``__new_from_mlir_values__``: Create a new object from MLIR values. + +Refer to `typing.py `__ for more details on these protocol APIs. + +Depending on different cases of the |CUSTOM_TYPES|, |DSL| provides easy ways to adopt |CUSTOM_TYPES| for JIT function arguments. + +1. Direct protocol implementation in |CUSTOM_TYPES| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +One way is to implement the protocol methods directly in the |CUSTOM_TYPES| to enable the protocol based JIT function argument generation. + +.. code-block:: python + + import cutlass + import cutlass.cute as cute + + # Customized type that implements the DynamicExpression protocol + class MyDynamicExpression: + def __init__(self, tensor, offset): + self._tensor = tensor # Dynamic argument + self._offset = offset # Dynamic argument + + def __extract_mlir_values__(self): + return [self._tensor.__extract_mlir_values__(), self._offset.__extract_mlir_values__()] + + def __new_from_mlir_values__(self, values): + return MyDynamicExpression(values[0], values[1]) + + @cute.kernel + def my_kernel(x: MyDynamicExpression): + ... + +In the example above, the ``MyDynamicExpression`` implements the ``DynamicExpression`` protocol and |DSL| will generate the JIT function arguments for the JIT kernel ``my_kernel`` based on the protocol methods. + +2. Adaptor based protocol implementation for |CUSTOM_TYPES| +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For the case where directly changing the |CUSTOM_TYPES| to implement the protocol is not feasible, |DSL| provides adaptor based approach to adapt the |CUSTOM_TYPES| for JIT function argument generation. 
+ +The JIT function argument adaptor is a callable object that implements the desired protocol methods for the registered |CUSTOM_TYPES|. This way, |DSL| automatically queries the JIT argument adaptor registry +to generate the JIT function arguments for the given |CUSTOM_TYPES|. + +.. code-block:: python + + @cutlass.register_jit_arg_adapter(MyFrameworkObject) + class MyFrameworkObjectAdapter: + """ + Convert a 3rd party framework object to a JIT function argument with JitArgument protocol + """ + + def __init__(self, arg): + self._arg = arg + + def __c_pointers__(self): + # Convert the framework object to a C-ABI compatible object + # thru its C-ABI interface + return [self._arg.get_cabi_pointer()] + + def __get_mlir_types__(self): + # Return the list of MLIR types the framework object represents + return [self._arg.get_data().mlir_type] + + def __new_from_mlir_values__(self, values): + # Convert the MLIR values back to the framework object + return MyFrameworkObject(values[0]) + +In this example, the ``MyFrameworkObjectAdapter`` implements an adaptor class which bridges the |DSL| and the 3rd party framework type ``MyFrameworkObject``. +The registration is done by just decorating the adaptor with ``cutlass.register_jit_arg_adapter`` for the customized type. With the registered adaptor, +|DSL| will automatically use the adaptor to generate the JIT function arguments for ``MyFrameworkObject`` typed arguments. diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst new file mode 100644 index 00000000..30d07377 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/dsl_jit_caching.rst @@ -0,0 +1,152 @@ +.. _dsl_jit_caching: +.. |DSL| replace:: CuTe DSL + +.. _JIT_Caching: + +|DSL| JIT Caching +==================== +.. 
contents:: Table of Contents + :depth: 2 + :local: + +Zero Compile and JIT Executor +----------------------------- + +Zero Compile is a feature that enables explicit kernel compilation on demand through ``cute.compile``. +When ``cute.compile`` is called, it compiles the kernel and returns a JIT Executor instance. +This JIT Executor instance can be cached and reused directly for subsequent executions without compiling the kernel again. + +The JIT Executor is a component that independently executes compiled code. +It can be created either through ``cute.compile`` or implicit compilation. +The JIT Executor instance behaves like a callable object to execute the compiled code. +Each JIT Executor instance maintains a single compiled host function. + +It encompasses all necessary execution components: + +* Host function pointer and its MLIR execution engine +* CUDA modules (optional) +* Argument specifications defining how Python arguments are converted to C ABI-compatible types. Note that arguments with the ``cutlass.Constexpr`` hint are excluded from argument specifications since they are evaluated at compile time rather than runtime. + +For example, in the following code, ``print_result`` is a ``cutlass.Constexpr`` value that is **NOT** evaluated at runtime: + +.. code-block:: python + + import cutlass.cute as cute + + @cute.jit + def add(a, b, print_result: cutlass.Constexpr): + if print_result: + cute.printf("Result: %d\n", a + b) + return a + b + + jit_executor = cute.compile(add, 1, 2, True) + + jit_executor(1, 2) # output: ``Result: 3`` + +The JIT Executor ensures all components are properly initialized and loaded after compilation. + +For example, all CUDA modules are loaded (via ``cuModuleLoad``) and kernel function pointers are extracted (via ``cuModuleGetFunction``). 
+ +When calling a JIT Executor instance, it: + +* Parses Python runtime arguments and converts them to C ABI-compatible types according to argument specifications +* Invokes the host function with the converted arguments + +Custom Caching with ``cute.compile`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``cute.compile`` bypasses caching in |DSL| and always performs compilation, returning a fixed JIT Executor instance. +This allows implementing custom caching strategies as shown below: + +.. code-block:: python + + @cute.jit + def add(b): + return a + b + + # Define a custom cache + custom_cache = {} + + a = 1 + compiled_add_1 = cute.compile(add, 2) + custom_cache[1] = compiled_add_1 + compiled_add_1(2) # result = 3 + + a = 2 + compiled_add_2 = cute.compile(add, 2) + custom_cache[2] = compiled_add_2 + compiled_add_2(2) # result = 4 + + # Use the custom cache + custom_cache[1](2) # result = 3 + custom_cache[2](2) # result = 4 + + +Cache in |DSL| +----------------- + +By default, cache in |DSL| is implicitly enabled to avoid recompilation when kernels are called repeatedly without changes. + +The cache is implemented as a map storing compiled JIT Executor instances within |DSL|. + +The cache key combines hashes of: + +* MLIR bytecode of the MLIR program generated by |DSL| +* All |DSL| Python source files +* All |DSL| shared libraries +* All |DSL| environment variables + +The cache value is a compiled JIT Executor instance. + +On a cache hit, compilation is skipped and the cached JIT Executor instance is reused. + +On a cache miss, the kernel is compiled and the new JIT Executor instance is stored in the cache. + +Here is an example demonstrating automatic caching of the ``add`` kernel: + +.. 
code-block:: python
+
+    # Global variable
+    a = 1
+
+    @cute.jit
+    def add(b):
+        return a + b
+
+    # Cache is empty at the beginning
+
+    # First call: cache miss triggers compilation
+    result = add(2)  # result = 3
+    # Cache now has one instance
+
+    # Second call: cache hit reuses cached JIT Executor
+    result = add(2)  # result = 3
+
+    a = 2
+    # Third call: cache miss due to changed IR code triggers recompilation
+    result = add(2)  # result = 4
+    # Cache now has two instances
+
+The cache can be serialized to files for subsequent runs.
+After serialization, compiled MLIR bytecode is stored in files.
+The cache directory is ``/tmp/{current_user}/cutlass_python_cache``.
+The cache loads from files into memory during |DSL| initialization and saves back to files when the process exits.
+
+The following environment variables control file caching:
+
+.. code-block:: bash
+
+    # Disable file caching while keeping in-memory cache available, defaults to False.
+    export CUTE_DSL_DISABLE_FILE_CACHING=True
+
+    # Maximum number of cache files allowed, defaults to 1000.
+    export CUTE_DSL_FILE_CACHING_CAPACITY=1000
+
+Limitations
+~~~~~~~~~~~~~~~~~~~~~
+
+The intention of caching is to reduce the host launch overhead before each execution. As the above example shows,
+the consistency between the original Python code and the MLIR program is hard to maintain because of the impact of dynamic factors such as global variables.
+Therefore, the MLIR program **MUST** always be generated to verify that the kernel content matches what was previously built.
+
+For optimal host launch latency, we recommend using the above custom caching method with ``cute.compile``.
diff --git a/media/docs/pythonDSL/cute_dsl_general/dsl_modes.png b/media/docs/pythonDSL/cute_dsl_general/dsl_modes.png new file mode 100644 index 0000000000000000000000000000000000000000..1f3f0bd3f40910eacc52e22d8651309e84cb3472 GIT binary patch literal 1134058 zcmeFZXIPV4*EOo5qM~A?DOF)x009;0B^H_>U3v+M0#b$0OCq`z1c8mxTaaF)ODBL* zrPokHq=puH=;f@S`~A*&H2XQ%e*b=d#0wU=*P3h2F~=Np-n>;&l09{T@x*}x2Tt9W zlTrg;rVku|oH}+Cd?#b_>T&SpAY4uM?t#n}=BWb*E*!Wob?2e8;VjlIEJ4HL$^tHR zZa9;}EZ?79P|B=b0rC9c;?9w9rEv~li7mL<)7VK@^-ZuiP^#rYH zX|Pg{<81lu1)MLbR;As)b$q0)yGoknFiUJ{i}l4Y+ptX6^mNxGK9nmfYmC=sG)@E$ zADRe2Y;Bcux*t1p;r5FI2M-uUg?;dZ5fqsuf#a(wrrdd!FRHcIAx+& z``=o-rCj;joqq4gp23$cq!i}1RGE{53{L^x?5B+(Wq5jZ`B3DH9O(H8(uoKQUf6w$eER>A0j~01NYNUUzeF~? zW3PD9C<#AuYuWgWjs%EaZGN5X+I^5aUGyS$=pghE)$eTtei`EnR=A2h+xp|5%xQ!^ zer!6Ukfsnu?g-ysfE6s)U1?~@K8kb?Mmk3h6GMWv;N(8%MJ4hCz zFcZc7bmY#TeF&`3{QgnJWNe$ z$hl7?7adrUNNL(fZp9CWngkodpC$ja*()H0>>RdYBx}L_&zUtPEjwu~N^Y6a4zP0u zF0YXMRggXiQvm#{57|TgccxppNeVTjsdXK>K&re6jLyT6i6OV($J4bxg3WNR7+v!vw;Szfup)hV z@KXf26+fI1O`_Lvrq{?qjS=6$I})LX$^Lxb?Jy|dgqvb;vCHH_EkqSK|9RVdvJ3W| zpz;I>X+BaK>ySHtXKu`=W zeBcJT^Jl*Uc5d`e7a+Ibhe0$`NJt}bu92Obfi#oBifM77^nbR{Z9eck^yGO3a^Zfa zR{>Ogm+p}LIqBX#iB~}A?Y0d1le3TKQ<9x!^fSmU_;KFsyQEw-`9+(a-1)ObNh|Qh zI{(E&t`JaaO1Em03)vTyhT!D9$I1SD|K2}ALGB zxPXD~QrG5^a}dvNPOw6>wK?sdEz|~BiXoqCE%~Bl^(-h*@BcIU810Z)Oa*_TF}d>_ zod=v=RdNRvDNCnd0=+r?V*zgWnSR28@LlLz_iP>>WAUj8%w z%qRm~*oiJ?Axpj*ukBOaEKW7~P<)i~z*ebR>gW9w=^h@MC_&Eee#Y6g52Wk$kV=j6mH}&WUhKI8SY098UyA<)QRw~v%U+(YQX?BM z`4_s|ev74{MxZW7e5asI8qj>U%OaJyry&y5hP?6`>ZKQHR@|C&li`%V-?iuqZ zCk7SnJ&9jw6%l9+k7VAFYNatFZ*^@vh_;#SL)jkW_8+?kR_q9M2OjVoI^58`36-u! z_GyJOFHRQgj#HtVjc$?ii`zyXBqBP$TXJM$hGUY5E}(S3P@bi zmr%z1hN@6;PtV_)+Zrp{m@j4AHu<^^+^)ry-vf??|N!qDUQwt6@SpP|xf8gl< ztcH5f117+q;s_M*KU6X?6)cd*0m1uI`F`I53s-Ihej#fTPlsy&Vf?9a7_wkC4p@dB ztCqp>GukgJ&udv-W!rra@DxAHoA+FhAEUS`_>Dy&5oPj1Jy0#ETc+UOLzZzt>&J1`_+_8hMlgb=q0inKX~ZBh1w=KS&yj-;n*4N+{(7~rmW;Bs7mY? 
z%-jn05>o!>tq&X()|r~&hOX2cWFC9KlY}P4|BxX5k8fPl ziPg{8GmsK5xc$n4DPMl3*;0~Q%BZs#C0^Dgz9Pg?Cj!YSZf)Fr2sirP<7oJNn&>f? z&O#ChD=OX%2YB;`;L{GD|5_nS(EV$!Z1mR4h(jly2!3Oyq% zK+TNE_ySe35r_n~BJe$1o>eefpXEke4vm2Md*54KFAuSnn9a`%6(6DO!UR@UuYOs| z^c3@%6_#~}XKHRtvFDqIC>y2dvMEABWvX);g&GZ`;Rs!0^Og85D@htMaqB5SV#G4k zm;!Qn*OLMi6;9u0o0)TZnYz@fGff&=V@LqZr&MLC@1VWBz_zX7Bg~F3x*Xim-Rzwu zHAOk#WVnE{AtV;&?9iQKMM+-5+ds|8oI)|>2cmd1)6@12i3Hb~44vgP?A{xS7E6+= z5Z7bcaK_8ik#)?+;3hLqk+a&1+bEKlb281~k!R^p(poFw=vdeG>*8;n1q{|DT91lF zm%n&#&w9M8C_lfEBW4k=#g|`b3pSUDL0*w9D>v#%XUcE5cLKvT6JL3Q4~wY&{)GYw zfdtGL4_NCFloDT$n(_@g8W&P@lV!9}65+o@KCk{A>?f##(mPFM6H-r8INAscpMq_b za^QFd*E7EkPd6FE$0{uNttFGUxzXkMSpyEKxWe1Tx)tK=GE|%HiK^+_`d@UPdYlz@ z{bV@w*9a8|24x;Mvky>Ail1b@*HLm^6i){paL<_mxCydBIu3Y&-f0a9-(x4WX4qOG z%Cp*$Bd`3vHNeF%1Pzr;CFCu3iNwyL{S=ub>ixAQ>AJh;<`K7Ga@RzG@U)O-7@wb;0~O;Qw8pDS3>4m(z_Mb88U zA0qGXto{YnSK6PjcFUZDZ`x6DnymRiYg&ucn$~>r(}6Bhm==V*@Az70!Z*0u%59_* z@;km#d6rb*9W&KdJ`~xqGd7&~`ku{H+_!zEP?{28m%m%(Eu2sv=6PT4^-ro((R0se7(L!Ha_OmD^W7E4N*MSE!wWQB zOockB1G-UB>h6EAAEkOXfbA`Mo)y-MQLN9qtw%1884*aNVTZrq9ynW3Wr$V_vrC=T zM!&c4O=#((dKy#c+fQ0Hqn}TIL-e z?z#M7|A}U4b3Fl(rR53pc($v$S(z1w>ik;8KZh@e`Z_g--ZGEp<2B66W$?5k`p8P% zs?h!{>(kr)t|Jwmp8?|$HEAtTG_^u5R905fi99w=UKAO>(xm8*h*OWACwp5cQ24>6 zL9l*5Ky^~fl(#;xwXf%5My!(Z$?qga1yXz*inu62`h^%2M@3tB1wkt z>dmA9BULGcY`mbWW8jeaCf{}aXoSB1%D0i|&PafIJ#<-GeZIE~zx1nF$Jk-Jo}ma& zTSjbjnzAgqWN?Fx`s9QJPa;W8WmU$;0B&>RP+gh*7UUHrwYw1eQ3k!PFH8)l(I$@v z=IHXwsD7t*#{yZzP?r5jBS_qaP$*uwuo0srXdav)T^m$4)9S~(mnTOs5a{&N9skTb z9lWY)Wad86qux@3GYeZ%VEhn2KR0^zvpL7}!SyDJ2$4O7iwQw3&mQeN~m*pB?Et+L{c3YqcmU$j$wPK3g2Ye+$`RScr>->*I$pzEUM8v z^Sd1G3##E_;nROsKF>$!_&eV%gtHYdro5^Uv|S=Z*9NB(3#F>l#kvWLJBr{4lRu77 zB)ZeUJY^-VC#de)(ef>@(Fe01>KRBVyZ>a{R6ihYcjxnVQmSl6`KhfYw#N&0)ZBA! 
z=(s$rnCS8%Llr(TkQ4>Gj@A1wDt%}{;(*0)@LCrFIBWqGLla%`LEqo0fJP`%0KD4| z03?TWR2N~8vjZ8K;pbk`S!Cz184sD_jGkUyM>qkF_1V`o(S!CJV^#&n_*Hl0#~?~p zM{HLrfkNLyAEgNypKfB49b+gcl$(FnC1cp7oMo4fFqUHOPD=jE3i^fkyXON{(18Qk z-rB!k2H5`Xp?jm9^Y=y2u~nyf!)2@za?keIDE0cP_k833O^`I>K<2ZN2w(lkf+>hsO=qt7=e_1s9C_2)hp@Lk zJ(ykXvIl-+`(bI$c+O@OM-9}z57wKkkc|ilO(#O%dc(N;^HkG|ob+ka103!d6O)r^ z*>n=d=W6|nIYu~IwI@e|Zh-QKPu9}1r<2&GY)fvx=wn5FxrR#plEAouhmN4EXQ|I? zj!o&%{_4)2H2~`MS+?|I_0jHs#65&fmoQA=IAHVbH?*y5Fa)X1Y`w->C&AAyWo544ZZrX zEFg|~!-a6@&JU6>V1;~yvaZ?G0*Q6MtW>F^g;_q(WYtK~Y@enV9!)znUWI(_maKS7 z(Ent$l(}QtGFcgLffQAlM*$sVzTi1Ly)tT-piCcLZofs4g{^G`pi|{CA}BbXcZ8#a z*D0C&o#8FEb3@#6kkS!M)f-bjmul+MP18-k&fR2JfGb)pEZS)3o^p~+;1A&q+-Ft} zu+adf(j|Pvvtb2=DV~%<+Wg&joLVzT21o!;2T$_$Q1Q2R2KrQuypkht>ZX$?Xth2M zNY9klT4vax5cBpat~K8v&B?)PgX!U;xy3ASt!mkroVIzFmER>zQmslUe_Lh#_P5AT z5WjraA4W@FkuHfnR{h0gxZWbEz#|JBaEI^tNbl;_V93$(vk*P{XbF6>K#~l!WKYXf z<`<&uNda^^rwd&@YfEJb z0h}E?<14d$sew>#wR95)5^c@$le3x5_T>{W5B6GvCs%cEJ{1``?naDig#<*WYb0%n zgc||kB^mXm1Rk)>17Vm)?ed(1(8iKve4tCc52{Q zlyJ_@UjvS(wzIVo)a0Q~*fF`8k58bxgDx=R1#ir&vC}&8$UEr76r$;aV%@zuZ)fd5 zYZpoX=+Yv)dx8Ci7J|`R->N#le!L<(M#C|(t|rd}_wQp~4DhnlUr)BUTtJv}I6cb1 za4ro~5y-A&ZrU|3bSPNU@wv#R& z>oA)0`S~QCV`NN%emAva`zuvesq=upM*ewUf=;Zs;ojXb=O=+OQ<$pLPOoykyLK}~ z=ANP?(_Gr_fI=nyhzWfc_ltx-|1*1KYm(x<;eMvS$4RYZwond_+$wXuwJWUH^2-&O zs!SfsF#`9xr*LwCLlqC-2A}CFoexKqu6#=wLt!OETO$mIg+ z5af#eXpN2h+zo?KU_T!PR$Ey%tym4dL#F!qk0!JAT9r7_slLf_RMW)A*1zbzM!~>! 
zGjb{P#v57obF(l!$11_w0J8X=dC}CWFk-|Plu7ZcpQa{0!ndZ;uve}$1C(CYtnTs+ z8|MQ@5rHG?-bM3<6}~p;j0^~*C&}9)0<$)r=DdallFZAIz>BIoM_Z7#NoXvd9xGb-C zr?rw{;T-4;Y}DVWI-_xoOgI`Pl4$)3O>ViszP`Z*k-b)sO1oIwr#pU10c#(b+}{Gy zPk6eYVM*6eEm6zJ`4hri$yT~O)6b8C;znSE^h`I;hsvqY(?J1ju6G)25&pfs8vz`!PY&bVbj@|G}k!PDDWKLCyAz{9K__#r;mNzS;gtK{#d z(6+ngnRx9TALP#-cqUgm-`G8+ER|4Kn0`Od@VXw83yg319JwBo=1b5(kQY8;q0MmU z9=?k$C}Oc&DN-@<)=jrzx#0sn*d4)WxAC0B*RL@jRH>iwd&R0GQ}in1LhEL-e3)%` zM6tW;_uj-==!80C2wfNwU)Zj1cO98gRk*dV{M-47bW*cjAf|s>e|bS`m-ZT95d*%He6mq(P&NB5KxwzK`K11hri@pFua`jNW zw=dPKg{<-D*PX|Y2i!gT*+PTrOj^8yaM))LS~kjxQ`@OTyEhOnM;byeyGOm4>}vFNLywZIktr0D-|x#Ag2Qkvf@n_6X9rHJk@advDC z5YmDoll6rh`c1uRlVBlguBLjPQA5};Mwk0hh^gBy%WZ4yBi@G9){#)NQ77KrjYzkt zoU9zcviIVHI`_EFNXM;dkPI>v5u0`n`Q|U z(f|rjeeg_D#jj*@?tw@_i;_m>wX)hnugA^NM3XA$Rh53LZ#+(8&Xs=Z!7_yuJ#HQ@ z?h^um%aLLtpI^r2F)FRZWMA2ZGNI8^_3_Bc-i?JDfUV5lp=E z{SFOGVzRWy;|(HO;Tw)-W2^|CteJb5F>tr z>?MEx%kcV^e;_7uHDsLsmL}3we$h)p@|Lx%kMo9Q#!_g8bVxc;Oth?hE(BD5SqN)0 zaJi+N9X4xuctdn~e~Jcc*}I({{Ta;^fpL4h-c2!&Rrh$4gUhI?)DO96Mg1O` z9)e~G2?(GUXY}|<_JOgF96Ta)3wq{Nq+^%ThSHbKt6IrWNFXaLK~femq;IuCA#L7$ zEdUfZxbu)Y`l%Rp$7T%I8mC%%lC0-j!mE2YZgf#lhu-G7wo*4$Fd@zwF#LCJ!PO|>Pe)vna#zydtD!v=m*`oCMADFX`>6~>BfTg&Nq>oIv>k&Iyt}! 
zS@jYM%$jXYt#O0Q%w|YUks(Z2T3LyivcrF#>SxVlbcvLk#yWEfKyLb;nh4Fo1&w~{ zOKqr;h5!aoAzQuU)1AE6o#H2+teufIer9sXwWvTfJt~`drfnH|q@g<;=2M?>z~bY9 z@w2l)UVCA3>_5vfqkPgF9Oi?b!3=MJa)q#+Ir_C#yRyIg9JrvbrBn^Zovm+!c0X4s zIWZioBE*0WTiBOZl;yYrtrleqQW9b4~Plbt#u@`8LKBKma)V7D26#9p?pF_sfF&xlmSQV`AnwSr*)hiuk-Q);b2*X zR5bdYN@9983HwKR~J)qCjY83}ts5_l}0NZ|-cXNHOQHAb`f{7o5Cy7UR}d ztwp^#*DPe9`bSv7ibDZgIJ^vDowk%@xJ8ZP&l3<@e(*(7Xgn%LQb@pVqkOuto0}EV zzCClAx_fIGAc5vlTG`Yf^ z#W$o#ZU|bt4;+|#%l(tJ9}vz>StB|23_=IOZDzy7{*YXH_gZPxU&Lms?fdracy%l* zZXY)dy0{7PmF>pFr8l&BHF~>yOuPB*$@D-Y^s7{H*c4JrSaaex$?C@jj!YjyKDY>8 z7|Q+Sg(1)G%K!rUl`M%FPMtafZ2FAnib|>)oIvw%bS?8dV>q=JDdcN7DJVZ5W#jfo*#&h4K=SByOcW8RdxM}<-%H7ajsa)(Ebeh?ej{U0+;C zMP$q{C+3H#E+-YR9pX5j>d}ucMAJpOCp>!3H z-RM~Rx%BK@EH^0vJF*H59R0u5J{*riTfC_?QBvTvnSa!%1@flXXQ1{Zp%^)Lq$3R# zPHP1J=m#Z!;vW~bSS=}bqC4KfvM-K5lH4M51 zCs6FV?I8{m0VF<=dMHMK;@1}jyXy1U)#e@!Y$D7)?S=wi;!`Uv1^HArq183*8JcH# z)qnyS!x7U~Q;sd2Jfy(+pTPJtpeI3o>A7FOc6gK2^IdLK0+0POXe2idtLBf3iB2m+@0%1zmpvRy6utrL zs!x5)Zoyqw!A_$O_5rqY_72*b?$@^3&pB@#1-f)AW~11RP271)F?#4w&x6Fe*{H$I zVvmeM{fzl{-o~N1p(d$~!2sdF9bZipi4gzOB>DPng8+Vo#vasCQ zb$s+_*j5aeL)dfVpOeiO0aP~Qdf$=P6mB9Fi8ll``{UrJG-d+iAt*jwq!tzr_u)%m z#@V3#u8sGeOYEf;@qdQSp3OwgsV`(WLSssLVuwg7vf0EoU-asGb^}qpfz774o=O9o zsV&3E88?)T&+KRa{Oz6nXt&4TlVEGBk=)A$;-n@6QJ!pr;xtJ%h8)_u16sO!_CD)1 zmmUzafW~bq>c<2Maba16Omc}t1>d-*mw}MObZF`M_yQ;xl-XA@Is18j7Lo0)4mBVj zZu&uNA1*>NDmuLjSzJ)@m&-{Cx>Ep6*NnW%FH^8a2Ba>vud$?2Br_{&gAlbm@2Fil zCu231UGr5Hs0DDf1genSx38AJ2KGF#vN)1D5*BI=eZ279bG~qf9}pj>E~Vnkf#RPP zh{KJ!&gDWL@$olKR;7u)H`cC2*C!4e5b=09aDVqiiHzQ_QA4T&tIbLd)Di2`cW_vT z&Wijczi9LM&E1fw4yuXEvL|rrW@rsUfx)Y+xAjN9zWUkg${c5X3P_iaw{ZT>`nC|r z4OeQV%;}>|i*%80{$^_z8cB$Y;_J7`=n1ZyrVgO(#2ywv(#9}vbmvuy`r|+ z8P{mGRy`Ug(M14yZ${y+^O*NbNtk?dN1h2^ziU7;1=Tbi?ZaR5r!2VmnaS+6EloI= zmZ|mR_L*Gw`|*OCBQ+sTJ$j83CF?9x*o<93cf=8`<$#tspdPp1*5gVH?4y#^{g|04 zdC$0$PhqMhTi>&Fk2>1wlQ=r^wsvxo(~($Rn+hgNSfmMKexQ6EqT5w~{jym1&g9lg z3{XqzN=~!H?3XkuGgfb3v#JRp2fm_9VmR1A^#ub&yR^#C5hsXjOPbf;UKU~)fJ=Qj 
zGi?DTBJ_M$^&1_4UzjGyvCWwTIK5dB1S|-O)GP^y@sI%+?M84=iw%2%)zf1S#?)MT zn!c(re6Vv}o$abe@dzT!W_{|CySAkYci`3Wc=KRBdod-C`9oF4{CxGC70G7FVsI^;ML#Hp3I!CCzfU*>`)5L0X z^A}G{)fq95-7-}sm)gfR)U&sWuxt0aH_d#C%G3j1fARc~3Ek_$3@p*_fWCU3NZeqm zmd|#aW5ow+XkF7Xd~D4zVsvg^y`dE}QVCp>-Hc)nGxj@0k76)P2Vn(|u6t|_*x*D5 zG9BSW{xt*d4UE9jXC({1&7)OH9-3Qg09zw{&?1XarwDdQl4`MQxBcmpqw>rzGR)6% zcZc-;uU8!F;r?UHZbQLqY+&D|v>F*3So_2W z5V~+YxN z{+}&ar5U-OVoq|22NH>QF=T>7mY&+)J{&-C?(bOuo;%+& znfkNjICl!`H?rk6*wmDeG*IDL_5G0{Y2CSi*r6MED>lZW_?6FUsVjNKecM|b1Cgkb zVS@o(=$_<%-WTW(ktod()&@rH4j?$Og+)@p=F5V`!KjI0S26G7{~?`nRzCJqVDd?8 zEt4mpXYZDsc(P&u5{zCz)t~rFSSUsHBI>P5k0N@l^;;H)i&N!$sBg`h^MHwr>t!yJ zNz6Gp%|-k3H}qSi(G8d6!kTe8zQ}aHvh?(Pf~q)41O-w6Z{56gm(1yz2~SUe`19-| zIR-7Ap-aSFK{N*a^2^2-g0J|C=c=R38-vCDldnBQ4lDk`(*C{&X;LeW`EZDi(SDu~ z&Psk}w?CJ_Dm3I(JFP6o$*U(n|97+Aph4T%mF65{dIfKyvvD9V=Lkh!e#5d)SDMK4 zL*t9#i}l5wr{;P9^Dg9l8azI|(g%4q&1$S(Rsx#J18S~}ze)$$WdQyfXG#rAsr{5I zu8emrHrrF;Uvp%I_phSklmMs+6ecstI1q#@kuwFsj zGk=K+Hqku91IBkVV(&IGWHOe#uqy`ggrB@x&=?Pfn%>lug9sAi80i%p4W%_HE5#+j zp)StPhTCkUt5vacS&?osbumukJCqX)<$VNZkJQ%1NWatuwzb3_E=#+?-e72G=htzT zL-A{aIzIf2AA+H}uk{L^{L*3if%DnZz{54MfjSH3G@uo@cMd{5^A!7IPGWxXXaVS0 zyGXs`O>`0xG6?O!g#mW*L#Cv=(IBFPicI?)lUurt>f~eTeJD9$N_q6?92=xg*2g$f zWWGTJgrn8^66hDAbKS$^JMSD+>3oh=@$vmM?SzJwCR#-pPED zyvR==<66ON44EWNO)RV@!VBON>sfxO@-s2_hg*ME;Cn;XxB2m~K@ow?wbzlnIobLd z6@Rer#$p#h54Y7VD>5BbJb$;Tu*JGvIcp2(LO7Pd7VU;rz^A}~E#a;=)LQRCc2UqJ z+H)U0&1JjlTmwUAryOzfL^-0nPOb;Z+gu8m!SRk+X&AH#|DWj z9eSd(lgp6B$eYMJ6~*>VETxe+ zG=K$B5u$MeU3w(bG+ zL0-+u5_ywGTubuCJM|ME+n{G|@hVl=8#%)_=IzF;)>0G~zto~-5fYcx;3$vFcBy3+ z6BuvdCpj)^{&^y&WmuQ^bwNXHI^ zob~4k=w@T(-2M2fuL6t_VOoQ!f1!7kC%_=2-W{);qanG^wtY~1;}fP-mgDyo++X=rP{&MGB=a-25MOW5{e1=5=(C#G9ROIiZjBKX*Mtfr7sV06 zcFuHJNH{MP7*OEm7qUyUNE6ZnJ-S|X)5z&w6QZ^mK66=iH(wnL(;;M;y!7UJRTzh^Y-|Xb>98& z#KHi^@4WBws+1&kF4+bh_}y-x#88>KnV7vx?5u64ZRf9i2jX(GT};ge-3k6`l>Jr=hzOX;--(L=|@Y7u`A{?eSo_= z1Hr7xdNb|UKLDbQ0R5M7Awxd?LAtYBG9Phy$t(S}WA%Ej`63gE0|7+9C1U5E&2L6^Ru6W`4HOoz{0~EY3{RwGXj0N_DP7(o$>gQ- 
z%8G%tp-V^0X>I$qzgPAGk*oj2#mw~9XzE1^tgO7#kb-DhWAl6qBp#mMz)-?mXgPi* zdgJXY8y4|6qRQ6~45%j?NdHi)SB&IzB?*wt^iLlSQ6*IvcSZE*Bjbpgxqbchd_n0% zjN&rpgEYg=?8dEcA9KPx)fyxuj_^zG&gbQ~EAzCj!WjW$xRoeIA@j;3YBZ#N>eVaC zL?Of$LC%)DnD=&G;L>qOV!ZlTBs`%w3O-X-!J4{4+_Dy5VuD2q zZ>)L<@PcU*R*15N&@&&iCJa~AYX;5iYzG>T=ZHs_WrL_rPicqPMG1&$6c-r}GxA>j z*$1Ex7&I=EA=SVKhWEC@d^XP2b?HTps9r;=J9g~Ft>DN zydK$IjLhpui5ASuD*2c*1y`5H^YwG!@q%DLrPswD_33Ir4~$M^NY`j^j&>?fWT~LW z99}I1>O7Qnj|J$@yrZ3}^6gHDG87gQSQp^jKxx~rF zRKPJEq|CPjn`x&yz*T0gUJv|5UHxeuBWaQp;%%JmoqzTDVEBNL5G7?f)*7^!PXU)= z3SdIHfP|0HBldgj?hl-$143DGO;-HlmBJ_#26U&*lqFVgIR6~w&xrg?O>ml**lbP? zG8Ee))E-$3G`)6hR2*t8u|IEYCUL)Dq1>+8lq0c!5RHe@(+o_^TWf;h>$v4p-<(H5 zKkE%*&MHM0`Oqf}yce+9SJ-VaFC@8Ix+g!c1&Y+c;r=k2E-t1LtSlI6S@Rbf`#Eom zUDf>cC7AzVp(UYp$uMS-okxK?N!}GP85%L#3690)pz@z5aQsOeol^4=YF5%TgxT*N zvYGPr-Y$``T4|Pcb5s|4m=TcV-AMH=9AwjE9!)J))cJrYKK(M~H5(f4faWL&CxffE zUc*#hvzn!x2r?FH7!k;pnd@SIE<=(*guKk>6{5JEWm53`{+zlKJ^Xzzo98eV- zFHthajI5DHg6%Kj^Ew0OHdqEmS@#5j>6Ii@?qF*$h$fc-ucLTM#v`ka`|9%T=(FVL zX&fN_nk$RigXZ+-Sko)mlh@IXZuKHok&%qW%SY!yW6}>yt(R_2R8}Ur*HL%go{Eh@ zu}83@jS2*p%FE9yTe2~iyL=~t_pUZEdGL4(j)+!++2i@qUQU_4?v6VbsS=%$QsSTB z#5lTAjwQWaq{<8rgp*p!bhjL+qE@b=y1)#e`9 zN^i8nT(%D-{jn~BSr=EGQ)!J|{?LgnYioY%a#bdg6bA!`0li$)2L1DIy!ae`8R9ftl6B6L-R8ppJf}GdX^Zcg#Ie<%`jHg~Uthw^ySS+NN^04u zr7ekE;}e`ciqWfmaUNnA#j%dIwdqmL#6sz6Bd<+8fCUJ`A+_jx4!#wBHZZu8iIZgpLJ*+`#ed5T#Lyi(pM@WUo4E? 
z;7n#{QSvJ*gv}T#dNg#qa>w@0Ju5WWS^S1|12vkEOtq@jwU90+fj(XCU*P%ge&zWY zR>pLwMB=&}L%L2!)z^|qSjuN{LpERLe8(D~vJTGdtE?(jY!Z_T{G1~<2-A88bFT74 zbp;f@&3`E^QsX5ip}#msq&+SN28K}493I(t98t-VZ5s*xvS5^+L-_h4UgUmqFMUH| z=pR-Pj1R+MTHgy+E!VlJyi-!B4BNU5v@p_C@T4OUo6)D}1JV3#uMHM+p<0WFW?nL= zWQ+dMnah>^m3=6L?wXa7Zf0Pw^Nm>VA1^R-f}wD;FRlqKBn>u3*I-4sp31MUa=M1_kA#~X_{)joSZt= zFfSbqqdjl?k#LrNIIrzp;-9BH3g7lpcXMLj>vb%ts1JeY=11 z(6I|I4*d3?X8KswtiF*Zp-YD+42tGla_%nfo&Y&>eGW!=t7G5Z|M1c0&pBVi7QXZ$ zg3l<9In!7fo^x#WtxBepeep(RM602DV``7I@xHL zdbx2XaOS+ogSoSG$zP9qK&jl<{&a1ME&=aWG;zLGj1Od-doYGjMgM`kJ$!Tuni^Ji zA(CF!ReI%A@bqz}%W%YIbMT1hBBG=54NUi3;SG!wB5Do$A>F{Oq4AAPS+vBEuz<@w z>o#;we_1uhZ%%p3@P!PzmdR=9WNP{BQ&CqUP@LBlIh$*wk;9mRcf$ zoeopX^~7QRnr1I!?Z6iZfIx*eiq zL^9De&orXp)c8_b!+0@;4_nY~%)%wt){?VB{T$NrLNZntPST|GrixT-VI;ntH8gqc zV`zv#+Dl*mi}f6T5-w^iWccwBlx%V&ZTcYy#74I;6!4tI;C~V zc9#`ol-$quK9}V(JB%2ZSV=BLzXFRCr&^lD(?Ef zq$j%2L|d>s+@$~A_3nSq)0JclzWYlTvSnkDS7Qdxx|MZM7<1Q`c`!9hFI`p)hQB(v zM#aeo_jx;{W^MekZz^oXB6YKQfEX*`9_B1ZG2L`>8wS&MX8e+Qe0vU=a^%wZQ(nP# zxW|t8MAmCSP;a zQ|q6K;Lum?B-)kYe4{eeVtw4%@uh4`+`RHMDdACpZ1y&c7Vb3+(sQg&Oc*98OB+!` zTF)-B*h8nD;<(xOI^HPsU*OWJySBS=YWMAoA6^-#}G)sJ9%4s8?kjZ^sz^mrbp zYD*D%^p!LELSN`kY?!;~iI+Ny3)@DmuT&pLLTJWC-lm~13AOEHq!5{17Z>itbA(vD z?X2$_VmfJ$lr$%}jhr=HMQS{;%^ywk6Uqs8sc`Fd9#!Lh6RPv=P2)JvS`>SZxR}=H z**WR5$5tkLv?m>rMUMv$Q=Fb*>$h1M9%=H^iq@a{TzS0Z;nspVBFO@kn8Q)^;jF^r z^riBxF(ea+)<4krTdnV zVU{$(;p~ESH=<3smTR80%&E7T$haDz#WwzABDSF6x^@$**$3f{-lw+R-C}gK06c?3ZT#irwJuX@>jMtQ$z@zKbxBqhZBNq{yp<_}4#n!0I~cVk+3KwZOA~TS zN*z3Nso>5Hv8E`Y2}*G7A=}6K^6pAAxqa2IzWg;vXq$#+hO_WAe5`((Yfa0z<_~le z9lI;y7Uhg6n9=LAHv`tfm zm*8=5xEnuY7;ABYqekdYzS9}r+sU1O$To;y*)sTE%9j%6b>KHX`SXq_FZkT zr1pFWn|Nxni3?8SS$ht@gIl<9DX)}!{M=0WkLd4*XLStdD9#$VRBUl~w**hJNZ#*$ z6X(Cx`Kmpx^virPbUJJiIBJ2O^sa%_>5QQC2;wF5u!Z!jRj}P$aTYtO-YRmDv)JT* z?@67FX*QoLY)(qFvi!j>HJ&K6D6{<$r^)H~Y3n}U@*@aK9>hLtlfu?VR;%-2=+YZy z!5p^Wbk2oNr3{CjT#V*9{`s#Ct-f?VQ*OG=7ssDg*sy1Ds%t$mc{KBBOAEy+xFA*j-Xk1&rW5HW9;@V; 
zWcSQgcJV_XUCxD$o(`bb&Bm39r-+G#pBkCypzfrhp&pN|Z>oU1c-IVO&e5ZP`` zwO_Fo8x_a+PHr#LCgr-~&3e4p^gO<@#dj55F=>~!P=9@QO2B+cltv^Y4)x9S<(;+n z<`ZTzvt?A6rT`IxVFmgtC3}zHgWf)~Cual$!+ahH1x_`K+vY}G9)1E_#Xq(i3~Mpj zV?F6uk$2v$lKFRx;K%oF-2>Naa+SX>2qNcFg}a%P$9>J&<9lf<4+G7(Vv_MY9K^D| z_5DmM?_6={N^YTdJ?Ch1A{VOZR1sT-<0}#gRK2-nRc$+yaY11z9qYwWt94RrWn}o3 zjI5&ozot26;NyS^^ZQbhZ>x6q@owSq%N82tW zbw1&7vW!P;xl_3T^(R?{VZCRKl9LXf>_=+9eAwld=XvyC>E%1_mP~cp#KBs;Gr&cI@QsVxVTAF%?l1VS#pPHjFMpQjx@05(!y}` z7>cl-xq?wAymfe6J{xSFTlRnlWTVYdtn#W!09|ltnttPHIvoQwG{?`XEZ>f< ztEksh)X=b$J`Yb^vIu)}xLhviPMukwE7tITChy$9R6ba(`4HL{qkddjA2#*$EhXRH z)WJ?;4yp%_y+m}yWvX4m-$tJq4%h8|dZe6bs`TjFq2Yj3-92rZu-=9;96D|K+xg!R z`JW}sZQm18o8vCN7$u!!pMxr2u4RZMbafNu664~;pQq(Qtk;4Z6~8;}RShizhAF_l z6tifNwsVnGKv-m~oN@PfFv2(R9ks4~Db#$v75)4wgs&Q*lk?`-mGDbjxiFd!aIT(M z81?p3o2->H_MgPx4t=b8XiWVuW>guHv^u_hd#yCScK)pb@zh>kI67-YxSK9%wTgso zPmNnkctf=RDgr!KRU{cLbZ&Cjy{70il^oF|G@_y3u2C(JA6~u{eOgbn@y>5D&sC1w z@vLkCuE!F7loiHfn|H5^?F^X;igX*-ZjSYOjP!q?%eCTpNBbc^O3g`ouMNSW7ZuFT zC>YV4uH5aGJDHv0Az-p-BQ&xhlWVhn@9}D?YuBf^^b!*W$!#NKtAE;ZG^k{XuP(wy zqwSN-6#z#o3~h{Dxyj%?Y19pe!Ykw1)doGd24{~?obW2Nj32sdV`i)B82#kj#zP4@ zotR}TGA2b~ojI>pfS^HR_QGBBxGED-4Up|B6Kfb0!;jX!@89?QH52h^1kF<;@!wXR z`%!y}0dS$=>Ic^~4uiBG&$YBDKjU|Q`>(6ijvQs*R?e&YIo#FlXmr<>g6bqCo${RL zO?$Ykz4ig!Cnh~UK3*I05LWu^!p@HF5?{?($ltR7vgbdnV0a;%Sk;TG<&{pbn}$t~ z(Dx|)&1M&UmQ(eaPA1?~6+Re-iS?`hr-KniH-eQ<;TRyq_M*YR`M-QHQUk?{YY4`Vc;E?bGZx z$y)wy-L7{ZC+p;}{+f)8xBz1p3U(~bu~2jlcS? zT6=jb7yZ~yC%B>&>1s%&oSWx#_!ah1r&e>%-TBp5RBG$tKG%k>eB(QWlMs|2`7>1K zV_c~0TPw8(je^sJ{w=DTnZk^_+i#$)PkCAVgJ*aA%pXTOeTmq}(e8nTf3x@+ddeSZ zhyQ;#d+WF;x9)uy5fnrPq?8s3=?3X==N`yc3V$J%SJb**b%YflvgE+Q-u*FjFnBNnU2kxy#EV&*8cX}eHr$b6XF z_aD&I03zEOyJpOV(#!Ap>jpU={KRMlF3L%eAjhi;@=HZ^Mwv?Pz1z%+hoR&A@ zIVYk$YGd5Xr`zK?dD4=JMRZ)6e$QGSogBw)eI6)rVBDBSQ$&mB*}w5J zGh|FKYNM+`O!`as-S$Rh>-%>G!TApKcCMuzW;E5=7L!L3I$yY)hv)u$Dhf|ADMw(|`7fli`i7M=)yne%)$cw%e<>Mz-(jo0+_3sx? 
zoC~ncbA>1|7U?_?=eEkpA>8tnW(}?Kmt|F;O8DCCm@A%QzMipiLFr_~&AXbns-~Vb zZ~q|on--a@E=K6K?~CLI7e-E^gjm#c1JPVMGS{l0VBF9wNrUb21t%DR*-l~G*KPsd z)ke`EuyTUmbdl`Fg8s;$0GoNR>%!$w0A8X~zy^zFv#yLSf%HO~A}>ePX-USlT1Yk7 z;tGj>-?iZdTl+>LnXPCwN<6lW3@$)+oNUW-buhe(E?GnPB1Kvwj$JaJyvJ#XxJEQSx8`+N6S#R zY|iY8wl1uw$U0`WEA?%tncSt^E3VsykPTKr?83CXzv83&Ve7$CZn%T9Ov_};TA1kEQ?5y%SVb9~r^H}8*#oCwg%&-Q# zm~V|lyyqW}gOBIhI`rHhcF$5YKvzy1@Xi8iiwO3?YL-04P5@HD1cV zrQP08V$s9}cIonRos{c;$4M(1AnnIn6B<=U2?9w>M!!8GPg0~}!8i*&3PP`<`H;1GMB&u7*Qrs- z$36XbfauSEqa+thn|*)sIIyYEySy^y8z+rMpwpU8hRu)5^yt&izL33%;xhu1d$ zaR)~&l=U0vpoOyCC56y<+Xgq$2=1d`3D=1MQ8ols9Wn6Hw#XvdVq2ttYZ`~n#D14a zE1dnSW>R?0S;3?@ef=ZP?Tc}HZqZj)pNE74>L;!5G*r-M`BJb0dO~dD^g6pB(;$A$ zbzb)1+AJnTZPh%vAiIj`&&uEMxy%YmXg-{2HcQTN0eIzXdo2F>9!eYg~*g zABU`o5&pL)^jiXki=*##FQECjQT9DFxur(%)_nw<1BRkde}vfgsyL{n}pFVmZe11R~cV+hr(6aHXi z&fB{Az^1D1t{WXJhGTad&Zo_R5S+YI0+SD|9OmXHG%?=pLwD3up%(knRj3fE7KCRK zOh`@P=I8I7MMbPjt(S35vxhUmF<)1#O&bIx*`GMkv)Uo{1!Ko~*H)g@W+pXwlT&76 zgSidn2IPRKi`?sPiRFvN*#B+Cw}NR%KuI4#=Q5>0I>K||lzDVotc#FMre!;2+#P8V z^_~%BVT~LLv3n$Uzfr+qEZ|N^?kh87tm?0rr=b#NJwm0+SAj5}v6OEZ|DD|bgmb2G zK%A6GGftHSNXoA;{45vas9WR+0kQQPZt`8R=q z6@b9YJ7~Ii3EhUcrdlv?LK>pEIx#Pk@7cWulD1iX%<$<~AI?pF(^c8{l}4vl2ll;l8HHHx#$Td=fLT04lV z%mYInTk%{*x7gTxx^vddvv)w52pd8CxW}RvyW%n*tWr(>_>F*)ly%$h+6HX0a6)|| zteOuQ*R_)Ix#SN`%vH$Jy@L^G^3^N1LjT;fd+!_zY^`Ey7rw2Cp>urSn$^ya8uDTx3 z*SnoKjK7F~1AQ;VFx?|eOs?0{2e~8W<6QULKL_RE#6~XA>hw^?Rb2`U~9$ny`g?`G7RsvwhbGWSwd&Ifvs%y zxY^L*_rqiv0e;k-Lq^jzYJhb37P^yNwa* z{QifQ*MrmBRQo~6027f+cIItkp;G2Z%2-%yE4=rlRK@GdRy- z(WB?mC7pGWP+(-6kM4e4jdNTHbUnf=8N&uwK56=cdx^>pJTvs9aN1>d--7g(q|DEp z79Sft(yM9~D8Aqi2_;=M;`IAgC)uJoyu4H>4G>Z79=~~E=n{af$9OG_h5y`$e`bgO zpp-R*0RL4TM>aR5g}w$aiKN!z7aNOEPb)L@pjVOjKAOsoiHP=S^vh?6_5|n>$Z-+B z2f6tJ1$tJm>L+zzBO8T&Oc^$K#qJD=Rf zIT{0Ccd>}>EzHnMnh{(M%-~>bJbpN-n;z{U4#JIkQu=gfMr6?zNJpmCOHBs670&xc zQrAU_I@3kEq^l3~Xbr%$oFzb7f!Uu9_KX-8l@+<|#dc_%3LOjN_Q?INwi29&$G8Og zSj_+R`hQphs)`MG7LL=0I89YjhpU}WxTT*bs5>QGYe$*Q!(3nY7rZ&sHjGZ#870n9 
zyeF4JaLi&S+lS6@E;KCpN4>UdUP8yYQkaLWwt98L>hS33v4yhIv~D$AuQoXeC^I-- zbvo=P7%=k|Th>|l;4JiCm>>U;w99n$ftj1@1rMgLS169QYjHBZCj(buVXrLuq8VLAn z=KD`a?9XHe87T~nVwk`?O;J`%%GDf?-@u63PZMS>ZQ~>vgx!TiW<&R9z@ok$s^WuM zwJquPx-n_~cRJjF4vZn->WfPWZnORQCkNxRhGw?y3b=<7(`><2=3$Qx|O3V8nqMg_F{kK~Od)2HP<`!(J7B9KD zSB;YW-`vv`Qs!UtoxMmHpz_>T&aqInIV+nlC#UjXLndRPH4mcPo4?pNpg!jYYG|;? zg0HPhS$1C!ys@S#{iO?{A#znAOhh1AoTaO7^;LbiTxO&yJ^C`omr&38vbo}kj%`oa zfO+5}kJzD+gG@=o?uGn|6oRC+_7A@)0sDmkW0m(C%isGA%=r7~`;p+-?QJ)9;xoW^ zlYH-kvbyh%FMOVo$%OqSD=V6oT)pSv>a+8Qk}ea+ z@!!^LJEV;qAWeLp`7x&+Urk;Z!vf^H(Se}Px^Pw+fN6m0NBR*c29(dH?=)x~dV=v> zUj_s`I)~k(fa48n;n??oX5YUcM>Y*0cv55UoHib zbOKR-X#u8gY8xL>R6Jq+Rhiq`#ZE@=-=5hn%uITo<);!}w4@$s;pMZponLW_{)drT z-nVm$B+fZOoN8*$TIbAzW9&A3mgBll8oR5-?IPs5ltx08Ju!)JcDC3cHKDW8OZdFJ`ndS3ZEPTooU7Z+Z?IjwB4LbJ?te=yW0b2UfcpJTxlGQfay|rGtJUf7G%| z%lyB2Il+B+Kd6C=^>=mU-z^nR4F@Z0tWM{sKMRLq3D84owK{*xjvj{ICq>j2CAdik zB~Ag~n2nZT_$RBkLBpw|F>76PEJH9gQ>K8p!nPOZ;@_P-bEBHk7{3wg*e@h$wuurM z0+Em;2BMW=T&@h;x!b)_Y9ydVxx9R5hFNYFo{`js;>ku2--@p$LOm5nsK|mN1%4Fp z4Jz=Yy*^UJ;gSR;QIVZ*2G1xc@>vWCo}FV_Sj2yJ!7Fsr&6mF~=S$}{GMmkrfKzJ-QOMxHmZ%k}t0l9E%6NVcr(k_637_xH|I7aPyC?kdA5E{}%%8?y_tYq2$LvO8>fT;4 zE+$7GwxJxK=g?x7wMDrI<&AiPBRWq?q#=2KW=vJ<{3o7()IuMC<)UBPMrZHOHt9K< zlsg3DG}aEh(Mgm;p8k|t)N{JZJ~AA$=dEX90HbGJ8f|itBig=72pHWGuqrzceBV~> zq2#V2;scU$>U%tMlm|3`_)DCcZkWNE{l{VnLp=*4WVWx_*B^x*Lbv%VJDII8tbqh4 zg>#P{sez{;sx|$Lg(}YMWP)&A6Rr&!35#a_)jldI~XOd!!@cI7&#O;SMV7`fp*O!N5Js*yGBz-KO7 zSRUh!PUC1@;P1#y(|q71f(|Ko{(Tpi!*{`Cu)N$93b|WpyMN~cGTZELg-UDu1eV|Q zsK=6tI5^5TBKKJE;#W*)uup26XG%ZCHJ7(go>{Le*cBH2V5{L9;?q?07|Vnn99zU*uQF!>RWmrdGyb?k=`poE z(cbtP)|EQ7=U^>8{oFp16T_8sP6vdYCxou&Q6J<5!qm z21Ra_cS-|_LIH)-@_V%3Ag2^$c!X@0`9%I3eE#>(r$FIg10#AEr7RWhhR7n+%GR!| zE^$GVoyN0aLL-0zBCz-_e%KvRct z0_I70$FQ-(ajX!Cpb9^Jb%)I2L;8*^yStffX7I2>jf)o^p1q>e# zSU3Y#@@R*v)0h+!&UZ%mw6r|wGqbbjKcnF^_6717G!uwbBvYcf77qz$%lRUGR74i!@kOR*0uI zpunoHeW6%qk$ZK&yt6_6?tl9(lLoLnGEuzy{~vN4(6X9csA8o>45#^=0zUB#n*h_| 
ze9GMJeURCxO&xq*!1(b-jd(2rT4GPr#H!Yl;pZ%^I?RKYRq0RcSblSVAbJ!4jGiv- z=AH822h`uj96_mj8`jfjj@5I@hjY_{XLj_}2>!W3O>6+!maow{>?dA=&GJlfbm#&5{n5Wr(b(tI#mq9s~-3Qr{QURU7yXDor)gS4WZg45}2|-&6(mR z@xl46C&nK<}W`Rb4j}wpZ|8UBuu#l((y}ayD-(T8yJe zdl2Iv@mu=>1Z6v*EgIX^>PGzc!qx=Pi?!cH{HKS?E(M<4;SR;W!`S{gc;G&{qd@%M z`{d6oR|ZEzo0TTYf&cC*a3zZ03iJOK_5siR7|=8D@#FpvTm0WWnuY;A8Q6T2Gf(6< z;pGL4m7W23)c`QY;y-*;@ElHgLEJ=JfA1$u z4oHA>PS=zI_@>@7HL^1_tkb3DQuM32qO*``E(h#MlZ*|u^NpXKG9?{g6>_dn^)G*a zv77ahcLH|tnyjSo|48sDvWS@y;1fgl(7J>&fR>m{(G;6I%57(R1~j|$RR_{o^6&y<;YW=v*S3%CXxQI`a=uA z4v$)iK!(y?-k_lUC!hY4()f?{(-hU8MYdStksY&>*`N++uZ$KAm@<*r&AD5--6&J2 z!7|D6K@sKV#oEzsF!E~6E-ts?O57zJXN~V-Sd|DK<;igPv)sfjM_|d?@80_V5t; zl)=75sVDoolM5TpRw3tp!&SJH-um?27LZsQ?5lkVyz!+(y3afBHU>n*ww{24GE2Ax zz9+#t?rfHE9OWfC258>c1xQ=0{}&Q|^V2Y#ci{F4d-!+Lr7R-j9vJBvQNlPvCk|;w zL0j8#g+=Pu69@B$of5w^7{4_BZrZ4&p6GXKVSdUMD4+ETiqVy!@KA~&16$a)-SBjy5;pNmq@?F#gAcpJ_}Hl0#-=KoxflB|84>QN!|p$@9s--|Hm3c;fOm= zjL_^|dE;i!A>ZE8f4Tg#k=z96;9M=DeIvG>mwuQ2UP~qO zGz{mjOQWem`om~8C-u4r^>7nV_;bm`_)y=xZ{}M;^VS^}+KD%@8M8nwDV2pFq7?PQ zbCI4^YO>r@jTzEp+D+a2c;HA@&D?g|B?Ap;9b7; z?9JEAwa8)h8hXmE?YB;KK3la(EdUVX?bH5XL^$un5UCa;uGmt|`l|(1Of)9k^T*-} zmLlOE6uOy71T>nCx~Ea*h~1e<(b#83huceV`5xU$S3~aWlFaUVH+{`sob0|%xW2z@ z0z4^|pt$Qo*=yjiacbcB*_XAeC5$EvBUHn%CzA}|Kt-ABMJfcAc@}jM+aMz*jo;qE zSGb=LSmC(&Zn{67;O`D_<8n+NfESAwj^`7JirXt8;S3Fl(kWc?*C*v;5)oH7_Cfj8 zxDC+59N^q2O@9}2VaOD773QnyLlxiA&#tVJ99$ltLxm*Qj6*8Xx zB01zG^dYdFC&3fPoh^L$CtC@UE}4^;aF!^ERr7@mXehTt8H799q3=O$Ol4J5cMdJy zE7o&Rq`>%w zEj!<~T3qZAQLpALJ*UnZ_4Xk|#;wMp^2p<|M4Uno1z1p`ereeWjmZF5|v2>!oq5ZRMSYTF-d8tWQQ;=DoDo zeF~vYe&2pPNU#)>o(teSp|YNKdU%uvT}u~*`(!p89Xn7_JpG4FChY*Md*&HcO6h3w z3K6$7JIB&g{;|d?oampl&P#?D!e&E>W3ek{WVZRfy~uMF#FUwXQZU zG|zlm@c}8!W8&DigWX7S5+Km*G_w~nBOhHk>CTNlqshu$eL}`kWELI(;oBKn-rylQ zAE_hKr@YBZe((_io0lyA;2nPiClD1Xz?670(&%La->+o>46|(F%xgYPfhDw%lxF3u zB9qxlE`%v>#XtlHJU%y?h%GQ9RpkF_jwowj7hT4>uJsY%2ePuWsT2Rqe47mInqP`PqJKSV(C0JSml;3hPP84zPIG1fp7qfHPtHedFcQC0sfZuKFD78B-D< 
zM!NV+^?KB6>KMkMie;L*T|vygOy%b7)5F~wI@jg=RLsOPy_0iI;%SVId+@>Wdq(pg zYZ!o&R11UpNSwGRoshk@!W{Ls>d?mc82L4Mb(Cc>L+y!l)ra4N!41#}VJD16y~$&7 z@VZ7RCiZu2xj&LyCY&x+GOOfqEAYdaz_XCp3Mi8dKSuJ}!h6=r@$~sbyi2I>Ref$< zZLgm2&7O?=jFf5m8Q2b2mA-ZP%|ja)rR&OzlhYf6awi5RUgZ)Yzij&ePyE10Ad3<& zGG1JTKbGK4J6}fj4-uAIfEMTgy%kFiutw+m$EJ2ldg}XsN0BEA-}MD)!p@TT60Pkm~}jrdo8CZ)~hfy7Vz!q zW`*~FS?{e7?HW4X`r|&e_ARwqtA~d-Gy@384vdemg#J5m{NW8i61ImsjTYv_B%rvon$19tuY`L!=k8AU`C5Vr>Bz%XjJM3I;B3Grb$3R-GZe|hkqQk|8%T(Hx_p} zLK>Z^R3o_aiY()@CqW}MW7*gIMB61krUm5S9oK^!`9w9WDksSI#k!$+qlbCfdT_gH z@=m*iI!Hzd=-9)B<#Bh$IG$hZZ3ofpl|T7VAW#Qyz?Emb`cr(w zDVXDpm}4oH2$y_4#zA4{(vEt-fXqbYTiNL%&Cn-4a4Yto!D)>vVugRoPGzxx(^*x# zir^^({HPEoG(*8x*~-j1<^1S*@nK+W<6cy(?5j4e)T3F-gkZ{_&5}NL=@)8^Tsib# zi`a|5wj^Y=6mKe)Z07cwN*D>ycL-%h$rmHj)HH!`bHDLoafySD;&(3OsGj?-s0~wV zCjb-7;zau#q}g%bE24joUh;QNubf3B~ujV?Ki~7zCVqSfoa*;{qE?WiYpb^_Hg5d>-uz(`A3|PeyKV6K*&B+=PqEBy&*U;bZ7%q!X_q ztxJ5r=10!7HqmqWVuik&nc$k+(fmfy5j!Oo$_pw+2scv=R=0-1vU?$4uNf$p*>Yxg zL_kh{D1mNN9GJoD0H*jellnc(hVV*tLuKowlLvh+=o z{WAUzo8||-7x;ND7Cwf>YSwTyEoeP?<>Za0fryLIxVTv=7&r7KWn5|SJSVmkPqZb_ zSC?_CpEkiij^62J~kTMNnnW}!~E z+Sd}J-3y+|=P2|^^T(0c3rygTD2nhQh5Z!r&o8l@N;9qKDuz_!CqDaOq?*05_#nZ= z$1mTmWi+N`6p6sL@gp#~GiucUBv$$uryZyv`^si)TKG|(wKe$-$}!adqJgHP#q;}r ztwAWI01-_xrGZ+&H)%{4^=^Kw`;K}%8i!1be???}a(5S&K=^(TA3cjTLP@#%lH-Jm zAD6vhv}y;oNqM9YsUu^4SjQ325iMyBgA23cZ^jHROMM9&-NLkbFc}*=FID)ch&acp zon+%V-4^quSU)Z3SE!1qFB zNqbp+#bczeJxG1d z9_!s5bQ3^$0h7m>^hp17;|QD^N@jAxoSS^gWI7J$#XEkx$!?Tlz@#gD{A>a^G37>m z4zZ;%Q2lEuYsv{+YtpJc+v%6cZD9QC2%8r&WYTOQ+EGz=2^(wKzeF!J*NS#5eywx& zYK8>zV^ys>3yxAWwa!q|&V1%Ns0_p`oVkrJSQDK z`{pELR!K8eH&cHZ>dSMCGY>jCWQVz*w+kW9c)viICpjqcpaBtPtieBFdgN-RWPuQuTD;JQ5l zvx1X7(iQ(=SxP0~avj4aTdpX9hdf>PT+pD~vU%;(=SwF;c)-7st6PKESKE zIrM5V&rI)o8@Fb)N5_s^D5q-yz1A1m^ta*4&WG`^$vMd*;|8)^PpsnccfK1yO^Z2i zfyR32E^QdiHec2`dGGa2<`eAtrF+1<;-n$|RPpXbacgsK#t*!aYK&~s&+DysGHit? 
zD_FLjirzhf8s8&7FWJTn-vON%Ying`*$lwTel|mvCDSCq3{gIX8-dkqoqtTu0I(#r z5}>6Z&XAw8n&$RISUruu*riy(B%;hcWU-BsX~E-{Sl@>FKP8svJrE(g(bfxg!P#CG zEup734y&Z1|HEJ_pq&JUQ4 zro?8hIHwSag>X1dO26fRqIq+?Mpoh{^CI?8wTN>vb)rm*SZs=sF>8Er&qkwrx$@$G za_ObWtRj$hoIn8Zj0FIEDSMzyq583QvB9LY?!0f^B*<8;Q=#W_XE~4DR+3eqbYxiU zXR5J4P46Y)A2v{xnV<2v+rz&wu6IaMU0SXx%w9nGd3~EVW6o=F-8L8vm_Mn|+#DXN zMx+LxfMqtidbB_J>qhSnfvuMkx<{O$l0mLCwo1KDeWvad-Tib>Kp>eq-3XIftFs+y zDoN(ZOVV8C`Z+I22S`j*!oNy4tY^D(0dk#iI5>W0zM7|Wl$Q}oeN(WfSOfWOBq$8@ zGk6SBe%xV6RvADKXaVEt z8SVqyS~op%z9ezDOF(m;e|A3Y8LDhG`9QU%Vpo95u^<-?BrR1|-|-xZkH03;j7L7` zzk~?ziz%(!F~{b~xJ-=2e+wEJP9cN(K{~p(Y`Uu}UlddgQLouz?&fwpqga1fxYDc2 z?)^OqH+wjnimVY}(#E57ky=%NYnI4Gp3f*r`}e`VA5ZJ43`*RSb%|AyK8W%i0yE_7 zpE;TePrLCYmg!+0`!U@-B?tUe5Mc8R-qf7H5d^<9`^M}yDXCYTJ`di$0AziuMkJ3y zKNzZLJ*M7jT((->EXL0T{ZWg+U$_I{55X+Mv8BX_zl1Y8u-09g{TYIjtpSzbQ@ay* zTOXNPyNIkPpS@ijI3xrjwo!VwH)(&wIN_1q&R#Mx6JNvr4B6hSjV_{c-(orN9oNi$ zB&E#)yTz)E_HrITCCP_H?I!%r&z*I<793QQ2}DqZjs;5`J~5CkuETVd4Gm{hLe^}< z>Yh@i%nQ7q)fBgC${=2ByKdDCFp?S%8s^6J{vabx>LS#JBgSfUm1HZ%^xZp4%NUp6DoL=4b~)Alhb+rwkK+Yz#t76nN&vYq@~-j%TbiSQl+wHk zaLM4Pb5?QyBuuV4;rqcSXCHA8N7r(w3kCNSIg(hvc#}xAQg)fkioSXi)-|PpIE*`m z(RtH@h;aqt7$7j}rm5WfqX-I|{|RK?0T;J^6zXCqV?@>tJi@ZfbFZJg>CJ@z-XI9x zK$-7}pUcRq`vHy}pBQ0Xfln8#wfp@%An2l@muh8(M)6Gfdor80l*~x5M!(w>t&CxP z`GgA3)cd(gV7QU&(bZNOIhiM0WD|3j--;JSD5-8(yOU#UMXQ)(nk=0vpYFT_;Fjywz7!SA9=B;OWJREm%(2tIl-82=0hL~^ATFMNPIqy@hi=qn~ zXyZn&qZZ>KhZsN}*QzX3u#|dm@64FXEm_O%b`L5b+A(O>dXAZ{R#JuqmN2oN7M6On zY1xg@P3SW4FFt4-ZD}B&ST6KL`{UG!c|Qv)c@?IaX!~;L2HeU8CM{4 z4fN&J1BKDBc#N5VUV}hiqVp};Lknp3(tWLz2F^1U+^L;fOJEfhk|t_uIlHtmiNO+$ zSEo)->5B#gu;r}%X4JM?6)2CY>tG%>T}4KLh&F^_MeKF2G+2@iAlKO9Shm1ft=!`_t737{4esOM!fahyW!_F7+G%iK>On8Errj8 znTc!7Z-j4An&!Y*?9GfXyJf?ISU^YY2wh9} zY%%i!2fRlm!s0j@r_;qzO>Y@+gVDkCcsqA8q94cL<8r6X6Fm(jg*#4-A3x z3ON1OD$`oxzqJF0bof0lZYgUJEEDC!#6kTs0#puian6tk)65ifU|?sn>1Z5fI{$S$i};|ehpy#fdXGp$~-baRq8UIb~k{xFNPR# z29vQB)}4`XeFejBT1P4@8cJrGDsEChRjU;_S9QaR$df>Avl8Pv*Dl_=K|GAf(EeI5 
z*yjS*POK>ozuN_Bu7&yn#`Wc7wTPbm{Aq zRWP>zK)dI}PLTkes`632)+zu(p}?BJsahaQWdLZgB{7DmK5{*SRKrj=PxBW&IC)Wv zw#F_ZE4u@6j%$ZOeE~_@jl?IbcB$=HesBoRG=ij9{}qf=?wK>UO_SiuF~U!-l3zI( zz}H;{(ANx3bGkUCnEp-Uh$C!+ew`G}ZwVlHg}Jmpj2b_Yf&G|Ffpasu$XYN5?*Ob_ zeBR4_9Ti&k(DVHUf}}6)?5K!1_=m&~+?{&KBiKJtXWdwM>^(<(^G(vG2PSfKrNHQ| zPy5NCKw(CKM{$+*C4P7`Ck<~qaFq{Wb~ZXFRod84?CSukEnj}1iA9Qzk=LNrb)4bH*yl{6KH6GIbZm}L+nLk8_DmQx6sE} z_$pjEr1So1u{9L#+yiN3EU!~v&W&50UJf-r$wOrx zh`;pBjo@qkguzHj(vVLDdyCIQv-rn=xNU&qNPyk#)joFHA84*)^=0X;JFOm}5X#9@ ztCJvma!$A#G!6YidljDQ$_&_{&RhiT28>fLst&9#Fdj?kAH$R&%!R=nU0y=RjRP-J zULnFyRic;X#mTK1Wjx1r|`H=M2iFA9728 z`HGfQ5M2u+fO230 zR32OfoEnt}=o=TT6{-iGOq`AY$Knw3(=7=jK=I+seUYEUV&zQS&okV-sUO2zP-Rtb z%}O}Wg>jZ6Gq02D!O*a@+|esXl|FKE=njg68WmC%Sy2ivNI9&q_qo68;XO%blm>aB z=BDBIoHWnSMM!lX2q9tb+@eufxQEgi@=z)9?Y(!tq;C|Mq-(gLq|M~yv4xWYStf|# z9m-=lTWd*M9b6Vh1^FZA8|_Z!Yc{=CM@N-Lqr?{8lJSwgt_XaKldB$agzAe)!h%mz zFyv95BVi-nyY=BM@GDa%%JbCO=ZtSzNKkJXHV^9jaO?A?uvSylM1yvGPKZX>z;0e` z7`*c(PLuTyEdV$fuV_^r>y@kyNGb6YEB3_uaYTT%OQXpW55o0Wt9JW73wL;8uP4@z z!h0pOI#UUhX%-gI%=za-ZlM(w5kw!T0~VL3#D8~ z9@TsFOocoKFGIJg+-|MhHAVJ~zIA|dydM$}(2X~L;I_=_7G|Y8m)GsSC%)~GEB`9N zG(<*6!lHxj9m&aYI_73MZ#=}rT+mjoophGoPN`(Tu!5Djv9IWU;K|N=ebS8i}uZ zF;L#)G}Z;FKL}PyomJNw-OWdWF^COF zu90^4LC(P*9#2*e(re!wHRg`yP|_e(JVLtPg&99y`sIt!96`WD_wv_;;q9B%Oo_yZ z8C-pA)s#r;yJio*j~GllzSJkmiu={J#d{+ifkd^sg3E~QQmF#xkLCZ<8=y;K`b*uh zGYAYH5A7#PjMlnEap@@Vco4l8FP?gL?k?9>oc5R}pq)NMmzUtII!a&+c3ehGD?N6` zn9-J*Mj*buBAdK2Mq3eonH9>t^kFs8b!o72C4;4d#q^-2f{Rl+fn z0Dp~rHjUF4ineDse)o z;Q=v6GB*$bkpK^`q~4RpG2sd0hI1UozuTF-)m})n1LxA3fk3P?Okr9EXyKo5)1xiRFQ%M0J^DeM^Xp!!jD zsznNYdoDes`7?iY3}VEc`;Xi2b}+vY8sK2ezoYbZt)3tSU4{7=&suL+{o=y3$iKQ+ zI@VFM4-!`>KrA%Dij*+Y*nF--M|5kMel+MwM-!4VB0aeV@d8cmE- zvPJwhm`%`oG2!m_Fs53~jwuS?*P$h@0!{fI34Sm!L7um@oDq|K5SM{ky$3^i5HLN` zux90`D>e8lW(;J_gXh$0p8wVtRD`UrCP-2kL(6E@2SMd49}{^5(;12G_Gck%Q1gJv zxkf(a;~or8S7F9Dif(VkiCy0x%zMV1*2mBG@c^Szz)%H7yuVFcnQ1j;*mPlrzfx5l z*Tjjp(Vbz)ZX8?n3cEzurYgK}98=K#s_6~Go+ 
zHaOt@r*8(oT6OrXKOEM8y>L0-Om;7E8*LNiF;fp7PZ-bXbsW3xjxJQ2!&GfASD+HG zwy_?g$e3GT-p_w)Nx&lF1d{Br(q%}~g!|BViR+Z;jPvt`tM>ZjQnc0!?Zuj#;4_X0 zO<%kqx#bpk{{d`kGBzXx*Yef2SefIOmM@2QiwNkif9pf1n1By0={;L~%A_82ruRz1 zqM`R%gug*j8j323q^TsthYxRV@dmOBKIwljMaPlZi(Bo z&QF=d3pxEJeNz@HReTV~vExcZDZMrY+uOFD&0CP_RhR7sSWV?tS3C%Y3pZN2pfoBl#^Z9 zS&6OEDjM0&++8YrlQua&9z9|5xcA{5u!wAlv&Xwy(TmZ6=<6 z5&4jRdRTQp&36x~iGgv0WyMT%bTCahF z!iyySi-&IqTOtg0jz}Y#rrSXtadMte-I`|)h2sMS$~&cb3|EzV_K{H86eOb2A?Npa zurfYMrqnDTn&|EAf?~F%?d_T4abfSx*`6SJkGKJP%s~Yw4F5+cAz#Fje6PFvlg$X) zuhkvR&R`0k6t~Y5FwvruN6Qyx@Xw*GX{cqN^|Sg=#)h>Yy#mz_Y_RE|DH=M|n>VLN zE>qTSZIg~d1bDqwDZW+gX+DPmDF%A>G{x8T@T;nsD$08@CUs2xj{ecL?h{w7TF0-M z9T|{x-^=5-JGbu4-qt5WP*7PkdV44Q>uLMe!a>G}eLj{-%qS&g0lEW+F7%OL0Q($4 zG&p)Ir)}GL#_W~V44>bOZgcwbm{l$A7zrJBsTGHJV?WDIeq}QcQxr`-{e7*{swB;i z$zS=GjV@k&UrV*>jSf{|`I#wIFtVkGUv?yJSg~p@K!E)CbUu)jF1CIqpgMS23)x~Z z>%XB=zca8wi*{pZ0|qde=$0V}v68G$E;dsWxu_;a++I?o60#|HY|uee~y zI~nH7x7};EPxbU{RN7&=V+u3f!xHFn-%-~S?!A8Jc5Cy(^&=`WRB~(*S;M^L6Y#~u zW5b^bZF(QYoOCtS`d6G%xdaVRWF?fpo&xy^hMI8e?Bk`a5!(oe2h;4hnXM=jBhI!*Ag_w1i>2IE5M%?=^n zf&wVM8zF^O4LnS1TS*}Ea)yQ+hKDY99;oahnawJE_&_Ar8fcuqc0cbp4?#zHH}T`= z>=<@K3Zo}c`y>eAG!qnUp$avrt;pTM``4dmIGn9&J9uPtxbTB+&H1M#x!&(`sTu99 zz`WEtJR_Du_i`Pl<)ST&J4X^ERA|SYGu+Z?MBOS!T(I>K?g!r<^I6HW<{+A!X&5~W za)Z232m#X>vnGd3S=S%3kGa}(L5fD*5?>#nKl(v}c6P_AjE^`F(8uE@4<=Ljc%mYi3Ez4kNE1C36k4ET+V#L6 zq7of;Ex+ZWBahC-Gq+V*)AHK%a) z_C(gU%EYZLU4|d;=g9V3H7`HL3e?j!HlkkS4r3ou-2$quEzt}#hez9ZU_dRpL06n z1#_d;1_w+9<^}<%RggSs9)~A7Xu*vZ$L|I!)rjdGLA?+*Bn54egri7j4w&clKo`8X zL!1dNrq*0!GPx5=V|F%EEMAHkh$eevuh3OaJcmuiZa?_;>|X7#stjF0?A1mpIv53u zumJYeKi@y~*lOO?oISCVrWN~$h;E6+f`nmdXx^FRc4&{?*-1sk)ET=t)F}pY_kZOT=x3YMKX*x|Vkz zn{!$W>FT(mG*5+=w|tcuY4-RQNqH}rqUEHw3&h~4K<85k! 
z^ZG>lFBSUI25sb~Y(f_#O&r~Ip=^`l{B8Rw_fh7X^)asFQLdLGFQ`P*yu37D{=FE0 zE(r@_9>!VlMR3MH`&bs$O?KjmD%C(FImetoyN-Yji$H#g!a6}4n ze~0R&(O2X%Uz;bRGF6}6TsE(NX`K(nK(z-Q%!zlD?y5g-i0XZcbiw=v{dQHIetfD{ z7(NaL6Sted!_+C7?^QMW@l#^R%v`XEC#V}}{d7T}8ClWZiOWB2%U*+hM5l3%+^A{v zQa|@dF<6qC)?Uhc=}D4rxrH#Eqv?|?b3s67Pc!<;xV+--a>4MAo~=@zqyh!G)>YQN zj&1qLrezk=d-R=QPl!De@4rzp^E$475q_a%Re=0C2lPn2YYNkBfflrRu&J_ zgZ2wdL1nAmWtOJqagdZ{9ez_*t}W#kazql{&#dp6oxU26dC z^<@wkQRkw-?Xs>&PMmqCWXTR{51ue}(8*hF{aE7K_%%f-x45}!8z>so+3g5emeYNv z6vIa$39&7uS~P0Unu^>|t57YdrZJBzMF-dF%M5&ElUcVSpX?SMwTU}&i(&rADxVfZ zz4-7h5tB$J-kQ9i6UKY{;r^I5kDbkQFS>o_hZPXXi^nlqA%5=M_$)jI=z{A{J2VDKfCsgifimfg65(oklv8D$_-=! z3unWd?XCLHFovkms>`Y>D~3^?t=*~`EARPAfQ3DDf4!NAgPfzR(fM>5r+gug!?j*> z(M_c;?Ff8O3OX6@H)$2*JU9?3*C#$^C&QoHX>oWb*kIDSOrs>m99?*_5j4e_A@?&6 z-o7O=oOd@@a#__&5ZLDO#*c6`@-80-6VW1G$~DWvo`~4wc2$3Tq8GrD)muUV)5&4tMfheqI}i#X^%011)c3=}&Ctunz_Y!ap5En&M(-Bkm1hmRaB`S!dCec4 z*i1W4jjHV5y@k%(!a1x`!&pde>}}ok)B0f)Ax+Md zOuGZ`(nq8|=^xlmyOd3=W$is?;Lcam?lt$*t`*W0-b7po=%y`N<|86~@NyW{d}Fle z{D<0eakrIU9KJ(Sid_f|7-enzgs{!C!nv&Pk&?&g`ij}*H5l3t8;9G~4KivxCJett zN`QC{R+-qXHcB6L1B!0GTc*OR7W*u4P#bqcvk%>%8i*VPhPhHto{D3y$wb+Kb>L31 zAbxKwnUTpjP_7Zw>q(AyRVHOBr9&{6Rr-j)n@TDxz1)sPxEr`k2djhQWJ|25}=Q_p{Q!n{};NJ~>N(%bEUUq_| zQ1@vV=YKmV2=!H}TPM<4N5?w)Z^@ldHu0p)V9zV0B2{xU5g3Zf&_D;PigDO4CFp$%Y}BEb^^ zmZ1dpcy25^t0p^>aN!p3{>9t7NxiKUNUZDczBf(v)(JMTa?A-e^y9h{j>k?FNwPOg z)pNhg^7V8bsM#ZJxNC?J@@mj8=jn@(ql7j;v2GmUVrEa(N>_RVjeK8!xrTlb19W}% ztB7rdcKPCQs!U^}NjgVyBSk+nON7gzqTu>NQY9y(dd#a3Egd*fl2J6s$H9=P+n)pq ztIOrJ<_{=8e4IfOCuARe%P1y!FA3@S?nO$bc_R_mP=AboVz3N8L1ChqXe7@@|rpj>VPI*PLL__4q9G^A3m$s&7e~;_M+seA@Xts zF|u^$%Ay)P`)%6`>d2eL=40aW)oON!Q{;oCfuzmACr zCt&30aTXQxzVY`jiLP#xjDgmKQpdG&^u*WQ4$SN8%YH>&sP?j|3tFL9T1kXIl29gMxHSRQCi73keM?}|r|?>W`Z5KoLcm?JVnX;<${5}o*r*nEi|=|I^t4&A zKp;6os5jXf#}Hq0-pF>lCz8raL(0Y0pk7DSBItH}Y&+@qVsr|;OS~8UnsS|qRYRy8 zNx+_FZl+%*pMwp`P?sS{)NGVtXH;KYy@L8*WBgm2@2Ou^ZPsd#F*8E$boB@H*(3&3 zqfxr8(R{Jso*lNmnVj?vYI)b0HHku;5Z|w(!CFcFx2aeU^Uqmv=~_7h%oMgx+pS$~ 
z@1@l-u1r2+Ctjv&9Hb#L?1r_?c)TYC@33{2aY#6MPsy#;cC&H(;>~ZRF)W))?wgy9 zu!~IPR6;;1Tw&@A@?&=2; zi<d!aGCq>CDF8VTmaGFu{g3nCOYFJ9i1wv*!Pw$lXCRgIuy5$lc? z_zL}%t)^6}`z3F`rE=OsA9K$ZFja*md8f6&Ln?YF&h5ftmPqV`!@USE5Q zYsdgB4=GD_PlLhEhyILYZx+9hq-!$Z=PYKzZ2xgk#@R8YbAFel%165w!Mdl0pSOHuD=I0H+VhV&`}77G zk&yIg;hF&kh`BI!t4HO1nj4pi?edLax_3Wgm$)%M+hP=LsUo}P7E#J88 z%-nnA)-eE+N?~8>GggsKRu;H^t+F$w11oYKL^<=1Q2?Pk#ZK%W2tbEkE=L2LcS$-26X$I zBzu5)3Desj-tR$?6Jg;tY*QNUsQlL1NosQ9(pDOF=;-d64?JSqs4(7rwwA|dhjxqj_u&7WpnLJ00Ugud09EW0cp~ zp0c~;y0_N#LQ~zy7k`k5C1+_ip!em&)04nZXySD--m&GN{9J)GZd_2J5WSa-W*lEfWKLTkX7?bn06)h=qLo zJx5T=i6NS!CxadN*=ln9@mJNkA`V#jLkGukv-$9WVCaXn61z%!q=u9T7>5MJLdIRq zNt~+a{uKtv(JojCesAUattoXkl5Z24f?!D!w`AWGe}0bB^y38Y-MbUi$>2tTPe|vE z3%g#hU9uI{4V?Fw7IQrD?zR7A0bHNdJLZ$-`ODlKgL9yxI^c2^65bs^?m-*FhH0=B zA)LxWl|xKxlV^-Wr=N?DgF&R~1I&RJb| zJSp7hFiNcKY_JrJIF?>eC;w-8x&V6+i;Hw(PdO}Le(Dw7M1C=0L_~{=v3=`cCM{=d zAG@`po}EB)k6)bZP(ZjCia~wkWu{a!Cg&BBqEuYmlJ-k*U`SRn`9k8o_*IuAi1W|F zvZ>{P>H@+?Q)^0o&k3@e5!3DN$MCA*rm2>{rIdw^7a)=+mi8SiVJI#rt-f0EGa!VD zG~f=CJ2|dMQ4I||kTkxpdndTK+naJL2E$j^Uot|N=!V{)(>}udC)aQx>PO0rjie3+1?6BJi=*Wy@rO>0%B7DR_f2xP(~8Vj z4>^C7@$MA7=ASdZ+gu5DIAYGVwRk*o@z=R8`TEzlL--OPL?5jw*OcDC8SyF8$JU_h zD+NVLOm*z>1i<)eLo*q4AxwWDIo7r;LZds`GwZPIi!M0e@oC{05 z@b@G!NEqwPRnYrsB8SonC^Tc_N2@w%B{2~O$R$r5&+%SmKaR3Tc*qW#FR1Lk5)=o< zWR%=xWriqz%icNt&-YVLScN%t8^Xua9M{*U->=C}d04X7s?X0CU&1zxDNK9A@XJ@+ zrO=bPD|SiIH+%`g_oW+#C(ef)qZ9V4IK7@V-ATUvIWQa3Ouq@36BAfQYH0S(oiD{- zyZZd3$1wM+ifFQ&J@yx|`Lh*8WlGgktK>bVk!3so;CXyo^0K1;Yymi69hxUN%xkj}g+XR`@%9s5gQ*Zy zT+a*>cUiRfBvX3B4s-uOVhtZ~6J%9&!a}<0DyYEJ*Ge?&w2NZ?OpNb!cbQ$4`X^#g zAKypss5wd(mObv`c^Ol*hkJaPl3p8VValD>LJ&>%N&}o;M+Ppgx^-8sur>-50}F7w2(k{N|3g*C^$ogmJ8oU`w{YxY(71q zdYER&`}E-fcq0@lsNQg>TeM=PfcS38)bUxRoK$Q#*jaLM6mH{Ptk8rm&*xagM!dk? 
zjJ3+9pMAGEsqDe>YQ0rSM98;jYMjk2GKb-AN{h8?3e-Q$%Tqq-%8eWC{0QfOP9OZj z+4bX#hw{i%23{eK!%ogYk~ro0bq4Ckiio>T!%Zj=%kX_a+TSqKD7B4kIo->Y| zIwXUTC90r!4IGvJ{}MjOc+sQMA-VfI+$zO_H)gU-xkqd3|GHjbCS6L6jKS zp~w?+n6$SZG3H()WBC)dl59ncZ$-ze(h4Du@WbH$Q1qwayRk$VmB!Ts290LUpW9G8 zG9~o*<3RNB`?6iO6yZ^>ohWj}M99t&B25HlSK%dHXv5}!3Ff&Pt+(oHhSY1@4r3BM zt?!P{^|$6TKe+7t*>L!80lw-H^L&>10@iFWpLE+p*;I770{f-v&;-gLW9;ru z{q^Uv+DAJ&2?*G$cPv3KmyPY$F|=yAtG9NC!><1Bh6s6wzlZk2m4!P@lZziG=fLUW zS&pTPlm_eFX9?{$sMf+PmaX{PaHXSzs;wTqPu+pVgx~L2qw0UQbJ^zZ#7RF#NmgC* z{?SXZ|ACQQeUvb{y@EV&R>SIn=pAELVMI+B8m|I4DnHQtt9qj_1oZar2-iB!hkN@% zd=)4_Meys&^IT{pL(+)}V%+uO8-6-`RodQ(W^^^;*m;c zM*CiLN6k)9>#=IP3{2f86AuX?3I?)#P`sYO>WkcH%a+u?dIHjgDByGk%HH%IJmt&( zEaHP8!yJ=vO)rG>cylD2|8;3CsdN3AY1}9=WV;Q*K(hWBYa>UL%lm_l%&KIeBRYa_ z(wTWz!`;>U#czqipzaqI>593YM7ojY1b2O|ng6 zU?3^4@B38Q92iuc3LnWlHzJSSe%SxiX^`4U@HHMl>mW|UY2Snnt^Vx0j$T%11`nNR z5puV>X3o`;?zc#jynK^~{ATDuff22O5$&`yiDd&*Zv+Z3&2f z%{Y!<3o@E7nLLJ&yszO_x#3pXD@_zS?cG=-)0Os9@?w;{q>YLC4eZB4oX1v2ks)1i z?jxkc??}F0?RMLx^=8~G4MeFV`R-P!MLkn|{o(q=>QISuh?~Ul=de_b3Z7bz%DcSmu=KNGw!@cR3dm*wa^g;%Lc7;RWhcC8`x>3(DJu2b))AKc8;fH zgQt7!HU6WF>ZppL0H4(?ZNL_9>W2gs6;%^uSwpOT9zYH3)U9LR9UN3=p!j?j5oV5C zlLB8-Pr+%sY9z>Myu^y!g4cEO$q_CXVMb-oagEf}ku2g_a;V_#<S1Q*#JlMm~X$jq5jFF|kdp7L`wo(i)-zHq@vOT~%poP`4uD zmLpD-sAcsYns~}elxotm;~)o=NJsU z0ad(L|2kzquIK~nKW*hefU5HUpkfhAasc`%Z?-WWJ4BG!^x0V2X_Gw*#klg?xhWqs z^%3^KW6Ljld^r_i<==0gC-X&Y0x_{ zGpBI4y70gv&)De2yz zD2E%?=g-nksL6J!X*tTP#Gd!Gik-QDM%_b`WUfF*u5?iMC_ncE3z1G4cY6Z|>f>pR zWoJp~cRnRR*Jb)SGM4?w!~8U17qEKT%$BDHcvY%75;cxPp<$07HwB<4sAJudkA!@u zhBHOi!7xT$ZI!7bVaz_+#@h4ou9io_c=Syv5KksV$%-hwoIl3DOwBq4GT?#_KHIyq zZ*i#Zp{tr^H?#QI2c-|HC;Bht+)lHC0!Is94Nwo(FDK*%g zcL|R2GZj5(OL{BErp@eoB5Q4`FVn z&E#cf(WAhsrOjDW-(!D-*_0@t^WMhPZ-}Nx3nR_>$~1n>tB=$ z&-%4N)M4^pc_*9Si z+9|Q2;?VH~K-gOw(#B+3V{K$pd4@dzYX@n6oL(WgyW#)Bt{`~f|04UX?FsLZ%TN4j zwK3v}n4m}z1+I^OK^wvnPgAVR7eY@i%Y4I<8@nRyq7F1+7!E>yEc=rBb7*hrhskiF z;WAeS0yB-oxYg!DfA=Shu1>fL+-p2lY_;eS-N+QAyljMX{cFLpdBJa7do5DdB$g{D 
z({PnuA|>^nN47H~YqJ81G$+1MUKaBM?inYIjfVsee`J5Gb#6w0ilQqA@Wi_IoJim{ zh%tKUn_XJk7qPr42J-s;A9Ic%0`Lbj`GCG7E*+*PX-RX0)d03~;#-5+SjlRV&UG?0 z^cec`lj$hgc>YQy%ue8YlrTqo32OU|?R8 zT{}Wa%I{bV`ZHEDXP7=O86koFMFG@H6XNh{b2{imF^;{3{#bBag zRM47C2eiGDs6sfLlV0yda@oW5D_MAkz(KP;psj>mS^K)s%F2bgPP3ba;QoIsM%g*6|(QWX16YKFnp^Z3Sw_}xo(Ne;<0 zKiD@$N&8Cz-mD3w)zjZtg$M)1uXq;Ba^$9S??Xv4Q{H4&J2NWmm3pitz-&(B!nc5% z+#Ah=**xsQW;Ij>sDiiDxVvD_(N515ZSQ!VqvVPc**k{aKXQrk}0PuRD^^dO^YUHkg7GUG>lbvm_GIvd%Zc?Fmq+a3klD>VXr^Jg`z7b!} zv@UD%b4cCzyMU1GG6V`il;bo?Q)6I)YL=Ty(#LHc=K;zT7zcJI{6A@CwY_|HROT_X ztP(kx?-~CST(;oIT$WyCxzPMrH@`#wfg124%ojDcUALt5OdbN_ zTyYwyy~lZr>u$tbPRsk^lb*#KKb>AFs+=9W`)XQ61^!%D*Rfn*xC*R4b4KF= zTwi&Gsn!?X|IR5bK=XnKQbPYz_TC+Ue5WE9P?Mz3c(r#i+=TK4*GIAb(dhx=a&eJp^!@T`<`zNTb}0f{`qi`GSDXbB^Nt?)<#nUhNX{e2>y zr~1WBZ8KBLLj80=nxILL^W{>{GA4U8Env3E^(iaor7h^4SNJ#ih=o|a`xO?P$*mPN z6V5$4Z+0hmWLtz2&S5BFC)xE3q{P63Ki0-Z)_#uf;yKm#@$)*l*&YL=7E0?|?KAg} zbOlwN!sJ2?v12O)A#y97+&^2Nzqih-dr3BKK1t}fqL3eyLv3(xO;;BC_O_37Ixdtn zRo-pfZ+eh`@?W*76Ufu<;e}AFE4bE+aGN%Yj@l$Hw`EBWIOp0Vopj$HeuAPEk)MQ( zcjS0$)36Fc{xyEfR{=-!voem73WWg>peVv5RR3U{0@Tk!q~LpP5om(XoB_^fB4wi;mlqIreo>j{KkUJqb$Ko20rt|OB zi&3%W)5oCK+;I|^xbVY1Rkza@&QSjE1`I6PZ~!%B`6GXUo;XfnoqRnk!1`Ghjxu+o znHWS7U=n_tCCYVC1ge;fO)(UF|66Py8%i@;|6q2?1#Y zQ8^$5*pFfC@#bcwU0zI=eK(SkI?6}|lF5$Dcx#}X|D+W}F-mEd;7nB1!LE2tvV)^v zcGaNsOyUekW%Onl?si>% zlf3fswzc(u`M~|QRbxr+CJQAT77+Tw8NtJ!ECUK$Ro|9BGmHpv1_mE zCT4ID@rRqDh!AI7(9)!{Y9KyG#;bHxB%hRk=he!;C{XgFBId(zl;SSgM_wA1EgiX& zJ7#;OFEzU#Ocwy%j`H$1bpCfGJI9pIly6zo!{Dj)BVNJjj z8@$FgB+IMtQy+?%cSX&RO9SfjJtnnV3<}o>Q#T}Oe>7?;jY)TqP2(3O*|`4I;*+F< zR2`0;JULTGFk2D_5fA`1E*>V^z@UIUu*mWiYX3q8^(s#j>i6$idBNdq{ zR$=4Y^RHNVi2uSDU{8aWi&aP(qjsSzX z4BJ7NVs?18z2Aq_--oQVOy0|wHMLkeI(D{RhPgh={LRCCq~V45hA6tej_lsuD+HG* zdk*~@9ZMMA4J_*r@Tgoh&9CYRmvl5bLA8nAW8_TPRWpJ48-lxE@<*eRq2W)JL?i2v z`nS)(djBfqFL(t0rLKfjE8alnGWZI8XQ|j;lGc_p!+7kJ`Ci!!y$wiQzr1Z}>MSjv zoS!3Su?gjzw=2zdP9Fp(p{Sn~?6dDo`WM7^f9?-m^qj<*68Gsl)TSI#RxOE&A01ar 
z(mZ{Puj9)>ggP0i$+i>N4xq?6X%FsEZwXLZm_?9aLb|?-Pk0uUs$R%&x%3toe{x>i~CN6 zcuGYsl*bG!2{2kw_%a`?P$?FE?<`efRnq@9E_-Bx@crcSMDBW%sI~&*5VDPcc|S^~ z@qKU4i0kQs+D>gL*q@#12{s(D)9(0ka<=Vf$iAYsqGYU1&sJMd$Isyr^GL-!|J>OO zctcjgB#JpR$#qWnzZUR&c_pB=j;<8_Sh5>_i zBB(X5gZO|*GwqP-NpWFotrW_k*^D+Z+=Fe)9xhY4jvdJ~$-OGDp6t0_6fU{C_&=0U z?W?*o*1|Si$r>*5MNMTrAjeu=02u`Ry<1@{BJ^_)Dh4yX$ynu2!&(AjOqPYznd{~T zI<5n4Gii;eqO0}J*HuB!OEtvhhHNt>*hxSON@-Yb~IKP!~!W=?(WK+g`?!lyl z7xY41??j(m^V0XUd%S)UoN#pwtJqWE1~Ko#ytjGt^FU$V`>9?AKz+Qd*d zY+?Do#XYK%|5;)7rqr+l;6e*wh(324)S=pRq*2VFuvZK2!ti*cu1 zqr-YbC(LjcW0@=*(||eDTu8YQLjUJSMfKO?|r{LGV0C?(Qo;ccR*yVurjs@pnqn=hx=UYjJFb0 z+!b??h7$ClOA$-FCX~Y4(l}0mmW7=kDWXx*gI(B#BGIL!DHxUF@2D+t9_#zlrYSwUo-VpBO|x`jX$~~eGSOTsM|qTKxI}%H@Y`Dxq;4>)LP3H? z_a^U>pNm0Oxh9{I9Py!Z7N)`iMFe5_pH?OBrY&43F?}hV55ErF!t+h+JLKW~3$NG@ zuxfsHtnX{qOfws45$~?jR=oCi^W>Gy8JaMq`x=!LbM$*fl$0Hy{6#%y){7b96K#(`vElMo~$t^T0*l zi;!F-vciY#%*|K6p4BEJf{hDXihGv8?2@7w6iYI5r_E>NwR& zLpw@glR;z{p$-FcA95_I+CfTOo`?7tnt-R`^g%s^vLWVO`c1YWp4*zUFZ0>I2}`T7 zxTJPGD1JQv;93WnF0g}}N74>r2ziZj9NwrmU2I?)S3@_Dq1kGdCnKJJ2*huE>u8oA zK)$gaEHnDKtk8^7BRK6)_+$xqV`s5N{_Y1n%1zha;g|N!4q24+Ak!oGA&!5|?!;j( zUbT1)SGO}@fPS8V4crjP6YZ@j^MU7@T(3$=09( zf*IOVHdx2wuCnkdCm7U-vb?xNLEl&hp? zED%3EuF0py;+|>RQ(b9mM}vRK=#TprfacCLKWhJd=DgmI`B!y6a9%ghxkUU{A0Sh! z3xU(hl$g6ahLjQ}ie(+%7hN{3TE&g^qk+)N*B5;Z_I<5-pGoWTyhY*Coe^4BX46UZ zX@#8W>52XEXlQA@T8uj;;(81)H(*xI;6VaLmO8|D0vJW2-%gT*w_9H3=CTI)AksW{ z{-5?-oHI$2`(BVi;Gu<>&jUAYjpeWEAxpUhF!qb)M;i%EN@xV;u%@VA3e(w}`eGBp1=LL&jc(1Hpa*LbsT?eF{NZ#Zi~x-*nYAgR>~ z5Q*3;gf+1o{%o-5ET`c%T1@IH8Q;qw!$KybW*{*}E&pV??4Q=P z6=AxF!Fz^%y^CXiZ><^4gL`k_H$FJDCu#nyjh?BK9|Q0i6J+rjzj!aw9#NkNZ?{FB zf-G7m8pjun(I9l4Nzq-~X=dRpZGW$#=tiA6}7HF=? 
ziffdnK9~||;?z$^;r{!Z!rY%$8x=;N%Mo5PGT?ewkEy0vF3{?-H5Z?lRF*FVa79HJ z7p3#e%kAJVzga6yJ!R!f5B=aKb=m?0HXUv&wVC?7v~K;d$w_P(aD*KHtG`2Xon(1^ zWe`?W^3M+yuD|wy2+?ct5z*2Uzkvk&la_^M?|v8`pE{c23hHWAjb{rpq562tzY(!c z~simbrE*E(5kS@r`4Qey-3}k&+xK z_&H)z6n|(ueZ4zcb2mdAavzBpT!5{y;I=~uNxT*F0vT$6^@6=@I=A1nzmV`s9*@DT zghx^YICb>sEhjJ ztEzS-hfbt%=#Tgoc}2Bu42$BPyPzMi%7Oh97aU<|jDkg~^2$44fzCZOuR0p+>x|Xr zMX_l@h7|XmKrTINQqr_^z@P)ZJkN3MBq-?s_*k8C(memQk*;@sT(IlM5FUx$Bb=h7 z-_x##@?Q!wrp108oyf@RE_am9fx)7ug;*SGG|XRqG#cfuHnh!Rl8RPhQT_iOtnxZB zD>O42&lP{0eUO3sO^%N!=3Vm&uyDa_2T}=P|H?x(6i3$IoQDe2@}zW3KI%U zC*uw(vj))TfEl3@3e#;g~yM&1QMH3RK*8%2^?Q~TlDDsUKdlkj@Ke4(K>GK*->yMB{g0Z#U^Wz{}m}k)! zxR*Iu%796bVUhLZqV9$C${JMY#A63ev-}s09BpGZMk(wE3<+OAwNAxx1byCiCyPu3 zNUbLymns3@v3hpgkpn_z8oo2pi#2ed7mM=!c4V?EcD@tmNNWc$g!<7t*I9>qmQOCW zqJ?E3_jKrd>*Z=p)=s6yiYGq^e$?gGk`4nV*`Aw&UstR^UW~!l6C^)_h4NY8oBB`% zC%bR{_Yp4|X(2{^bGua!W6-AOGYFg?f#p6Ao&M z_;{{fhO`ZY$ zu~UmhpKG7@1eDOu}OV9!#`j)sb7&b&$T~okoI}Qp!leQrw?Ky zdDR1)qrULb<}E0Z)Rx>pb36vZ%fdvS7QFP8U9k=D^3wfpKq}^8%++6Kdb3EbWyd2* zcZ8IEa{4*wsVrE}qWZ0NE?83t*e!q~WEk>If@E2o;!f3t<*gq5IHB+^3}tUY!}I*_ ztLw}jYa@k-l5|wc?=W`p|9alMQ!%e6c*c+dkWsBb*ZL5@=+1{~y<#dWGd0k>jA0A( zh7~=Um^kf!(?AeSKqqE1P!U%e_9&dTTdgD(AL3+8mHwakBXani__hAcd)l%cq^p5n87#atP68TEC**6K zx_iu8c{O&0R#v~&CRr`dfi)OwW*I&E{?4`jCNWJ_LKf|&uqmWbUMDLE$`FWI+egW9)3!zxj2~=>+C?#EgRHP z5RXXbf+!Lpj_)cD$URf%VY;8-tvQ1X(vOZ7qomOol9slY3W$Tc>!v-fr=0|We060f z*H=p(EVdR8CAb86*XEyIPxXsq(L(?N-RMa`QhLk%_YMqc_ygFaNeD1{KHTKeFCezh zHw<5hyd9}RRS|E>iRJ9S5S+q0YPd6ps0~r1{L5JXp~E*FJFO5_KdGN?J%4x@^)2@E z?CsDpnG=ct!6>s_g83raB?kwZ^^7+>ooRQ5#vftDSBUFSiGE3}0vz#&`*?&y+yz$m z|8gq@k$x0!8tzKp^ZpmNV0aaGHML}Lq~yo|*JW=Q^Bdz*EK0V52!2umtbrPo%@>VE z6LWB}xKlmluL(b%L(4${>7_zeef%q1+g~&F ze3-jZ#buhok;di`w}fs~_Cm@j==eIk_FG}|fp!(G>8_kD`~(>@=nV+KAJNkT)GPAL z-&Kd^FAF`Jyz9lSIHmfXH0oa0Nh%mjn!}3oyQ3#}Z0YV}&JT}uYT0_t0><4$T9bz` zmki?`-L3zXWUyXqysE*miZ&kO?L2ac`J577?&4u=_+^rcN$;3WWKPDgE+PL1I4v%)FR>V!7Hw zEHFnOV}}Mk^-MH2y#L#`)XC#t2h6R~nry8n4tc6oTK;#m|HuRorW#BR7JAq)aN)!g 
zGb>-IBa~&d;#520nlJLx=&rp0>Ud~Gm+T0IlmH>d?~WTV>E&dwsw1iRlfA+QD;FU3 zZmNCP5_j~Cx%Kizb4T`JDBcg@ii}Vvt09J8*8S%&RX!rJb!U!XVGgLSTN#^!(OhDX z`@*BKl85f3eZ0zqZ?ah$$R`}IxZ|oFoKqsD?|H1V6Jx+5N;Sx7rNK&2VcK>A^Mp*| zDPDO!$w2~)ec8L4{F#79qU&ZOe=R6xmW2W`KWnZ&UcVjpO)#}xpUlj+_5rDQq{ToO z%kQw6Hx#jC&hXCE2`>JAq(;Xr%>WTpIyAk878M}s_r^g6F}{aq@Jf)83HGhqVI znpUL-mvYs@w|MT9BvfKzzB;98W@#3^bJuFi(R5KbMZk1vuA(8J8&>Pg4XL0j?DJf$ zUybif<_l-?9*@#_K6b_s_}!7fS*k~@AK1l`#9@XR;Hj|#hkbw&e?3z6kh{#DxJ+EG zmI0sRi8~EiMp#G2bwA;EuWy!}`UUw;Yr8QswS`ZruQLvKBx#@2x7n&Skha;xlY7n~ zOkL_taB&Dv2J zM@k#&bm*x7phspO(|()RFCOAU~R=FOSXrarObmZP+_z|C5T64*sPx z`YZ?rs)-F6vr8FPcMQ(j^B_2;^D{|R2Nc5YEv+JfgE`8(l@PAqW@eZyFvrHZR|@S0 zGKGv$bf-s{T?w$dg7GxFyEo_ECoz!gZo(G~JMSfrl(^a5;dWLilrlR;NVo@#Rj&=mA2A?^_8@wD&7>YFCk;{| zzA4~@TP(7p$Fjara{+y(dq=|kNS0F^4J9Jau1MwA%Htzod~opY!+pt@X51hsN)ixe zR}3Xm>qX=ygP=T;_)Pi77%GY_=PM+CJc3|ZGo7^5I(MvNwY{q6nW9Y^i4lAVY}bHV zeE9tcv-bsZQc1hk-sGUKa+MTWclA1NRb$qDxMe8LeDr*NKEHz?`iNu(aT<>tm-|HB zJoBZsV*2xj`&d9Xw5`N7b=3`|sJgP{aM(@>apa5Dj9^|xDmpzH2EyO31seORA2fM~ZN_@EW^lT5q&oH6T>?AVPWdp|r|=8o{XE5K2eqY4n_%nZ3DF z1@lB{UB*Q zBYxkbR{hbZ)88F?Ry2OCunSy7td3EJG;`2G+Lwnb;5s&A`slU(@0Ft#fcz3PE!T#=?3K-L1~ zX)d;4ln$(Pa`WyH0Cfh>bW@&{DP#uM0;D+p@V9Bq&S|h(9y)W$5;b^3GnZ;53LsEK z08Q({yY%+3Y6k@59f!4yWxt!H+0yKz$|8pZiddA#@w~9w46q??x9Fq`=_@j{1fLsE zZ18g)JH=XO;JG_Txxq`>R4Mf{+7`b_#?GtmDS>F=)i*Pbb`|>;1KuZTih7L&`xj{ zYD4?|4=x@ey(DXn^y5Y1>OT2R;Rt0Ojz|G76N3QQ1Q&b;_ff>!vxSPk`a4_L|yJ8?8USavn2WRugH zB1~9HCaUWFi=s{(j*w6{Tk@>m6ee+}q)8_HVswOMoqs<5QRozge7FB`4e83V1? z%Mwf?(Sg|?A|!tmSpRMI&|Fe!vY zb7;{`nvec>KVf7d6jU*(h$8H%3*WHNaO9*GXIWN49vmDrGKrtj!b)psPC&GfKT00gZ7F%z%K~BlwB;b=XpH=cLd{~l^?EnMsqtI>4 zbJ&~fcArmC2<{u9KL6d_m(NTLSYhe0&0ht@bh-LESVoX5A|wt!*yo~G%wdhs5e|?) zSKkY1{AA8rLi7Loo=eG!6e3GHJ5AT=lH4w|Lr{OXaWCAE83?PT9bi zrytHSbv=rmk6*3&b0DC30Kky*=6475Mqo6$u%@xeeTa}O;=@sk&S9vu8yW@?0^p29 z8Da3JAo4j2=v;_shk1vL~%R`<+v30tR$#q>q5DaLa1WK#RY? 
zs+kn{$yPsi)zUq0&n^!7h5v4wF{m?AK@rj}FIvG&Ay{KnBzcKBsoVK1C3!#zsO6K3B+ilT~s~3f%aG50P zl1G!_+x1oWq?q$R=01^~Wg!rSWSOcNnA z309V-G_T@)?k5jCuM1rHBb#x3nr@=*n$jpmYbT3}eZW?8G>p>X(;R3>sZRBsAV4Ni z2y(k?>J52zr_95d(B$yYIqxA=Uy(yt9?*wELmDQ9tq*tfTG zTbxOOz=(C1J8>_vmLnVhHFjS&NsxMJa4qPI` zn5`)yTVGM0_|5mF4ADZ0L)-bsVg>0l?kP96Y7|Xu4tWpAT1XKchi9=U0{8Pwu!gGZ zu$@*}Yp$s=_7VR3iCK=28b1B_JRw}9PZjWiy&s=7zhIz4lHxc#qD7@SlZ4bL+CI-u zwMd)V+V`=PDG!jAsl`uR?+cpZeYrppHBtO#8b$+mT`f){Aif6i_uo9}5lo1e0njyg zYGc8pGupmFUH?;=``4iF;T33AgDLk^3R$J0@6#f4z~@|Srz`es_&mNhCqH@jNci}L zd<1}*hd4>boO}kk);|piD5eQ@ayLUY-g6|Ab+P#oT_|hBKcWM**jVii#@z%cY70ntl4Uz= zaWH=<&GlIbU9AWlZU}h`35&1B3X1XDgyD>Ojh_4}^}JN!GV%_HKOvrL`fAzET3R`Q zwuxLw5}++x$Jn>(UclWJ6ESwh?%2dg4qL;T@2)m+#@c(*m)I{AdY+~0=bqY%OLhOe z+mrUUBE-RwUgqG6-vjFK>Ze=-br6UJ)BDYU(^Hkv;ldB`LVFoOPf?UG~Q9&({S zo^-RvZ?x;+FVrkIKRl$_E+1c|ZZquOQcmGr3VgqNCC9Ec+Ne&;@`YfFy-}}G@;0HA zT}Eh;F(!N|w$LR%5fQg2`U_M>fRRCU$?x1;D(~&Q0j}@RZ@>lO_$z{xu-9SYn@O+%j|Sj zqT>rW_?*b@u!lo7TMN=8YxbFdD2L;(dYk@8cdw=F$Xy-(Zu2 zxM0H`H6lhQj=)CXFBpZa7Ah`O=}cUw%VxM$<(LR1!;41Twc=EMcnHL~=MCFE zK0&7gBQ_u1c+p+j#ZKp~?hVTOQaqDc-I!9Yn0;-39x$a7&++b`(!($fO1h3QhhrJ5 zdr|@`BCB@7>eSK-|Cjm86>wn4*QoyvTKT_s87hTjt{CVX2#ggD-D1;b*QVED%-6kr zi|}Lw8Fs*dgn-9;%TYgxVb$>6>zyi_v_&{G2l9dA z9L-}#E()3a4?BA&4h%m=kfD?tM%Nh?>Udewx0%Vg^`Jv)kdrkN@jhr>?$bJB!@z$u zvHb`AK>Qz<-{a!OFpC z=KeU$8WUxo5He91B9^y8P5Ac}yD%H{Dj=o3?RTFN)a|1y0DE*{KRI+V#x`<(C9j;= z(*n*IJ)l)wj|rz7qQ!K2hy%7Lo3_t5aS-M0O!mx@*NbrS7cTc<`i|$^NHrZ8y3N%D zeImKfKwlv{eou@bvvLI+J(5=zt*>cXw2=fWQcyAf&Kct+@g#EK|1Q0k8WDyk>5oAc zW7DjMu^w=k-&v4^^NWoldq?!T&TKJ&vB424QPW>-Ql--J3rCh&V^^euZ~>Px0h8D^ za{sMut-r>^S#TCANlsKlmM8_TC%&@HhtW<;MTn5(%!Tw%XV`R4<^{=t2$~C?p1lMU zi2_O_AZHt)qR_coV!w;BCv5OX%R@!3J;S>#Tha9?`;or{dJ`RJ^F>zu>VVXPSv2r6 zgorZh`B#0dwsGY7y89w*W$eZ;ab5S=d2ZWE20Z#izp)ky>0i$AWYN?^QuEPXmSfb6 zaM_Ie7ij=l<%~Xor6n7Z9(J;WWoZv{!kTe~p*q;!U)zd|d-RRy>M%rw3qIfFd(m^H z97JEOv=$$^eDc%^A-3`e_>@+6vhthfQclC%BQW`jt}Ly-fC>8t^F$+Q7c%y_^rHa= z35cL1jh(xSbHGO&B_ZMd6G!g}!z~!^A`x4%rS7BD`nJUq`BUa|b3R5Sgl8b?XNvjZ 
zTdlkGa7Q(%;Z;B&V}+_e+0zgVVBFbv4M-;R`63(-9$n=77kg%6wi+F*QY|vxLRnQ< zz#e$)7mgR(L-XZZ9N%HzXu2XIOY`%h_bRD}UA&#eTB)zsRTfDi-Q~@Pv@|;hjc_aH z3@f#wbCVXis1EA_j0>BfR3lRPIgH z^khspV7*t_l^bWkt*zttd7~G<%+FTglrQ->r}fqEJ+vtF4?gnA2wtw7ylmI?R-;e9P-)!T#E?^0ypA^-uw@Da=6ZDGs`q z)s%K$mRm>XKB*iKesMC15QSZV@Id3Q242>oAWjgR)bPrN=Ah%DbEKyienF{VKq?b` z&&=1nsIi%;Uh0~RevIK@YN~1LB*VO5Ew3 z%u=DlLWc|}L}T{fFcU1cr;&|sCBXsmh`Hc?P%Lf4Pgk>tV$j)J1J*fj==dMHVFD^$ zDO~2_lH4-vZFL_%>;vx~OZB;9Z^Bb81Z4rVe1p91tkE|8NZ|E%0gtRMSj_tK8uav^ z2%^~nFxnfxLI}fs`|us-ceL-u2m~$J)_q5ud>IFo8p`yL$>0E$SY6MU6Y!?|cKJ2K zns!hJ+{G`h7mX!~1`c6{P?Yb(;o<0jt{^m91BN9hx3f2ghF-q-e>#|ef)PBPP1KDw6|A$E1p4I!tFlfpMNQzbHJf2HIsMBp$D z6uAXe7g+zi?Ti#LVE(MiElKu&S70~{Yz|1!J*{v6SA-H4`0rB;TCrOFJ>h#eK)+?k z9tlK$PyY_O85f{FRGiOVSPb6O_3V6SeHcl&Y$hK{4$sX4R~{8d0Ti@@XZ#Gv={C6_ zP)o!tjCMC%crskbDvO8$4PZ>Dt#*bNpmXMfdf$vswX$ydJFp{MhXrRlowV%T`w$^w zNhUVK9T0J3{f3sZXidh50J@AF?%*}FmVyn6WCsQkbR`R{UJ2xHc`TNFBw!GS4|Kh3 zdEfvm4kKDHHf&C?*{Grd^|R+^hxN~pm3|$`=iFxAW!&B*5I-Q8DC*RjDpX7Ipd{=x zmkYdzpE&r)+0{2oHEo2I@MS-r>!VZ{0{2}w^8c{^Or^FQhX}}4Z%&F!3w_wuwM%}@ z;4-+zdi5=rHF^{xOhQv8k^@*=i!O52l5FG6i3SCqe}|d%eg@szd@{Qw5sU+Qz~%Yd2bxlchUUak<;|zz z1rRb3Dp`z6^qui9+DoIws9t&;jM#`S0T-iIHQB>nI-sc48R$Q^{kK74V z3P(oMg{vQUNQUo~SLcR{Ed+xAGEMh}K1E;3guH*aP~g+aZDEKv>D3(-(Jd;J3ZY9@ zVUio(sc&=sHmD&#yKXaj%$%RilJv8k^MrhJ%-=B>cG0C8HaVLd3)%l{LDRjN!Netr zawD4PiGC^6unItIHq77gt;f7dO>WHyRCb242P3r=5b%VnOB1B zEMm{$D(x~$6gZ64UbVDU7*LgQV!!^U7Zy{#6fffLHy^@5?21vLDUq%zeqL+1L#FOZLoptUa!={B_3BCSD$E zSCN&EAW~%IQV}JPmr94}oMkX*|J~qmN5t=foDlcyJCGAGT$4wdSV7voTTd2TsAch! 
zFCVXWQ7$i*o;V-{+wCU)+jf%sRFJ28 zBcsbk-ZEd8uVudP$XTes_bnkL zY5?Fw66d$da#@IiK~-_x2_B0{-`d`V`D7vhk)0hW9)UnXEu)flQKxt&xjSVducI{g zxu%8&XIKZ@&uq*|1JHFcf_>-lY`h;t6tAytxf-Ewfshgg0n9Oy-z=O@&a;0%@OG$2 zUZMIi$J9Q263B-ka7K5)aq~N_&3A{JyHzgzc5VtDU=?Mq_m|GBrOgd_QGTV^%fRza zGS6{vO*&W%W7_KNx)w2y!vLVM)7+lBV zW>rTUU&*3R9*oF?sz}}`P_e+C{b+aG3r_4&uh*|4y}!pz5tNhsnceR#=)!mbQXk4* zwLCm5l8&4IBM6p5XgD67gsFH(*~R;lewOUMb)mln8m%Rr0?6~BGx>)t(K1|HXL*q$ z%wuh4SpJLU`P9)ea1^bL$~(+a>vkOZEx7%hww2+J%tJHdE%vYN=%IZo1Mf9|6BFc) zlXnRMeuYP`EdjlPO9n`H2Wt_=-XsufK1N1Y)!GxTlSiIIKOKL&C$`6GX0^VF$2e~_=TzsW~_9WGy!2sge4iROH-m(?G~Pc zwidSJE&QZVgp*81@1J%KgSIT+Jx|=d)D+0sB&ct)I39 zLJp_;P;OtnI%W{^P(7{PiJhFC53p)yScOp>RAOXOskNO@%r2z09G5~8tskEDdV^N< zzOd!0IiJxy(0?9%H|<`S5QG^I!V#Lb&MVFpYxs_k%!`uu1**M~L~})QU#+7os(L&f zitK<7V`Ib`9S7k~tS=;8?p9w^aDY0FcX{m(T(GBiWZJpO%;--ptY4w8kgF@)G$l_< zF8pJ`b5CZ|U8#D8Ic?6p%tyxZj~3oG;BVG+i_<|)ud4BhdVJxGSQ;(mnw##vf}~oc zOx#H>y|n$#^E)mXVV2E6p^POKAXSz}*hrh8q39x<(PKQT42$QX zIYy!J-)<;-JgKAfnOp)lBmyLg&J)ohj6BHm`AL|0W^~Z|v6=3S0WFPlR7kjG6ad^O zb4Y0(2^vSKSL-)ht%2lR?>EO8Lr(*q@Jl8?>fxvyy#RIMb-G`_kSYOJzSGZEQ1__d zush>otXh0Q{k7nJh|Bn=xun$f={LV0S|!#uN5Uk?_^Z}$)o5l*M01oA;gSNFZ|*k- zEkn%%FNtVB<`pLk!6T*7Wuej`1YgeHvHQXUvd==vfv-&)sNUBR9r-J7!g)+4_FKGaAu!#_IGU>fdrMzE5h)6%{>FfF{~%hQ*EE>EL#q>v#DlaZtf57F z={9rZsm$saR#J*|VVAPl*BBGVO)ydx&i_MS{H&?I$O5;aBzE9BIc+Jm9b(g&jBKiV3MGX| zJb=%k6khO?4WNSjb}IYDn&m-L%}RY=KI85Aq;UsHey$wl zVx&8HvYhaI;&D4w#%&cfqa@>l>skK=%gitD)VpjHfUP3rWZOPjb_zp9CL|!hLVaMn z_nGQ)ZfX2~^yGwW*|>0a`shTV^Z@Ujl?`{od(~9>^y)939Y=eLnR{z9sL5_aQ}~*S z!_AT*)NtF2SocDO1Z78Icg#$)NtI%uISbr^)a0)hsO(=A2m$up3}};3+@Y9FAvRD9n(eYE8{e>5E4QKb0!JMqO&vXhh7!$Nopt$hGGnoR@ zHcYN7l$+F8IpWVtGMhY6WPj?T-}IAfvv-)zkN%4Fn(m4cMfKSTK z@vD3*W{V25?}uSgz}qk5acjlSt=U}eH~xw;fH#nZ)^T6$>EO5izie7$gTBRjT^_nU zb(5WZZ29jXKT*K6k$WlTOgxHpxS&h8E9s(lp;b+vG>K7Z1XD3_ag0$*`yJ|s!-UkD z>Go{QsJOktDDY;sEi?G|mV#WZ^j9e`q zSG9hd?Al8xo9IYvP|S(X7@0{1_KON!_}bzW7)2lxh;-E+79UAM^nqXBA!#g_)^At+ zYtjQ2N}g(^n~`0tDV$S6NINx5*BS7Z;@uGk%@GV(tIVah@c&r*zLvIAvOb#6CYjE5 
zO^&bhmX7aIE8@BFa1O_m z%B1nvM;)3U-1P)b+b+|aCW5$49qK7v>O*!Z0Kmsm3mn+R<9nDbB>Kd zvLf5oua?JimO!qfb%%ZxhI7A0+DSTCsm-4zZP_H!malN=zpkn{M# zO4mwqfvO#YWVXLadT>IGxzGIJ5mroS4JW@w^CIRXJn$DOw=Zh#bDs*L+aso$djG+x zq=i=zlVd93`yle8_m*!Kc(A(U1jFs~Ofy@I>ECU6kU5>!ME?_b1qt&yVGuF|bPnqk z4$6C-pv*(DZ8i0qfP&XUgr?_IoaPTxwj)Itf!J=hQ%mM;2JxI^0&~pddj~YkYLk@0 zSTbkpUlgm}yG=hP0brJ~qOCRSf}EWmWsi!pcLSmB0|*FM$oOpZM?Xhh+Gr?6>-u}1 z>oGSIpqqki!3ZB2fFvpNTMBO@FJ$X*ly>71KS8O*ZS)-fw!87ie2W>5l!q9$zrH1T zfK912g^xuH)`BxmVJbj6Zoh#DS3Eb`Q+By~L)w2vbHIkZte2~;pT#dBI_lDU<|E<{R1gbMF&7(sA`Y!M33nx4U%TRcE{xm+ypE`tw`bTjMnX|9W`wCEMv~+hul!|ZJVM7} zG!dFc^kw%_5Bly1o1xU!j7Xt(t&48*icENatHwacq{6lEA*~qehLY!`qQ}%-e~*{h z4$ksOvxM!q-R1e@*0|W_kC`N;)m0@CSfz#y0D%PH#SM1qhM^B(XOU&zLsAN~4LApO zRJ!XIfNSI0UVfcz(LeToTjbQ|cJMW5d05XZR}6L$)QXF&UZDb<(|RM;Uv3`(fC@~S zFSj+%%=k^uofkaGqhFXUC|6$=7xfB zMIq$)f zAQbjAZNXAS-gKD;)*jktyDNB*mQ&mkSct}Cw%#^v{wU$Dk&>LKmA;(cu+rd-&!@dM zmS$M6@NOqE9i=l8YGQ$D>E-0zii5*2Q~S$nK41bs|I5@FFo>sT7dazZb9=3^zEg*` zC$2xABq9NbhhIk(y>^teu76KN7ZVa&iH4JSnJA9DUK3nlaEfGz9w|+I^R1wVc*}4H z$K0irm>qI7pt4>)gI+!u>pt$dDK#uyd8Ah`K(_Y%+;H;sT-s8YSqL-NOw{FvXDW#LQSyR;b)zE6 zZQlOF(}Vu7p!6S%pr2}#%^J1q_mIh%EmO(yK-=!Jq+Jv$?>A6epdU?TX_;uAd3YW{ zS#*f4emvw15wgOtFXaNdYUqe3&w4LQyDT92YMm@vh>fcfA`qR`sYM)R!J9o=>zWay zGEfcsNI!%izS+uTN-fvtr1&>A>y*CfzGqPbTinN%qhO;{ z`;K~|w(zS=tU~&?oe`UV1<9LaqaaKiY^9)ogtr6ukN0&bdI`s+`5;$I> zh#(1Y4Rp57dV284O}aNrQ0H>!=jfSl8MF$;7kEBTI#s$4#uBs$Mp?+TW#M@`Jgl8H zoVBcNxF^yS=%F6;;mM?*IsAFyT)V#YuhSEUAVA3ulg(Lv#(vH%IAH3@&0F$ZL_{I- zUn>ireGcgXk-{#?67e!E7Xq+II>cm=rN#xnvb2`I)*$8L$u*SV-1| z4NTp>F)E)E7jg)4B6#lZz0fI&g64ifbzgHc-uPZ(nl3x#SC490B|!->{rZGI`1`xr zn-M_bom8L%h)!*Cr)cx6eCEU@F-_p}Cx?x}AS0)vlRcT2z?$%B zK?(r+e&WOUow31W?}V{Y^F#!-g10%5+ehHWk%ex1dh{#^6XRr?O1D_6W47mTZMS^BW#bQlf@*`;nwqZeI6UcoQ_id|@#|3lD_Cyhtl?VT0x# zOJ(!E0riMc23l{BEJFzxJ^Rdm@W=J)i$G&?Ocr#L55Io$Aml)BNR4qxrR#lyH7kD9`$_`% zts7t~T&ZxTg%00!A^gX~K)lPAu_|RaQOM}?HS=KqGu*{9f}5h$r3Wl8n4kJ*-*2nk zs@L_j;apxL-q*a~W&_Gg{r~FiMtTT6;_c_(ixGns5Ebrd1})Yu1S0m|QGY(e{%Z&M 
z9KOr!s6k`5Fmisun=p7DfQNt`0-hD3HXjr^mdR>r6CF*R8rE%0=& zyfKj=gSNH(iCNuisOtQ>olm?QW@%s|7_DhK^d_Z#PyTi>24HaKz|0PU9_dv=D@!YJ zGwdAR8T-aO8B0b{$|7N4P}-D{`F3ZP49wbiz!bkMx6H&sg|&z3=HY>9G(wWnDC9Sw;()5R2yCNG zu>gd@!~GQa(7pC#wNc%T80-F!rwX6nc_jmP?&kt$)CNM`x&cz8(axU)usZFuWFB&y9(uDS0#Puo zll^K?8%3TGRvNML;?YEG=gizckL=VGonc@1{h;VTTaAy22luc@?HrNKt5EspeM-Ce z#InNlN6<;PcE6E=Qa~(_6fBO7tnC1{O((;iM+mI92*0Wy^-l|*QAmebtH+e z>?{hTfl13rfnJdr{E}^KFt>r4?DHEnjrixcTx+F(D&HE*A|*PGP4`LPiHVL6)c!14 zi_&IugDcS;kBX4z_wCFPFztNlB%{p$;r)%SN-@t1qPeDom894|LTFmNK#XPdGp)4=Lq?{43i^gZ5}dCj9}oa!VD|?ar(1)q7&YFBr^U-(>Tnzia5{p@hpvPvKI!%F4+qP$^Wc3V5@@DaPfKpwAQ^l+0K~5tM=4CK z4jzgyY>2GDksUEKK0FjN@OLN_L}jO3SO%IV1k>~MlZzl3%dFYTR8 z?tl3MkP9SVO{*C-#I|wZg{c#+bT@w~<1+5`8Be9Ove3PmZEi|0*Pcq10YxJ`H=$D> z5wT=Fc_HPERQ#6)s_uNVu@Qz33{oCGiWCS-s-<9gQJqRxr`vFQT~TiQz^-+hSGtof zT6SOkz%)a2XIkJ0;gkMrh>!3a;j0RoKF61QrNZC%@#WFJw)PP3<_Ph{|0&}baA^Nr z-^iKZz&rv*@VzLQID~QGP$$n!pMt*>s^=v8?c?rmzO4vq*BtT!NBAxh=rgL50V#R5 zmh3c7O5AUaRe7RH#KhHw8H1kCTBtU_#-Xvs3AYX7ND8 zyQ`MBrdS?lrv$CmStE(1A|tgs_nyHq&7a-BrfjcRoZWESGE$$@i@@cM`xpeg!IDRa z4=wStn2Qhg6NbIL^AWoEosn^-=Bp6GOY}^9eKz}SM5I6WSAsVvN$iAeq_$n>^?9yH zNp`=W7*U!QY=JK*cPPpSGw;V!TYO}>2qqt$ek@Y}gpon<7?X~kbhqUgOo7p393BZ2 zyi2e6m2#cZ(#qb#8;SIBO0=muPTWI9y;4B|42!>8{e=J4SRt87kJ&8V$^b4DaQX_E zKeY~Eo4`5;CHJRxRAR2Hx4Vpjg&BH?pntFQy~teNz`okI2&d+d_KwSn8hfle)irsTI*t?%0upvde|$t$tZJOCEwHlGxmf)mXf(pY!5N zc%OL*Ho>6TWWmKdNuvvtuV;sWqCXqzw+5tSSs6~=l^mx8o)-rCVQ5N#=3xPM9Iid* z`rlW%hrUS`epqmVdz+w;aged!wpdIZZ0AeoeE-<^C|Q~|TKSe};NRgRJ(nY0D-aIA z*AB{xnQWNy{MLM}RBco{29S-yN}Y&qkd&sAgdy>J#F^Fee5aat4zAM8^Jf0YX9GvA z)m^GGMdj=2zoQ5T+k6L7eL?5#;E0xhHa?Q`xoH+D4ujHtS=Cu=iqW4%SbsO})u7Fm zu#svDSonz6!|oq8jW@1+6**Sjj^=_A>!VZ5tU93xd@SiB+2Pvs$yoS$Kp?T+0(;9O zE(R_pOq_t_QnCNGxLQb*b5c!^_{+A<828W2N*TcW=0Q$}KK|3E#Eo8K(IX)T{775s zEa5G@y7>{_No^xh#d-(xQ7bWL`KYvn+wO}7x-7uRyW1S}j^!U#UZ^@u0FJ8^VRd(} z$WT-qUX=~pbGa6DI6P!D)_*17T+@Y7L>L=xq!V^w#qLQ!;Vy6=f%0)@Tl zTRRwO8pTKkEm!T&&>55QBiPTzrnreXb=GlsBdoA4 
zQhNz2@EkVO8KXwM^s=0DgG2slf8%%wv6{~5!)_tDc5KTh{JEuN8Qz>~9r4;2`x^YLk) z`&JM`g%R<2@eThJDrk*T@4HJrf8t|@uWqjwy(lQEeB+CN^hIaV zV`*dZ8d{#E*HUsj>Y{b5WKRzm=UcGYvmc?7n!gK*-{)k_`& zT|@#l0=n82%tJ~S3(h-kw*eR*>{WW$zeAbbA7D*z*G6c719)^FbN#+Ql55E1ZyF2B;`sC~n7 z-Bq$;!$6F2b9Y>?wBTaNk$dg4XRTr7^IN91(j8wshHJbB<^OXN2zO!Zl?~3|1PU># zz<-y0dFT!aYk`Qsvg_}#xL^D8eBcT}FdVgfKW9cp^k0IH5*U$(PNqRbE`7-dkkU$(su1N9zEmxt-VYc#x?hMVQ{d*2fP}Q5^gY)J#AQHOm zL5i$!Poz}no-3#I(`&Hs=x5(ON7Yk(|7S(q+zU>xV{&tIYR&xy5c48UmC(pr)Gi&> zHUfa~psQqE$cB@UK{Z@pJynpdc5zLdV~4aNM~4((%`HdqCFm+oOOSx3Y{$%C0bjrn z4d%OhN&a<>IqUeIVw!ntRab}03}D@RP_N11GLzWaV4*R z_3!RkBq;ISHJ7Jg7g+Iygh$$bU0vR>uSO;`*ukFlM$vrtuj|L_al1Y`$#$bgG%#gv z{Co{ow8j%{#AXw%I9ktPsPWX_QcG`oeJ!_TdcrXzpp~6r3XA{z@*~!Y-C-n)cSfzh zT(nySgF_9?+z99ZUX!hVx)BFYEb)#LZ_Lg76w7_9ePuk>+jK=&Q*`ts?(mK^d%R0< z|Kz-rVD+sa)tQjpA-w{|B_F8B;CEkZ0Yz+5H2e`ijS6IomOMqPSSv<5YChy3*~b8# zX0U9t_EFOTYnK~m$-wdd+>COE%Lf}Q5acw znV+MY+M(@DtOzd)pCFY~hCi2i@Z7_NhW8 z(l?jlUfnNIwMeC;G_EYQKd-=Jip4>K1)KOO*9-P2fPa92r^_X#mlDu14?oDK-t2+1;+M0}U zO7-bfKP&sk*mro+Oy#2=n>4oS943ZAEOlU%z5g@$dx*$nn|>4_E+1GzMv=c64#I}6 zuYD(sH%EkU#dj-@P5}X#drYeoCU0E>uQk#r?rsicYJ9cNO|b>h>BS-oW29ueZYd_Z zBa+2xR`*gS0tz~g8V}pK3keL#0rTSDG^r4Gw&}2ueidYS-`fTm{Dym1x(AI-KKnqO z_KZNbIuNp~aK|UbQO9J-da%L2*qE>6q)DFtC)2WWZcX6xVm@`>TJwQeF8a<*Uw-{B zl)Z3JDC|Ob>3_w;PW+mU0f&d6FECBEnyNA&R8NQov`ZKx!Q5Oeow`T~J$XZ7qkn?C z?K*Lj$TaLMf+*M1!KI@LOX20^&+`<;z&z@f48REZ5cQ~tSptK z!4Rg#TUPu+l@X=Gh5H2?I7b6D`IS^oA^T7sV`w$PmU-B7W|WNS;Bs218|wHWKuj^T zS^1?0NYdf!cT!FNv9*sy6#;EGHEZ5KjzMmk1}aX~Sk*fpq$gi{O!T2;evhp$tWY)< zVQmO#wyFkHrG$!Q1#(laYau@QEF}jX-@aqlrh4xK<$F`-)cBOgtMFj8ugbZ_l4pjC zaGe#L-NZ|Wgfsg47MFXs^6Hwr%krtF&4;eoG^z27hlAdiWf!g6VZfhV(xX&QG~sAR z061MwTd1RY?4BD0C!9yY@}!pet;1?z|MphEZT|iGm(yGLAI2-$ACgVj9pNqs`~fd zWa7->903yG0d`zGEBE-?CrIK{$*HaN5kZEfhX$X2avvdnD6##-6<3|%xa3@+W_tqD zt>*?iDFQzq)+<-~JYyfib-5N+oEnbe_UbeY1cXI&kgL0mo_+va>j9futDrquPOu5Y zmiiP&gkay3T{;FQ!p1+KFX#BcV<4I4_U?{dlJ%KX;9B)&n_u{|@BXPtnE5DSOqt_m zf3ICe-0G%QCctArESP`N9S3YA^u|NW7amR%=8%Aov!CJ)QTF@Q8Rg9AV5UW&I(Sa@ 
z9n_U^#`Gw%(8cxB{up@M5!nKPpQ(~I)$l|3KaoA^SdP+Ma=g3rbFn(C_kf3U%21@i zB>e&aH^Mw_o~6aYGku8uw2$JV%xRq&3PNu=DDym({B(4YrvRhxfG^p;Nt8>&?4HoMZgDB!(hqw;K^En4{B(hc}@ZSRY;}wC|&uuZabM zHJ9GMw?~e%2*m+meCu{7lo)rIyZ`bm;5+b}7WgMEP!ppoF_^@#pCKEz>BzcdIfz>o z50eN!{+v~Q1lv;7C?PYn*Li&u2{-Bqq4U6s3xW46ZAT29UJC@I9Xx4LXA zsgi99wp;MRfN(a37GoV$^rj>UqibGzp4VD;h_|1n_}AC=WWUAB(slT3dudK3*D8yT8r@5JY+K%mIo*}k zhT{wBf9Xz~peof$O@!Wiav%`&;$;_&$+p28z}<96N}`nM3&1yPGjUX{u;BZ>ovOp7 zw$*%TPa zD&hNNt25gsk2$xw-^C@Rq?abJ`nLw`s)o67)Or#;?S_;+G6>Ifz{$rE#Z5#pZy7e_ zOL^;Ty{dxcZxL*FQLl7t-S1ECV~iQrj7QH2ybq=b9rkziLNx79@rdFdDqxdZdTLnH;szr!lta z$TJ?Jj7GNZcP{ie<7~;ZK}vnAhJ`5i5mkybN44!f^TQ!tmMk<~rQWX39Vr*rafNH% zXN?hVvcg$jg_E|pjJ&1}h(r2~!_7hhlD~F%nvj`a+=R#tI5*?eHbD)H^?!-lJpBTA z-hWmVzJmYNK&G#mxz7w6c}yTKO!X9JiSy}hXjzpOs)VJP;;Lm)5hDpWcm@g4zklNp zwQgUbdUxa!v$$g4L=)`282|rEpm(9eWTtG=nI68EE?osPk%I_?9(KmfqI5CI9zS*U zW`)3g`7z~kHurjt<hVd$92aA6n=9# zBM(mFUi_@q!(Bs8K`t2~`|1nm(*nse($7DQuT8TFk^7XyhmUC9B?tJFaT;GOz@>*utCyw_lG;)-+=-%T)e7JI`eRj3eEAh$<3g%Kf~{D?4^G6E4RK{J@T zeW8paMB0eWEFU}`Qk>i_4O@iOcny`tFfIImUXU_IuXhOPL_DqE$y4+wRA>5I3e!*{ z+uo2wc4~R^cn?ZvcJdQ@Qsd%B+Lm0G5IoLCt(D5N`@SWsc?IsRJhHU&Hf@_7YefM0 z!H-)K9t5C%4S0d!l%%j~;kbRKw!c9qDmwYP->;ELfEvKbdH$ilW~6LtAS4-p#;|`# z>g_TMjUPQr6Hk6(C2L#j_3L-=wG-2-CE2YDEyhP>Uxk&|XOTmQPmoJ^JgrH00h|!ij4v5ZIdAs&j@Av!0M6JT!=pD;}o$dDScXNaXz! 
zpxefE0*e`7iSzC2npTjCK+>u<-pyAupD9{|pIK&D-`s9I&<~SkI8j*SHva5Dt|z zpC?df_oYkEIJ;U7{xmGh*?PHcdgXgduIIrtbhk8^V%eI!wu1Q^v~GzAd5*z*F28I$ zRkVmt#*q5k@IYZdwfZtLVDxpt|3lYThef%4;lj`$0@973bf?lSjdYhtmq>R=rwB-Q zcXy*S2uL^5-JSOvz;n*;KKDNSGfaH*?X};%@?C3aNj{+84j?N)YsOA2TP`4ISHFrv ze$f12=FdS0;J~iqmVXiq9D41*49P^=_$MO(I?vFJtY!IG+P8kX<-}VTauOtt7JKO4 zKgzl-M)!v zE*lI4V|M}Bo7&-RJ8?X-D4Jp>+6loXQ9F`Cw7`h+2{wAN#65H`dBK*^GcpM7wwxgK zoFbU?TI?g~@hm;bo2{;NpZ8Uq8;tjLUhJFje|`ylf!5ZUJDR-3ieW91c^;E;RTJxc zQ&=AlAKgION^zT+AkFh<8s?SH8{t+2$g?D4C^uxt=deIY*VdG!o9YK4BHH9UY5>rR zl1y%{JGY;|=7zY~7UWV=x*Ey>t%ZT!yp5E)FW`g5ip?=afhtA;`4z?oAKo~SyR^<| zEVQfmG}Tl%&>qOE^Y#9AL0j>->LJFH3(oswSh=S%*o+RB?*KIDh0g9>3OpRuRb^Y z3bxpNW4Fp?Js#0{9cvR#kVNDbq2c)_bxKk%p1&>K;ojFUb-z_su`k)%(;+w(pE`v?EH8FcRC6tDv_up-NnfE-H|y>AUxr=$l6ltKpk2wT zOi+41fP6MS^fLI|K^)o!5%+XmVAm@hR}xY;yMniVbQGOC+r4YE@coNd*KhELs!2O^ z4vH5fL?nLK1~JVjDg44gfMk5fpL0;ETmD^&MMWp&LWONA)-X{5BE)V22G$YimGwU| zw-{^zSf8fuHKz!ctOl-d2(?-(s3xMkNf+U=EHhH*x)|Fv22M3&6N4*g3~y?MbB*6B z``i#Q@Me76?b}W#ls1qRrY^hYdJcW1t}Uy%++p8j0AyaSIp*5JjH*|ZowSi`e;0ZF z)IoC^b1`}-OU<4skzRKz#bqQm43A!?!=FCJl|Jz1=U^ty%c4@%qe--ql6h=5>-+Y5;yzd(fR>D3!|Hq0B|Nj;*muGCtUXn zuI*StZKz)HsIm6*-+Ins&qN?jrs6RWWDM7B*?$W$iv>HLQ`PK$2050e7Q91@2NRy- zgZSe#BRb@A*xS)Sj<^WTENxrYn|;BuvnyldWWb2+*)g#L154VU1)m)*NzpX5!L4y} zFUBP7%8qSV#K=VtriO}v7EobfqxiU=4BozEqidL6Kzaln4vD=0C0*9h??R5`s<}?D9m;*4Ccb5B=*wg3I^qkz5jBSfsk7(uuv>PqyXlG}q4SG} z(*psdgnU&50{D~AD2cnveXjC<UhTQX()0Nrai`oY`q`IOoH0d>;nI2_B54CBak zeVW_i%yJ!2djhnpwq75;5$r%FneIenfKYq)I4AtPmEkC#ll+d5>Tn3q=Cv!*urNAi zJ*Ig4ZQX3QJlQ?rk~>lO5oL|;Bs}c zF>?b18m@lnGRfC7hK-8k0&2$YFtw8}YIHWe{f>gZp_cZJ9kenwXa}{8L^`D9ure?8 zk;?O95arElsw7_#H&V0M6zs({IGl&f0nH%3&H5rq)#HYLxd5N^nc0_m3Puj$3VLYb z4{;z~&D#A5HY};*n{^3#ZCW8u&(6mYY~EYf-_$dzghhh#5#XMJSvQVzFCVCw&TPu= z3r^0C5shLILtp6tQXaHVl*&x>5iQkAP{a?-K-_v4(q2v{GK$UCGA?tvyU^^qAj4;$ z7|q3U7XAQa^)yKS_%El33MwdwrhM6&m9`&dJa4}THVxN4^BiKz9YHtjFjS1i`4G1ZK5&-CKpWrEhk0`UX6BWL)hLsL%>8Z zpesHkc+MRdqjo;S^sOfO;QVhn0~HbtAcOFrMG3~4!b?fwc1^M;!o%;wq~{DIRVPL} 
zWNg<7XGGAVFLSdss)k1l1IZh@&@+lkYydbz)9_L7z3emCpmr4yegEb=&({zqG8ToiTu)1m8KB*=DE=5sm04Xn3ahXDhU8l2p>@n|z@<+l^50WkJ&YmEHG z8eBOcv?2Jf9Z$qn6EP27EQMCTx+Mt!otqpxXU~+|J||&LpQ>q?MdW%#fAI`NPc$Oe zxE;MmZ@K4RJN^DQ1BiP`reytx`z`#s??Yw@$SsBQ(?-uli#bOf%N)_7xp~k|IyeRb zV^J(}nI6uKuI)=yaLeASGrDbdTLl%f-hDi+T;Bo&z^>yUfYk??9=0H{C>!W;Hi5iF zL{{1pzCLMVZHGx(SR_-<}!)UGB5p zAp=v0T;D}N>;GD?Ti{Q?Fr%5)fdrj7Gzo>emoc|z=22MYiiD>dctCkle6qy|PAC)? zu!~Rp#u~-$6Y~bAkj>66C{kxcz7EPFy$G?K+5m0weudp+kAJuxgfLhLL&hG%!(iSX z4wSN$GnGarpL)f$*Od@nYHElT=wK#2j>QYvI(lHdV2-iLuY8D)pn&M;Nil`7FH_uW z(B|qRZT$jd#hhxBz|I^z{pz2WFZY_L35_OdOPEgEyB6rMi(<6MUG@-H*8;Aq-)p2dXKf#8OrdiyyfL|HkWpzpUZ5bm>b^HBPqfdm!I-5Jg+FRTn;GD#Wq ziAGJ6K<^UX;5HFob`K>5IrLV;s_qFg`DcX%KP<^~4R9aA)nuHBD(Oy348-{IM( z@qwNE8PZlnn-%*_N%#x46c~Cph$yI^?<~?6ipIxUf_|#x?}~uy!hvS=Om?TWpHPIL z)zQGZ{XOV{95Qib(8D$2t2+(aO>xVc>~)~T+kobG$--o=tf*0+;&X-_Adx8zH<4?L z3}S3AdWT>f7$@HNK7-eP(}!u`%`edl)))4Wtwe0oY6!x#G8Y>Vi~Bl2t8D01`z9n= zYG%lc*J7oqDQ=!Vka9T5%!wa$&TfY?52OkBw_`N-# zRe!_PLfKrt7k0^*YFvi`H6*mBkcAR^Rk+GKiX{8zCKP!~;G#ZGxuiH7D8}uPrz#{q z`v!fDiIqQy#HEO3yS1{SB-`T#?tooFsHtmg$1B_s8=FH9ERN*$@(dCAX#>ZrEUXyP z-9r*2Qj4&=DgvjR$n4;hz^wO!TA_G_ldh@>%9SifLW z4BA?MD6rHb+P{o81qf0}s+WA}>Sq|x7@OIS(Rl?iqaej~?|0;4LWeSgWU9Wv|X3TlkWxnd45R*F{@smZE0DGgj)?NKg>$2 zNRSn~m~tg|#~$3@J#~2A_m(V8ECJWfJ>xJiyPSeF%}gLJCM=8eu8Vb<2DIeq5UwI1v$_qnMEK6c{W*W2(>ek*C>Fs0@Ui?` z!IeAf+kH^ry#AM4BIrOg+GaRUduR|=%Jeg68pusdXx9#MX9Ut)*`nqdoouOJ>D%@BhCQ*DR{S6@T`$@_g`OvlP&!-wWM*!uK5SRW?` z!39wmkfqiK_{ARYVxp-KUtoR)1@8>OniNkl>=rT2H{6yxf6ogi6z6Mk@AGS_??<{B zdg;SaW$*}V2zazFhSXqq4)p~t{Q^ZEhmdU}mGXP3(LL{~ob7?O<0w~jRY1Z#;${x2E^&=zWPqbSFQJ3@73?DBA;}-? 
zLBzKZ2OtPKsCI)6_U$&6l5J|o_8%f|vJHUjU%p6`hBTSu z^lT4jr^dW9flfM^sBYfRN^7|kNt)sRPNdT>4;dI7AWgUNTqJ>TJU@W888--k zxNnc;w(!a6+*5TWF~&MFp<2dC@Whl34tr>%HCWq`S~ZjuMtOrP;H`EUa#fp)P9_jm zv#){u(|IFZ-NIUUpYVgVQe~DuXtEEQQI!QKAQlT?@E>vRleQXxM>;^J)RF=lL!Yu> zk9N{8LpTx={-U)c(N+S!t5~9-W%+HFi)ift#ZLvq&5I^(e4!R3%vIEn{r$j@-F<7z z(bczO(_^gkPohrC~Y(c=m`V*FA{aOyg@W5)+Cl9d-~#@)q9 z*&Dxm;%+f*dGKoAcCtD7kQWzAbM167a33`LLN8zN(6c!1dUTnmtW(|z4~CK-67jvR zuxF_AjbJLo!@rd!o+b_Wz4v!{HO?|um1lE=G&$f`yqu9@tFQT1iPVeoW3b9M3$~ib zkL5f;7*C&juUn4f!&}J(!T15GF$;+}ZU<|Y7>0648)QCeRzULWlHH!f!I)_+#XoynO=kw$Wf3&W8ndnDA@?T-{#CO*r)-!a~LR@`?2Z{~Zq zBYC#QTav#X(}!6lV6?IsHxjyN(&}P&bthC%&XYho%2E6^Gs(qY@Z5NsHbyZuXC>*x zVvHV-eHD$A`0Pp{I=&)btNhQQo&pGx`xe8Xf@1<{EQ<}BZOdlJfaY6=9^VBm6B(__ zq)&e7v4h)O<(wVFS`^Q_(*$ky+iKI#--D$MKkq9Jmne5DokA~tEOE)Arl(36nW(Ix z-Sr#O*$mIe94zWFTDc{QWyWRLziW~1Q`@dBtoX zo|)CO7|2e#ex7Qi@}^`fyW`{jqQLt)4af?49w^s8Ho2lZH6;iu)$aJ}!*jvz<=Q0wzK2+%P^X&m0$1bEIOp7qW!W9p8^v$eWaShw zvX~~h<-#xat;Zf5X?G^rXqTY=Y(DlK`}F zGnMakb$_JN@jxrHCRg>q*UjY4xt_1bW@$q}(LCC|&F=8`j_4oEw=`)Q8@lixH^hcr zXFm;ahO(RdSf0-@oFn=na)ySP*|*4*qKz< z(5!(Cew9ib_T7cx8mBIeSB|zOj34$$W@~kL%Q!UmW2ajYqPuEZ{8+L+A@g)54y&^N zZ3uM&^+mF;9H;iJ>H+UtK1KddkrP(Yw`y3BD*>Br7V;qiCCn#G=u_g@yWVQ!)DrZBaYZ%`( z!bR0=PP`x%!hW+B!iF)sUN#E4P_JPnE}v@X&|{=Q6666K0Txr8>)Q`2lWOT7}`+*Nl<$|aF-EDeOK&_ znSK|VY$IPBU-W9X4TVw^FNvDyN6?C2;i_-P*wvNmb+OayC`-7LfPl9_ri+v9^4nORbI#m=N$x|X(AYBEnVJ%{ zYhMxkf)0tvJuOh;`_0B-!$_g+;R9UP1Ld|;5CLA6vd-y>-P8#KO-FoOn-%{`(dq}q zPa(6G3U>R}ZkFK&{OlL#r{iV{S#luE8z^=hTyo**V*jg)q zXX9Xf6-%B)T|h&(6l^Uhb(`YLhE92KY1*byGU*@!3tX|u!4WSomDRAssHDI%9 zDtY$2`Mr|6O5T}UkVZCN<4xtR+*u6w`g~K_V5Gp8xmffjQM?#JvwO-qYM$H5*-Q64 z+y43j7w?kW{P2(D=j52TmL2C~0n)rm47><3j27o;5N+!lpSzp~%hqI0lp}E+^bX>h ze%B*5WiDo#rL@;+905uhUupF^dgAAgX1-A^llmFs$UkhD?%(Z>fctObDcxZ|ZufR=Ct^oXikeJ}t*( zF&Z_?JYw0n5(57ekFNjb(J}Dq1fzO>aUWT~h@@5aILyv@S=q?&0rtiekiZd#yM#qK zA>3Z{3AeA0EsXwGgQprm3-TK6=PEoJy6;r+zTm@xgQ=TA)Pg}d1n9UBVF=m3qnFCOazl*_&OQ@EFU<1JI?d^`q&R;*Q|^yv$4F4X&`-v> 
zbIm0v%=Zsm6oc?rir39;{ee1*H{rMdcnV$ZJEU_yO=LZIl-dd-(j&oE0k797WKF^=!%HdDGZ)xgd#5|@r~Y`qh2Fg_`} zw5{-CHW8@-*J-)3`1g%^Hn+&EinLzEAbu=b3KkOimGEhSNm2lX#R-9#vg*!k|6(6< z1Q&G&%mB!@@HDf-J|Zw{_kc@|sXK<&326}AX#qUqDL6Pqm$`Ow zm;G;689|LD8{0vIDUyo)Auc}fGK$??sH~DA+!(bPi>hSkFTcPEnTHNezBp@mpOxpX zCrk32%B%{+$DYbZ)v?r~<1iK&{yj--o&z1RX?NPN+D>Zz?5ZEKROU9)q)2OFcTQXGPuRGq5Lo5j7nHh5|J(-Ez1+{7 ze}iUbjr^#%`C|dJxBFahKDyGWb}-+nuhC(8S(i79k|TMG>c&cMHbXoU`^m5f1Eme}55skbt3KdQR&H zvcnM5u3V~Pyzzkg6a;ER&PSjBMA~Y09Cn&PIfIuJ#)N~#^)l2sLqu`-1T&Fo2Gm^FDIsh;ySF&1TM)*rE}r13VGBkxBly@ zgT;OIT=Q^3&Ex58t*^~F=vtF7J)VmuiWwQsItU%C=DJ{X^UsSnahtl<)e3jsM!;)p zy&S|EEIT9I=Zrjb9r<<3WG-|Q-*l2|$y)7!ls5Yz>gtk2VuJn_0$2*O^&YQ|V! ziw*^E>ccZ+y2+{tI=zMk^u*${pf9-1WuAFN-R-A>egJNWnnz9i_$ zTonqj?6k+wja7hTMIbza2zK13yzbSbhb#bi$e61{t~sseNMWbhVbLUqB@2ltB>~C4CEeTCD=qEQY8mPzz#^9^nk70;{?%bE zX>=mJP*1z@2=yXMNv{mujm0KYu7fLBndf(3_B~Fuzr7PrL{HMkfcZ>*l?pQ=li${b zAEP+GHSJL-9!5}>HT=YYyip_BcH6L}B=Xg*hG2n&4kfOdCE@+14NUq7vr`*&{)W9? zKQ^H`dU4pfYd$nZHW(J@*oKi6#3%V7V33w$m$4lqFTz0S^tT|p+nD3I{QmLwFa+Ct zyrQ-9OEVaSUxFxrX%YVf3|b7@Xm|DCH2)^A8eiW;wA$&Rvl3!=o0rhluu<5jyfeeJ z!zcGrc7{!hVz*6&a%H`0hWl8&50DC8YoBRPPHPFGR^DFjh;4xynaTQm+K>c#%HA2l z_65Kq3SzV1Fdi*9so*>5jB#qJ({~$dt@!=pMK!??o6VY#+dE|6F-ybGIzAh9de#~H zJPF7@QwR<4EL3?%ekCDp6h$v7N)A$gbP?q1@-1SU(&n(>h~;#V%5sZ#Beo>*GbwrV z{9qqJqdlC5O@B@)#~e90`05H;=FQBKkgVo)0D`$1+z(`7fAb z|26ag7UB0=;G6i4!BJ59ES#te7@+8Mt)fkVSnJLf42;ehyZovw(k$%i1HwGQ@!C*Q zwi()shSPf-?{adLKC+JTSzDZCE{YC`+w}8V>57^=@35q+{$7zTIwvr*jKX_<$G_+K zXE(mYkW@UXk*#BtF^NWyOins0#9AKhK&ScoRg>0oIx7$LL7BI{+lRvR7m~A6FW!Bk zYsV*4;@dSCAugU+>s?hF;$U>U82-)MATya)Y1nRGg1$%?T3n!cXBz+N1@5^LX~1ZI zc6tf*Nsg7tNwas!x^AchM@~XDz7)P>Oo2h>WnY<0p-K_L%xwXL|97pDfVaM@;~A&w zCBsglcg8hN`a@#qcW%{2Of!`MUmWJF+O@5tTJMFB?W}fU$^DC(xlbLD*Zq>eZ8TN9 z8@({NetK!d+I^B2F`3Bq%d=Vf<+?9j4VixxH!Zy* zkSuCJ7MY^TZdl7*(?f_Dubz8H%|;&B+i*0cm?hdv*(X&WONo9Oe+XyIvfK>sz2N7C zv@+&Ifyu1JLEFco!?D|@k)Lv-F@}eK8{7PGPHEF4#O^x~jX}*Pehmp*S8ZDTu%=!9 
zS*B3&stIJM6wOAdDva{$Jm(8lr#_LMtnIP>)y`+_26FR>^XT=?KsDm`qfN!{b%v&!p?$4MqdZLt>Y1OtR;t2Gu~!Ol!12F14%$l8c%h3;%`zesV1R}4gN1li7{DYbux3{aiez);+J<(g5j z?3pPY{K!2-4w%U{ZEPO*SY)?2$A-XSOD#^Ks(}>kwW;4Ps_fjr_$DYuc7RnrZYVHf>k)!lusS!Lvx8_bY{ z2!!^&T~@!d4EwVr<6J3%WZS=7fTPz83C>nN)7gq*Q6*$Bdz5dK$rg^!8Htn5F0M<1 zhXY{yog`t`t`Aj*gjFPMtW|pof)9W95jB-lTY-{|%HD+y#lBa|Lls(q0o49^I3~+g z({T;_h*>hP7EGVN9~a-v(|>;J?dC`)_siJN4OC}l-C`qZ8!xGi)+_S%ePP2`02c~bpI6-W-TeIl z!8XgX=F;W}diAv&B(HXftNtq5aK+K>=5|JL#D-l-81o2_nE$9c8q9XAstW@qgw zO(jd~IsG8-hgqjNDEHRC&HhjfM7#Z$Hhl2i6hFqzXSd@I6Hm$|E`{(cf|K>_go{dw z7D@)|YRH(Dd$Q!*!$eO5Ua0I2UQxcq6?#;6IN?C7(wWg^^Q2A(Fk@YM7Ar!&-85-p zHh1H0Uny{K)JPOR2J&MFxd7moG4JlmS+wMk3Gi3{WIrZ_!y&A!Dal9Cs_|l zL)lSo?CMi*6cq*kaR`?%|z&2(+BB5?QTU>A;k{`MO`?eDAl9 z#Aj{;K{+v8IvCh%70OdoVpYZXr;T6uzH;e0Ho!)ncAAr)biC2i@;EQaSTkF%1sts( zZl{BAD@iX$#ycwp;4qz69pjO~**w87IfDZPvM+~!Mejc^047MAISnt-yz_PA8?E$P zB%dljPsosxcH;C%!CPC?WPDv6c#Ux!{5F2O z1E*sKD3hAD0K(w_3J8_Pv@~fCJ$At1QW>T#cXuf}E|nN4?Foi6%p<;5=pGzQ=U+&0 z_^nfPwq5z%am1bvS+{>}+BEqUp203u%#Cx4$zfV3viYy38Joi8jL~uy@+FK)xTfOK z;{uDuQ2BSv%i7uAgQ1xCu0;J@%FGhUc^LYJvo2(;jnAFK?0yS3yfQ!Mgq)W%nRE1K zDv95m9NT%RhoVjY^<2e9rfyO$Z7UAL=<*qQDeI8zxNRqUhFKF{MGWJv^r;Y?;!}w}eKa`nXO}|p1)mFhmq_;+@_;&!ByE$vRajSb z6{3C1XI+noB@_Y6=s}RM0J~8{-6>tRJL93w&_{TWW1$J^{$`q8FEZo5LUL6U+j;L2 zYV+X}ld)MK3Q{fySp3?!Flka!Xk4ZYkds!nI?H0% zQlka4x!A8z%q7b%8c6Q2ggyy1^AU0$=8dQKvbtH!=F}R@|LBvQ(KegUPO4dEY5NjQ z-V!#Bbs3nDF@V!{(q^>V%-}VwXNi*1xVE#EssKU(eG<~uvZv2g^+ai)K7j5sS@HluZ9z**jgOgIk1ym_`# z77Uaz1#FJ+sqYF)7^ABrVvm2dxMeR)spSrZsN1oaRGo&&09}ghWnJgCKQ&Q6oihbV zON1-zEOj;{S6QF8bCgQ^P@V=MU>__HC9-5Nl(ghI~d zyIBav2AdOwoBJp?*Yfn;E$0Q{+TxUdTX+*hYq0C02#DZSYcnE2Y2SKs#Gnu&H@$aP znxk-93;VNW%Nw(jh0PF(eV?V*m^bn4XwB1Ry^arPj^zXjrwN=B@myICAyy9+>e0&C zX|n!6QXBBB3k zX?`y`ugs(di8SmO$SEK>MEe0;!7x?=g8izkXgw`D8r|u=eMkb0E>JMyGrZP)jEe^T zd-@NBThsfkNq50f(sFF0K*{8IgkOts~VE35Ywl$LV zo2uISs$H)ZVGVSGzgsP?rf*=8l+6qjw`E4AMo`smp6-J#n*{e&7SFsR>)}u>R>>7Qjm z1v)ZeN!8GDi6~-}^cfW;q+5PEbAn=78u_4u5-%rmP>?#s%HFukacY_!rr!1I&7Gz2|HmoMzjPm$Uvy 
z{aEyENxNbTU9$cz{_8HqAn}i%DFQe6>v(7$0@QRh^1%MZ*QB3UW4Ev4&%S5PiB7OH zmpLc6<9H*HvF~6)-umXca;8)%hvhU6`};sga_ThBOmByN*z7QiIWNp(^I(jw$k3l; zcI8onVL*y+g?L4}$E0d+^y3B{f*tbco!so7jEg?OY9pm8yHFVfWOpl9(F}|Jt}?Ul zYHh4D-+9z8Z>Pakz2!fvhbslik7o*kl3yrF>gSa-b=o30ZSfWl$3bNT=7SPqWy*g7V}+O>`bDmaKSE%ZaZKlIP^v}jmzO*F{mGEvfV6Z->6f_ zg~)fOC}y3v-lEj%lj0q<-+W0}&XF`B)>n63cEk^9AIBTqZX4*3y^=S#{W0!nY945F zJt}72AIchrp&ww?t*%Wo%C??9v)-AhpN5!h06~cO6XtgsDp{_ql0Wo;uq~TJXUdUE zj(6TSboc(gLRT+_tZj%!#-$~V)LM4(!@IeAFW#j~eLtAcFX0yMJ98i&sMZJ&_fTHe zeGTTZqtgNQ4G(BvKHAc* zsxh-Uu(wQCHn}bG>5VHl=d6iqrQH6H)$I3Ur7;!)Us@)tLLJICx_=uQ9t@y5p^*5s zwsff6Ustj2kCvhUB^;JEZF{bB_rWsn9z||@bg{|3&ye(!P@<<493;&F5m;+VT zoPD15>oR7nAp~D9mog(&<7&9FTMIjoj;l{rcwRb2BtLY79S4GtXntVE-xM;wI%) zn&5po)Pa}kg!tc(EE#J9KOiAtxdocFDnUXnfEQ8ZY@n(li3CJ= zI=R$*fYF~2gqgf)8Hw8SS|VawX^yqJQOdIQ0p(DAoy9eS;8;x>#m3_qD9l1rBY>Ia zZ*6p3eR*)ZE^q=*`CM5*)Ar+1FJIbK0M12M^&b-tvr|8^5p8RN#d{9VF9~hMtrO38 z;xFe7*}^fHXPX{6jvhAoK0xA;vaZ@7&2J`(t6(kPp|_7*QsInkG;({LE0(lF~ER?i8&$xtv4 z1O>xP;(PvykuWN0sHbrJ1RXvHWAErj+=_4Sv}wtAHri8k$d>kPN6R{ndj9za2!bXg zEv+J+jV)4#ALF6+my&;SnXeyDFxiWeWI|y!F7a@@Ptk!C%q?wP;A4hx=$V&rr&~0< zWpU}5a|w+B`;Gwrz$EJgIP=KZ14GKa$R}7-&+Hm zMaaEdKoQ0d5n4fuy@Qw18%6p()D*kYtRZ}{neQeO!c3E-CwedL#C7MM)A|DM$bOdQGRUW>YnvQlA0|MY>e6fR`9Rf8=rz4%NuNQM1EWY+2!{8cR2cEn*q(>tXnWHb-^IHYk{j|JY z$DC^;j&yvl`x~4CL6K*(ggOa8OmOB`YGEb3b5J&LVV8~{upxW6xl;euG)c?2YBpZ7 z*m(85i}ZEXeKNwQLp=tL5lmHlPM>0gd~BeMe%`=a<#zt8?aRrLR^DY|zDg6_AegT< zMC(;|w#dXJCgW8sI@)UD9QxkiA?N&82#>MBHJ~_3+=?-29$YX;#8KH)+0dWav)dvE zau8jkUu$z)d0-Ead_ArqwJP@AbN6%8@Q$Rqsj_#Ap%{${JD{DV=fzVNx&^&+_Byk1 z*AZ8~u(69-LbAd|%(#wmjlUfyhR9=8%8r=PvqY2%;a}i)-9snr3jB^0B*uXO2QdjT ztdqNc-}rS`jg86o6Xt{1jjQljI>&jb!L zAqrfhJ#f6o5S>~zB3n>6lGRMb1dzo{r}q>fvcu!lf%N9Wl|9sVLKcG$fm|~iLqA+Q z^dOQc$Rh7PwuzOd!gq+~a+5BVfIdch0rNK!ll>|3d5fQN8DQ-yP2$*I1$mkB*$#90 z_3BLRcI-^UlB3YAj_=M#u>V(iZ&?QgUY&w$YyY!mBGNvH z{*u1EsYWeqQS$#rhhX;w$YWH1kJ;r`nf+ghw8H>wF_v|Tpj8lZ1a{%B>>P1&0NQ`R zGl2Tw#N3L+Re`fJtQqvh;*au#2Wz)jRGlZTP~{iTAd_`({~!H%ibVjuP)5^b_7m4m 
z0NbPKvIM!AzDiNG&db;4l}P~8g6`hf9N7N7kH0$uh+|q4Fg6zMkZO-e02UVw^9>t? zAkBRzd^P=y$};Zm;Ao_JMB6uEd|Q4V-mC1 zfrsLBZM-Ns`hjrOs)(hVF|XKsHOuvQd*ohI>&X29vcWvhNwh6>nzwqk%QtmovfXSL zz?+ht-U_CW3MJ_4C@xxz?dS*(0@R?Ak~MNS@TM1#h)P-sP-2tz0OhD7r4+crM z90A}OnLrRz*OdHuuHS;JVhh*Oas(C)zn3`4*Z$fPrM7bjDU_KoU_|kTYzqx-SjkiM z?LUH{WMl)&k*%1;;R#f+8qR~kw6u*6#bFesW^SoSN@n%HT#KHT>@(eAN&z{H`!?}?g%c$286e=u4ry(C4Rw6$cw^(4++;HXGWT2Da`o8K_uPnT zTNM2t&*14u0*oSUN;IgyB91los*5R1M5l@VrMQ@tzrp-$b{|FGNw4nIn60Y62JSiC z%d?5{bxQ3WgQ5kc$jGw4s|KX~2V}!Q@Mb!o>zC)2By=ZZYMrafX7I)~m0!xO2O0vG zLo$EM`?W;CbPeEg!#QUW{i5Nkay=t=CA#b9f2K3&TEDCgEu^bG~F%b!7# zN~a3?Qf)J^I8=k1%^cKH%#kjULGCr0Hvrv8`yr_}(Dmk@&Efaz1tBp*U0eB=JtMW6 z_I(EkzNUw!as2nSvOAgNfXqyPJrgQ&4Ko4xBlgR=a|N^gFI^Vk8DKA9;zEVJo7 zP6(UMun}T_K!2+D_AfaI9Ep{E!2WJ1CEbv7i)fekK;K&VFujl`LVu$?e&Onon5<@sF*;}j7i7)tr zw9&z`XU~&!scC3dT!3IeC#0Gv<1g%i>$F3Ev)_wB+(igR(pFj2hv~W!&YBb^KW9SQ zrNLGR>Uyr_vD)4uq89#GbLICk1baxhTbhxe0kl}R`PPuI&BH#WQ(6dso7j*QwiE*t z+H&4ohyaP}u=fiW&q(fJyWQGeZ9+9Xzv%@NB$N_@5C7iq{|fv+&!W?SnE$%=jzvOGjW!4_ri|nk)Qi0A4M!cO`|}B z1?^-yhR0MjGb`WBN-F?;BI%iLRKt6h9(O@$F26F2CdKT@2+?uKlBmxL$#NBq6N|L;l$PyrONjMhE+ z@4(w!;9ohzwNO*<7y5~MCYp-XbbxJwJ4PdX! 
zF?Ldn$H7Hr%cqQ(z1pY=$lAS!q3N6tbnUf~CpMXMqWym6-^+%Ws7-}pwNoq*_pdwPg$mq;70f47pVFF;dBW0B=T2!?k2K~5#PW}I^cNg%ERwC27F1TqK0z( zygzd^L3F;7If=s|0Df&2L>)MfY2|hG4$sfp#(iP0VTj_cc7z%7zY^Jc1>6m%05kI6 zCzmmR3;+d3moklR7>mNjrr#As@j&uP!4N;}e{dj-^|#u_bA;dmiv{_UEU~4iwu$w6 z*vz?FIKbd2d~KQ#5%jvLaaog-8oV=XDDuXDfU%=}OHFq}G zzr``^be57U&LiUb=9|}bUq`;KQ`$m3qXc~?xTi?*>Q+IRx7Dzz=MJ;=7VH4THiC|K z8=ljq)mj$9jiBTZ|K$P*z?ar3NFqH1joOfVU>A(v$xGrK69#zz?6mRySJAU-fYoI) z=%1buFC%z>DPMtklq6af0OpKc6^pJKY5(&X#wr-aZ%s7c3$`wiKTopH>h({35;vZr!F-$E=*EPG-_qK0JdVuV~#l zhBf?|(?L|i8tta8Vp5{y#xpZLtp{*ARvLfeHg6m*En`{5MMDYCILp!|KkSDf%!{+d zRgja%<)F_+VN4Y-Mm$jX2X>J>4JdcCKpnJy)=-E}7G$?ue>KZl^XM@=$bCo@46BUb z4*92>RS+SGv&PO}d+aD!rqQ2qvBam08XzcwO~cG|y7oa23gwYEFe-8IzCxW6^~0156FQ zRyyh|PuWzQ!XHj=p%&qqX8qCIvJX^_8;glrXQgF041~8E(LBEPbJ*LDiQ`X@6%)6} z7z}x+m#OQj@BEEEKm#E25+@(Z@vD2Qf;bfoP#X4M&AhGoH!P$aG6b)*0;%)!KPz4A zZieIHu$Uhz_!Zhk9sSyq^#nsZmzMGTTeQtj26tt}c{PdxfKJ#B z0yrU)#CqG61S3!A@1y|E|Ka7KIbvlbzgUJg(7hH$dCjzo*#&vWgZTlDr;C&70lUmd zuPn|%Fx0$1MZ5;F3CzT~IA49Qt#}>A_uVP>j~j4t$%_VSE^W1kHV~3{Uy8IXe0pLS0- z=K)=|Qo$ZdX+~l1L`Wo$WwyQ@t}NC)p27!5XAEg47PPqVYy)i!a88ZjPof$ln=QEC zIzIV0|G1?EaiBJ+8lsunvc43e83x|WLwSGhX3Ne%{2>Z=mhJfND(RUqiEV&01d;hc z>XW+y{alou1(~hejrie2sL5XS`vU51!{T~Ay{%eGO zuKTk%ccXQ`(q2qQwYD;MbgZrke45h#!`N4bRk?LtlhQ5SUD6`mC|yc-N`rKlfHX)+ zhje%6mXrqRMrzaD2z)ns^qlj&?{~e=Kli@(b;(|Pt~tjXW6ZV4I4T}K;<22o=sp2X zLMisSF0vZQ>nZEsdz#n;c(lH774=_=!B7Z2Gu@H42mj;yMF3r*ZgybO@2N1qCU`22 zMy|pwF37H8Iqs9Ll2Q-S5X&gSvJ36sgJzF8Y(gZ{mMqV!b$?dfSzqH~$yo7D?L(`loDC?ATFDYw>xzP8m1M7v@ja;To6>E(lre{xq4 zktnlZf{{v83kf6u#KM8tHY{+vcTY5S^PCq5Eq`s1ekcqbz#<#fvR32 zf33+d^w+>=dXcj7U)|sbY{7aim!{7IKW&*dX$A6>CWk6L(}O4=bAptgO3wJ>BCQvVwjv3xpKE2HL>LJ#!VZQ#VZu9c zfEXX3VhPQfj`|+i(U#kuS)Q+cNs;&w>WwnMKyEcWe&@q#FyUNWZ{pElFC?C1Rhl)8eKUSo)EOhI6bM?7@otM! 
znzNL_&gZ98_8mfOMY1TtIaY(LLd+_@T(+HR{i&r}>ZVKh6_UOCOL(5aF)62TS8WsW z@`_XM`Hi+7VQVJFS#NXS4<)=mpkG zWT}mWJk{sbAKc3Y9w;uasiCoHwYvGj-5y%9_#C0sJ4jm8$<$N|>{`L<4!f;zo7uvA z2;x_QZti^JHIAUiJncNEr&w&>+zGL|u_Ksix%qr)a2c2>JAeEh!8GU>Bo#8Bep;z& zO2Ug0K-wNhc+e+8=l%6E=@hL1Jisr<_XD^LwV%uq_+mrM9%zpPWgw2i-V zkZ|?mwt41OCiD9Z9oWtDCfeIJeuL;Jmdz{x;$dqZUS;#IKKP$e2>fQh;gY^I zlJa-~ErpBV$yZ8&Hw57+<(gz}$Dn6S`1!)?Pa(`9;MCHL-zyxcDChzbwEg=8>n;0` zr7z>+8~O1#$3%?ZmW|~+-rH)_w`zgzk8W+r;)wy?pFd@$H&kDk!{Ux5v-B%iLA`e+)Jlf565-Mv z!#YeL*ycUcTJjSMO6I6157w?`4dcRlunZLtEl?nbS4J5S6$tPIWsYfAJ&o5Ui#}!D z)MuyeOj2Sa(L(e;%Hvk=T^`vBbb?{us$T-4TUEAE=MqwVAjYtK2oR-&+On#*e;^N4;8*uMc9P=12XBt93ycMAM3Z0_U!{=Kq(ngA21R!lTf zOvQTH;g^j<>0F11gY?d9-n_8w%RrvbsI!wA9k+S2Dg99^A~_HjyjgoZ2F=q5N5K7b zhzm;ZN?t12vweVyeGOm}8=FW0STYh5pQWi*K082QR5H|}risQ^C&Rpz2+V+PXW*U+ zDSged`}0@zyFq}7Z4s|wdHw1o;GuEOnZZF~-(M*7^@adFf%n#8K+&mx95FE&Kwf&t z2=*cl<(jr+tbK6|aN4>M4;tmXMI)3vO8X<_Og~@%xu@vlJGr}gDDgf31lugY4DXx;79zpG}ZFy-U! z{%^t(XiB8@?wf1{4Vqc7S9l>LCI#GT0tg}9Bv*tW;w2_PYeh}75HGGj%CSYD&`aMI zd~M6(7JkV>u=b?Mu=jaV_07q|+Sn(nD7s%3o=WZ7Uk-33XQN*>7Hq=@z*XrJ6%)qx zG%+0N7lV>@FtjV(-IM$Y^Zdu#uB405HHe3Fd&}b08__A=V*FaWp+e?N_T~FYNQPXE zwtnv--cIyAq^sK^0uHi!U37#4kvYVd$P^x77Pv4T#Ze8RHgMsSY-2(~oC|Iv08~Vy z(LM%#CTEO-JcPgCyfd`yFHpm_bN>k*@IIYlbHJ)2>%U67)Ktb}q2pdeQ;l``-H#3N zpoA`Fy|-K?Xm^irSe<)PI#y?Hs>k~=7}tmt`sknl&9&fYcK4_2t*$%qS)kfA2=&)w zo!qB=2hu-V-xq5Cu?4zlep~YIA)ep4;ZHE*-~uH&OfT~V!^mL6;fIw0EcFw;rwv1j zjRT5a@0!{-M$lMdij zNOudEPWGWvwaG6AfFmogsBw+xSn**8buaxDJ!y89_Jp-I( z+a5ImH=P%E19fv$VSn>{PZO1)0Jd^GRyrf)e+rU+(glz?^j`zbpY3biDTL(-7&)s3 z1=x00D^&zs7Fk*rO@IVgIK0#rGZ#ne)MUhQ`6cm3%VH1!T>Gh7pGn+?R#Z zgAJFWN2^tpp(E}ZyJEjGh!W^=R;W8A`=FGG@O8-Tw*9%4%TePi|J>Pdzs!v%*nF$k77@U z{%FQg%EkM?)Ch{laSDp3oeYTLi5040D=++hs86JsK$=U0_S0kM{g<>!gamY^&Gr*W=S; zZ!`v;4vHqTyaNW)9h&M$iCAzl);hVBGlLM{C5ocLo{d^0OC{GNJ@N@nQkD)Hh&AED zS(n2kN|YyOw0_>KhdGQbdNqSnAGe74<&?eWc(M0ihpZ5lFNZ#F_0%o*s48!0>^1X~(&V8I4 
zj;8TVs=I~v^&i-Hy)};{&wKLJua9YDQSRkq<5#HVkhLd}s2l52atEM<*RtxT`l(wx}P6gzIjw5LlbLK~j&yoQ-Qr?fjhvsJrwPWV_o(~*wa{2P8#~umI1@G3nA?sET zOBWB3@CR#TTO8oxzcyewS1V27~9>IN=)Ed?*J zX0iI|l`fVHp5?YR!&MbQr9x@5#rGTO+N8&mOcr$RFWIk)z&T4g33um((F-lB8;An9 zp!O&ZFf_H#*76jK)>Hm1eM}OD5ZMEdctNg7w{b?`%%(C2`MfZ?j}?pau-rU`+;zBo z%_LghKd_Y6@c+4efg`B^LSZRu@Skt~9c;ft5EKWLp{2M7%mO5UqBz7{31Em_gj<<2 z*SxuZY*83SOkCrk@&vA+g>}^qo~wA)e5@73r?v(LD(Od6n5S^p0-Wv^+arILdq=1* zRFy-OnQlCD5O=E?n71uH@9*Uwg9W4`Xy5Ft^e-cW|4xJe>2ptMlWm-o@HfE!UBv&fo__R? zIvl;n_HNU$2LALO`VV5z8GJ}smc5N&N)xl_GqD}2k2Q-a*sdy#j)*+%D>-E$RjKC9 z3Mx%gY;mYsYiImKKa2J(%qt&~&Fn)NpDmv4X^AM?2+3b6E&HU^)Jts}#Nj|c^ zx?g=4WJi`ys79|BbX2i@+RTjcykHYJ=!N&$z|(&Krl$oOK=t7qKNkNR>i=V5|NbXy z9ngaZb-(-} zxX^ckJ5rg>HyPfb_xIZQO+BjdmO5yv{~-Q<@ArSCQ4keC5-m|R1T)pZa>U+Jh>w&i ze)9qvwt_(EboSf;&YkNb+H#<+!iS~W+Y~Ej#1_lA`h|Yv$fE7^sNVzVN9O6_h-oQf zR!_Y&5X_ooI6tRMXM0vuXFDHE4;R+f>59)DQg@PIVuTw4WYxe>bPSg%UzxU(!!zmn z5t$*CybBuMXOZiq`O>{tQpET=A?%C&^eMKQu!)*YN0yixEqpLVn6|M-H|zFJfF&z} zR9^FNlneV8c<)Kl-`ye5X0ky}a9F_^sd1EKfy@kU2dl&Rwk z5I)cqM7}qRZV?E6-P%xFjp*>2qAgY4vKCGQO%>D9xkbC0Q!3_O@ulwuQ`+B5B{Br< zzVX_PakUB+)wH!1D^;z1EvNFD$>?=!JX`$v_fr<_@y|PbPy`Z_epU>A;F`!6 z`pv160-tjyc@u+C^|SiAegq+wKP-kaMruNizHRpzAJ$1NJo_;4+(8@Db$IZt!)R;v z<+L85>u}xBfr-(*Y#VC?z8Zw1==)@B3U95#>bq+gqgTpXaSJifVnu=$b5H%+wNK_) z#=1=xRyUB-^TT$99$v4QxZt~v5JbjT5Ww%Tfy*k3wP^5=VmHznpL}H(hlhtGCxZ#_ zMMsAc&HN4{UNHSGCr0s~?+X9#FJ|Ttf)#Bo+o1{g9_qBex8jW18dilWmREj_sNN26 z#~Je*;H{2IqMCRHMG}uxCAZ1MCJG7=<^^X+2589{svtw=$MJ(r^NuOW%a%>7O_W(+ z7SvBt6(Rz3bA#4ltm;{DGt1T~gRHtV5lf+6X6$;~1se4;&c#$!5D_5BMzLM;? 
zN(UNfAv%xY?TA8VjbE$vdtHRBfmKr@R2RG-Q?zdyqbHQO5L=p-rQ5coOY^!;KohKG$9SkEP^* z4^}M4SEkN?49EXmy1xVU=aUK*EcDMys$SQGf^HZ)oJ8iGm~&+h5)g1NsdFaRdJ@X` zj3cDJG<1JO=GVGNbxB;0#v`ww#{9Y-n$K~3p69S@VywGo4>f)jjh--!ZHi&%V$33I z`b1}T_myZlaL)nM#lm~&FvPxB_%GYfKB%{(h%8oI-Jdwm=2!YItqR8vU*n>as|Ki< z7tXP0PZ_kpFBR}1U_yz45`Fh?SfFNR%^<$#WsdVq(mB^WNjsJ-7VwrUX*R8;uI&vS zc>6=II^V4MCFx2@+3;0=k#)L`;~uY$Hpj4nP%YPob&F>_ zb9w7yOu%CI@1@Mde~dlkqL?@T+S$kl>G8k40RMXM=fzCXzsrsM123*t^&4>NxqpUUB*&}-u@22u~d@vyOCUgkL zFTHirHAm(fJG#ku^|;c%&?dBAXgS(ZZhQ8eAL_4&HKpVID6E?H&Sqxb`SB}{CI{nn z@3*Lakm+q ziMBFb&Y|`WA_urf!KGI}ygRF_ovzXG;qmRmAX0@*uTDeAHU~2gE4z1^C{@BND4nFP z{rt)^msk~+y)8Rr9i63$?jG&y$ju6oA zx?DkH)9+HmWlYLWu{zRj6=5jx*Xm03jvVJ%6QDdmO-AgDlHhQrxQbwn`Wh+cR zO(Vy{9*BD!*WvjOhTe5rKOjIZmKON!mt4i*a4qgAk>#lk+8%^lX0VWx9p#4S+&LQc z3c0mV_;BVxQhr1jihk8%mc7_C50)y_hF zSpQzfAu?cH-YUkY{yR|rwRpc@fK=s#1KhRFH*BxgQ3Bi*VA88?JCS5WnIPxS^`bPs znm@y-atLe{J~LJpdE%{jkvd+BA_WR2A9Z%VY%5f@cmcl#QpbW)k#B8#cfHYDLc%5& znf2-$&jS>NXJ?KHl+81@;e1&`iPy3;hk8BX*GdApqwKZ4eC)4B`C>LHTCL=T1<*>A zcaCAS)5%vBmYgP+Ti(g_7Qw7NK}++J=}%8qwej>PfoaNJH&G~SmPbKgxA}fII3v(0!nG;Jcsy~5Qp|d;#`$9H%?vtLNQFlRFiOAZUlWu6yO-Z{O zMH@r54W!F`qDcJHth{}Z<=g#6SU4PFXhlLlEzwP(1ebTbmKZv{lOw4*sl)W%9bNI! 
zB|Eygfy4@g+8;1544dysA&H(&Bl|VUEDdH8e9yCLz88R4Do^noePi>blvc&b&c4kX z%KExHNQ>xxm!2h+Z{VV+aN@INC&+GaJmTII3 z;m^|%_`y1Imcp14n4`~%UO4kkFbPtPcElt^JU#kKO`rku?fRB4xO5qHuZaHrz-)ca z+@8*bbna_gI98N^KJruPxBPR`r9L~yk)CeGV}+%Jd|)4>0G5>Sw>^9I38xORGtp0t zz;_wG#~}sjJ@O?sw+rthYbguCA|4tb)y<9WH!^%fd*(Ih23;cpc_Q=YfPF5@8`YDr zl*GqkyGblATR|*x5y#QC+Qvj0VSo~~N${xZVd|qHLQIZB;I+8pESXpvw(IaUO(D%l zYk4X4ByX5lF?3h6eQk|zG(vuEJ)G!r3c?2;0+9pybC2^VqtnPk-_MvLf>IR z;WwR*w)aY&44UeWJl@Y7xR$Qou}yaQ))-cAxsT$s`n|s8vX^^7z{qZA{Z-?7U1Q+g z>r4;43OQGp;@*faPC;~Vy%;{m3#)d>_bS@i(utGJWjDhg;O8M{Yz&x%b$Gc;jm-0N z6O8r7y8Etg5ON&4HN+RJ6SqD5z^Mj!S}y9m)2%5lr}&c0d9m2Vm8my{{I8;NhcCdp z%w!1nQu>V72k|n@f1<>S2dFCb8`?5SA^%13W)%f0y={PYFjEc+^yl^Ujk=JTN0 zv@_{0d#?`1s)grt-$W#8RXgN>uRY!b_d?VltA_b?_NSk|YxO$gb4){8i7U{&+3>m}5#6&#ZtUx+8t$u>+084)oWNnp;RGyHSDSZA(FMVn3`1IiC@vP@J zsCKA1gyQ^15KH4Z*wpLrA)BB0s-;;QRY(Z6zi2AJO5WKJ<@t?@g}-dwEuMZZ{UZkE zhQ#53pt;GZ!+b^1@8ZDD2;UVaKu>jAm4zTuH+GPb!TAtYA>7SmLuMD*0r*>nfvXZF ztkvh3JY$4~!c!qm3q{m!<;2)TT4x%|!-Akjcsz}Swmt1$uq|Uwllql!>)?79PTe}6 z@W?Es7pbzqtzE$od6T#f@4*~*5?D0fuijsVeianWTKnYk_Rvf@8^~WS9|nEKHi}D% zZPa#a$2TJRg>FAh)GTsXPg)$_eCt3>8!#F&c{K-eY?A#8w%t-Z`x)$;U+Z1*QAVE>|HkaDQrra5jASmd{t~2`cKS z|049rU_Lyirw=g~pY#TV3oc);@Wj@MV>WLzxu{DNDli@+LXRUP4uiTN9duw7(Lgd^ z&bZJYx{H);b?OQzr6ESql?icLqC&LEq(H>fBRs=7>NI-XNlD%~*aL%l&s=KttYa9J z2HziiIO@9gcmTV>vY=Xi952sQ08%V)>cIq2Vh1^(rL!m4|){?KVIB1hM9i?h{dP zUye}_-_j!l&vEKC3$^@?F1O@-_W)U;C*G*y2e#r8FQsD6UhsVOamQyg8(*|Sb~vLm zpd2cZUvkV8F5x~v#DJc&CjPhyf}HbSl*G9sI2sjy$mXH zA;C3Y$BIIH>jfd4Ze-yBG@puMqrWeuMWb>dI^3D|ck!j-+I6CBqZ@HsrnM0_v1hlO z5fvU4g(WIb3o!|IUp@}LX-#OEvlqn3fYVP2;GAXvPwL<9sHl&A?!w64R3xMd2k_gs zobS>yliuH}A|=0*7T zMYx*bQvhKpq2G0o$-UuX-Enc%5eM}myW`_k#f1pz9ayy&f2h>j_{en8>~g_WNdJb0 zwBNXJh!54Er%ilkS!*jaCZW}~xMhaj{CDfEMF)woWBnOdHSfP^B9TdfPot3DGF4Xj zDcV}E@XVu5F3_emyYiKw(Y7?aT?lJ9_=qRbILFsN$T+V~jb z-jd`G6&g^nkLdKQ42>`3C|d4zVFfjzwLk+^hXxt0=d95rLN8mK_As1qM3#7?Z0}CW zlCC~YP0wRHWiTR1aT-oBV>EF~4?FxkM21XXg zq|F;#lIej?XRi%@ERVw=dhhv@f%ykjTP9Ay{Tp1&BPX4*=pE4laSQP?(I}U7_C+EB 
zJKzb>HP<^ z=|)|fXJ0{w{pk+IorHLP2;WPXnZo9Z@0fF1465CUnrNJCa%g`vSeFW+t>b%MRpTXhW1ul{lK32 zTlb{BU>ZyMtHJ51NRKqoE4tmNr7T?)dGtw*lNwgqIS^%dvyS%Rr*f&{<* z9f^M%G4@w0Vy%N@xB3^+G?~`-0&AuH4S{5W)l1sYJuF^E5qGslw5oi7$LhtpN!6ku zDkzujKu`lyYde>|U%$8!0xT(E22d_MyD!&2KC z>U|8j06bB^Z#AvfVOmS(Q?Zr=a#-S|psmYn?$%9WN4E}=Um0SU;tTT5pTw0-{es6` zvd4oqt>sV?r2zhchVeb}>oQ(P?uI;AMvOacO1ZfW%72@`Am zkX%1-flG37WVn{A&{tkM@hf@5U1FFeBT%&)wr_%nV#mD6;8rDQpvd^n%?&)r7bLGg z95;Ns9EHV84JX`9Onczy$l>i`UGItkNRASkRT->eY${U%>c+peaGWywCrVX+w__e}Kd<{SUAi3cu9pCy@SJqVR=kEe z0iXl*YvQj>9U0tRitSWRWju6&9=Y~pYRvBW2~<5D2J=vA?`_Zx>1zpDV{*O%2~>z^?HU`+icj>_1Hyo@iv<`DW$Z zc6i&^wC@AO%{vB4)Q38ta~rmc=r@hYXKKGXdU{$9A;|rzMUQ-xMeK;vWF*8>f}>E) za!~UAKn1F|w!Ah~oSYOQECVwC9{ntiq#Nj`YEOT5fuGvO)|ZjsxpUWfgvK}HApUF^ z{jNvaD!5(zTjqZOR#@jAadM4LW%Ocsh+_G-*R^^LCE2O2G!}Np{nT*&9hVs6w|Ji7 z#9=EL@cgFBT>=G=NHhH{byq}7E6M28%7ygjz9Laofo}$o(r9<(@|u|mwU<+E6DOp= zcZcYd-@Xs79(^^@srELFVPv35Vp2(d@YO`e z_h3Cn*%-bU*;-BZ(q)`m$bl^SR+h_HEj?rAc*bW?7#MRN`ey0%2nA7V$@q1I5+x_h zxt_#^yR+~1yA2!|it-k(jXF7EVypUhxWc6217aX9>GV_1aZO4Fd8Vb*SeU|NARL(MyeZPo2B?sdpmww$_H?cn9 zM#K-wD$F_lB(mzt2adz4v@Xe~D;GvuQ$mpX7SQe`+d9;1I8jDN*bvU;*+IEoSFVKv z8m}c#?-Vz_KKCB7M)s5>O?@URkWH(s>cro?rA}Q{@Q~;w!yv$UhXp05>Qhy3>`J#q zKfAo;f%*l4)>V-l@vD7p0~Yk{B{@bQ<)C;1c>R|lKs$V`^t%ewR%<^WLWyzVvMx~Dk-m>Ha>ZAdZ6a16p6yP^0`4i@>KY$jI>&LV(@Bhv2 zlEKum0!;aQ2lE!If&?x6t|iO*#Y(8N-n>Ny9+d-na1`Vko)!8grz~FYBi1Of2eAs~ zt%tz{4Awt4fz&p}^)yM;yp4CqQJ-t3dLP8MFcSg^Vlgc=Fmi`13d;>7v(OFOvCmNL^=rI-m6RbVHz8e&wWC zcKjxvh_!{7N^0oZST-I?$|E4 zZ19K@3Hq4~s<;z0sL38N6;@se4?jn_2WAlc#yWw+nNYQIfUc*W0}BG2AW7a8WlE?h zRwQZNJ3cTA3_0mIs35ra4gh{JR;}Jo&2N_$<9>J@Dlq94kC!@|o9H-Y;FfJG+I*1{cy;4azJIgB zicC~MOc9gj59jd@?d&%_`3DER0iu_K zn%a&AMmE45ro>oyU$~tpU<1A~^_k`ua$Ryn7br7@C&W*V(rGv?BMgr6fgBd`Y7N)1 zK1WZ+i(^=7m`))=BoyCQ;S)c&-9jqb4k^Q6^pdidU69a`XAI1RU!c=slIx6wGd#+x zin2$&c0+2k0*8>QYaxQ+52V7kEYeNr6V7+?5M^yi-*I=&IquQC%3bMYTic2Yxu5Uf z?88uDwq9IjYR`su*LR&Iy;f6YLbv%E?TnN`^$~NMb8Dr!S4}meqRHgFJ$5+OP6@zC z%|F2=K6y^4Eg^{@2qyqPp>tAoNPNac`P( 
z)4@m18)I4g`YYlm!(+WPVPvE2aV>lMeXq8&Z?%&O6eea^l?=`}LZay1_WOzR>arES z8oNxBeUmvOgj>l4InYc_t!qh~Bh(cY>iHiXJlQBw4?ttRa(Wx;q$yhkDemu1Soi$5 z-b!&O)bTP`p7tbZYO5!|2~D;z>sxl;Z7YD(791(}Z_v;)Ux?#JW{ZY<7BYIMd~-Sa zZsuH(44@j9m5Z<5i9SyqYg0lmB>HQX_OkVUxMBHTqP=lYU@eIyyo<~c@73S2$@KR? z!yv`~Ftsh-LKcawG1)4O#FuUZqfqPjDvWcJg>%rzI_A(O4OBeUc-D;bP$iGM(sm-P ztb37433u_W-aPsx*e-Z}z_Lf@^~e*L?C;If&Ue$eWC zyuY3PXvJ7XYyJ6$9Hz)T> z5(Q zUJix%4oIgv94HQZ{Grwi-J2*UEz&f$QQ{?aXa!YQOP4XnhYaFvGkjIbOBdeG6KfSu z&4Z?zeJfC4+9zi3efx-9v_@Npn&PKeAa63IJek-+Uc6wX8)dn1ZNLaSLFpF>A(2Tx zHHe-0V)*T6i`jJT+7=0q3s6THYVu#qzK=f}<;NOhRxzK}B=+Xo9~s8JpvYRDBwHAj ze{+YjBmP;Ozw?Y_3Ww-BWRs}?n^((3Jm)OcXL1hf6up^60DG^+9y|F>dvEq6hHLRB ze|cE^_ym)Rd?ix4;eMH5|5Iva;D-P}RyAC><%H~=Zkm-ABPgMidW3_Zkr?=<9XedD z8Yp!Mn597pf@qB@e^TiGh=f^u zYm)GoVtTB*mr%{^qpY@)#Qr$`mIE*1g5YiV^=sbu-RIdu>c z*0~&e;nQbaOn{TTFTAi^fa@yXwv>1K(%m99pb)`<3HGz(AR{Efh%e?6KK#h-62@aI z>{8w((ZiJ5n`ujkLiK|rE}3|PVc~GKXdx`SI~9Q_GTXfh%ruK_pn|YPi+8>lkomEM z@{`N3@(76OezX-X*#`L=ouY+51|r7caKEGVX#Aoq`eF?V@{^=QcQlGoE?KTXv!^L< z*UeLWMqG=V_KO-840g^D@|Y)wwkkiR#t!SW=nUK(d5Cj2emaKkdFIvWrQO(BZ2GNc zZW&Gqp|5}H)=CPD)QdTuhLWf@L{C65p#@TPH#ydYie$G8C_{T=xryq`QiOt_>%E^l zP@9sdUbvX$y*B8+6{R`4Xjm>=XgO+@&@-ex@LR8+6>-;!7ad3!Sc;CeDS++`&pXte zXA;D|wBbHVns~6esJ~ez9OSx{a%`8Z_!}xyJwj!w#N6ONZ;}0fVMhh$BhxYk6d|E$ zjAs*CNGs$}h=V_>GmCXX!VJ;)4t;RKcOC71#=m<4kRGt7rPn_j(X597XAQe%!yk0Cn4j6N!Niuf|_zm2#yvl<6}%b%J31AFWK} zwQ>V+Og$mUst4-4LD@qV3_($^6|P+u++V9q%hG^JM*`*@?fNwskLIE#dpQLSi*7;k zD#{b0;%RR>G^>infg%{k5qu%peD6gQCKfN*Av(Ez%=<)(u2F+{{`>)E#kXKpoL{H$ zsd-8myd)U$G-RtvvzNU z-lAnA^|Q2Z9eY~Zx;Mm8;9dm2+tZ&aUXIs1AQPeA>$BT*o;xv5bnm1N>^Ma(k}Z`K zfe(#!Scdq4yE)Gt4`=cYyCa%N>YA_6$&^PvG^{oIfjzzTg?rB1zk)Ie875{kZcuNL zuzui_>6)Jis++cw6Q(=gi6OYs-EbANrN|=B2HYUcmOY$uEp6P_MmAltRo0JR#2$Fq ze-Yf}8XA98z~iI~533&%KnONQh_%PPC9e~zb3QE*tPrTm%+`mS2Pm?9;gX}K;x%wH zy+GADZ5omd-sZ?H+e2yMuLP&ietToWBiXwy2WCrt%+hMh zWVEkzf`-~$CYopav+V5zOUiZ;ef>z|1Ze%B?OL%-TK>fI=%W+Ll1i@czoES0At3oltyN=#Q>@OUg z?RfJ$+#u2n$a35PJg&}a+k782VfeMc8{ShE;1Z?_^d4KffxDS{BhMi= 
zVx$)rM)$T`%JNlrJ)jjo8|{=@!iS*i&DY~NYJla(Vt}kw(Xk0MQaf{%XsJ>XHUo4K z!+F{)=Rnn{6!k|yUOU)5d`$w+(^-j4bJNgwm_AJZ<_;Br@5CRRmXXIRu1tA5@I~@Z z1;&2>`%j`f7%W!h5;$aGqi*#e#5O%r)7`XINAp>?=iK(t#nlt3{uK`-Z6nz)o0Xl> zuhv?0usLSVmtu?t_pXryx&fl>Yxru9fzw7EQl2(6b}&FkMG__aNCk}F~=6nJqTF;}*H{b4QF((&%O zNau0NbZ{GW(4Ww+@B;+Mb@~5>1ql%L2TWJxUxe#V96QM;-q`4n15QREW{*hChNX{U z;5|-t3@E4i@a}fz$NK(_W^QwtBfV&Tm~sWm|D}u9Hhv-?GtYE4Fbogh*0+-D@(UKx zj`Mpp8sa8G=OLj1Aeyg>n+t)~skC@<=Qcra_HzIX0$N`9ZU)roM$dbJP1`3r`+5{oytcP|o9*46#vJb_RN0eDz+(vq&rMN;R#ntYg=K2}wPrjOuAX$hN|(uu6I0=Q z`j9;90r5-m7j={lS6^BU+1RAfCp}b9&V~gKI?aj~L-Y~OKn(qC(!&qpH=mSJuTsjS z?MEfYC9&swSj@zPIvSOz402RBiwX*%@`Inp-q(M^<5O$E5FeAp(-BD+{|MVHl{jbf zJpmyQhH!it@Yk|QS#)>LoqpO2ttj$_mdh}KeKO4HB~+!;&UN6{^Go)O9X6F-5@Il3 zFy%g=t}5b?0UEd$A@z@5qo=gaUi?R?cXIaD;eXMRb8L@|n1<2i20)llalGUenOu7n zYlj48+nS$C=R#9F9FYmBU_dTJ40EYVoP*fw&_5^S3h^V3MRW!>N&5bcG*RPn<^n=H z8gH5ymTqiQ2MR2;KkrVf+XEO0jx82bBuUWp1?s*zvCJ*4+hMU~6{Y^W9;TKIkxTJ< z(QeQHFJ-36Db9;v@7l6eXe{>c-%c)OS&SgNtwUMONWBitN(*IHiJah&wNpd~o{mQd zPpHqIAHt2=ph98$+EA^kb$nFC=uibE!ywW^=V;5gY`fH^C9d-cPjcPqM(#`=>2O3# zDojc@-v`2Y3KTIv+IeZ^6AAq5Z9BvDpk005TUM~0;X~=Xet9L?kLV#*xAZo!h9!7u zXeIlF9WFCWI!CcBGtf`ccVQjZZmj~^gpWsv;H&R0<-^<9cZAD1r`@w%o-!!Yly8I2 z(?KcMl12{FFaBV7eY!{FPDdw$^5?C;Vf}yZEd62%prw%38e#WzA~%p2)z3wV+(^NyA5`XOy(ZCxRufqP`>ziXPGH|W?cl3N&ftw-C1 zwVS+?P!^7Xoossf1K2|QsSix(Z6zsdG~h$P?~s1f!AabYU2tIpsg)FepO5bt-`8>a zwCHdP@CQXOnl;}qLI|QS$LSVK9}cU9Ywl@is4X&&{lKk=*VXue1nFVstKyD7t_OHi zu`24ct{8-IySs}#uvDEMsIr=Vb!o&2Tt`(?ufVjix?DYpO^7z(hqoTO0tbeR%Tp-= zbV0#tC6rFgFBrW2=Y^enH8-P;hlfHZL+DdfxF`-~b{UM8E>5yK^~NVTue^%16yxpg zqF9eInsjeQ)QY~}6h1nB8@;}cDUj{oaG@V;ix{QwsyYecj01mA6f;kkUpwMbyej&~kKtB>HC*ZDkK z!3=+4MdnCIJ(sq@QRGK7{$7rac`((P#m*OzfGgWP{XE!>GOHX(ilR@$u-m0;TjpU$ zl_qJky|TB<8zlz_g_VUKYh7oyn-urjqqE+n<_BX$rp@{xx$Rxx3k>we3F32R&K_u@NZ(zNB);}jWnz!koeIC1e*v=w?w z8-KEX*MvOK1xstkmhOT}r27*;?buscsfQ<2x_@X&GgQDGz!_9*j1BdFx4?CVk0g62 zi#JxD4Ou`zu0^2VGS*m{eiQA{+aCsWFw8)^ndXO<@>~{_8Uv>LuGWdlr^k&YyUZ!+ 
z>b=5K88*6?`XIT`@hf^ja^12ilQIBi~YXG0fQ;T(srxGXV zL&3W9un=BA&{^2{nLOUxVLpP->6)N+_FSUnII+%5xHnfz_TUm5Xnim8q*S3G1T~8_ z{0x+))I-a)`^MlmOrjAw6Q*{q>s4hM?HqlDhI@~8$U7Rr@_S8W)$$>xJyXB9wTQlB z#aim~5KS166jA+OI4|dOG!EbYU6xP(nV71V*`JIp}riK2XH!{bV5PL0|= z5l>SXId=JsLl+(E?B}e93f()b>487A_Lp)X1Xo$Gu$?L};qYAhs8vHtDr2^Z$3*)f zl4~hQFemKI!AB;uq{%hnLhm=>QTKrd{RT`IlAle$#>}rSO(A6(9n7ONljPDp^d48C08A5;^!S93b*!?hZcMNG|;b%4Fz?A;&h&e}O!u;k3(o|1Ny4eFvhPcf}iD!T&lc z|7VDHyR`3Dd}Gncb~A?QWncgz#vFW?7wK0FJ0 zE&WQ580=-o{JNBIN=d{}pn#0@%jJbLU4TWI#JP%D`gdNmoG4!cY~Gw~zuSuf&uylD z#8g0-WVq6^H)s1wVi?>2{?UIiFTykw6loF{>?uw&Q6TQUKQ7|xdO3tD<&&{g!wnAC zE7Ueu75qZwogL;RROp#D&}Pr)Ct2%UyV}`H&TO;MB7ezCKYAy`D_{6g$ns#yUL?6; zX^|3IZroE|MYRFfmtP6UGIRAJ9UqFhz|`Nh$(ZNoDll~Kk<7|YKLH(8Yp~d;uoFIT zaZMn)I{(dByV?l>Kya{sYG9$OcU4Djlg?9y8~?DS)&m=dC?>4(lgt$R0-x_xiIYt` zhHFPEv%W2oa;-0tCP>9jn>4FSQ)>H&a>U5>XBCiT88aHvx{j&nqDOxZwv$duR)ZR} zc4TYKs5oBdZCv}fDE5w6^Fln>!2t9r?Rb{)cIuJz6X|Knf{Z}to!>TiM6SIzV_7GZ z`-x6u7LA*m7_h{z+kX#nq0ijM(91G^z4c1$m;rsh&BY~MZyt0J5j@=Fn^scf&n4uK zyCZwPYliN5WBOI{&i8=nf;QSm=Kr9ZR4u@jt=lfv^?!0@$M3tcJ4z>-Ko}8pZjRXt zMDuW?5p=#GP2F{gT*JnnKU}>wm4h!_VBbSea|<8(r&r_ghS$e-D`YHP$db zno};4DT7xh@ELAvd+=7Um#czlvc7tfbI`^iRF`J^e7zeNrwE$t6Z^2`MTXsuz+X-B zw~ViU>9=oYFcrvxj&SlI-=8#`gBxKT6S*g=y)Gx%$Z<*X-hEU=ZzXMjtLL2I`P9DG zpaTV@sM289h8mgjl4qASVK#UfI{ClXdXnYATUC( zf+gFERZ2pmWSbwb!b^6nr)CnVy4hz%hfcip8I%an`-@tyST7O+-4Gs=mk;6HMoVw1 zJEwlf)<>{idIUrY}dF;z7AZmJ>T8}(qaRDnyL6j$>9F@0#Mp{C*n!Y z@FEnbV^aX9{$4S8OG^06w-%}Fi>|f5?_8Q6I>H44kC2)jiMwwE)SGkHP3ePGKiUEo)EYijeq#|oyrqRFXNd!R>utcM}yJA$rsEi;I6UF$cmGbZr+fphyy1| zotr*l2?!k*zN=LE9L&VOEu|K(`zua!sPunHJL^?v2onZpeDh%l2{b9*$#`_6QPzH2 zcT@{?W}5$7Mfb3KNwIP5Ed41XJ^rb7DD26oChTC3K+M>_?Ht%|Wy~D(DmL%#&ddCL zzUX2VVdPIP#Oa|IG#Y4XOClf2z19(xgjafk}wX@Ou zb;+J_J$JqgX>zNqoK55g&kR~qmHxXw&c>U0yL2-J8{O0(;UASoA zUeO1ytUG)AF_uaRi3|iHd~{GD`7#_bf*}@)0OIYh~{6zd+yUubD)bddg3$gylJEX@yi7J-NC1}e*V|Ls7S@}q@C-mXXG=h zGcwtVU^5N`MKsR$ReIm{g|?S@nPGp7V%EbmkIxqvdR7kqxc*6!r%Oj$XNTFSQvU|q 
z5sx$YsS~;r>N#FRe4jejVN7C9*Uwei4z5)#A32i$Zg@d~nEM0plk7dhuZo>#|Ao5D1M&O?LSE*PE@=Url~rxN#HYG6OCwYl^ZM&SkIA!#LPaYi4*CPL zT74eep+CT$NB^-8FIU$6o1br=(zfyo`zVzwDhmj=&)W2~52MDw#(knx>jyY!%DtI8 zkH%|#9sHc7iHUyEPHdZ`%^lD-9o4pMMlY`@e?L~^N2^oG-tjq5AYvcCKb`PO3tKA* z*fTykhuBRgkOO1~2=uqx``UCAMb5>pvU~W0Q6>Mgl`u9%#A%j@N2kL_5j_K&c$io7 zm9$Ik3|%&fYhVS=d+nnYdm*@ciUn!6Wj6Q{-Tn>EZ^Hp3Vk&v(g5>`Jwj6(X5O}u; zR9)x<*NOErRK`eW7^y4quS2S1ma9bLK0su*5ASHKw|w^m46459K)fL3Mxu-K6P)T+ zy0EAIqmuYn-It?34*Bd_79AD4g_&Okh-PRe`Pog`Pc%upd$>bWTE9T$UJYw3$ABVz zPbTz#kI)S&2fX|=V74vIvE4cP0TOr4aU%{c`8%DKf?kXs`})hkvN8apI{o_HA;^9- zKwW4%92jWM^}SlrO4ibqYS_yqVLc`#UZ^C0Q)Oy4*+}=i%qxdmp1Z5OUHqsx#Ca?y zHZv%wZk18|bazxhv+SxmIjD2~WMle$5$HV8b<%|?!{Mj3_3;|8TZ8KyzF^thJpE-f z$_}6a<@Cgm7WZ@60q_b_aHQI`A0)5o z=lDzTm^m546@Xy}9oXX=(66SIG3f&1u2p-lb)>%LKIc;VSqJt;|LlCkgAVCwQ^lpP ztBZO4oex}?P2H}TDrLN@j0^1_uAglC+Sj!X>HIEYN2)xj^ zQq9IBZwcP(fE^F;@m6t%zE($yQ+p>2$QBCO-;+-mRZ^cs z15cdrc4E=(k#_02gi#@_eoLo|lW^hYp72KVyK`sa2B}suga<^4iHcgekn%Zl-ye&Q z*EGPI?>nN9umAW^apBDW$vD1!1KgQ!kHXmhbLakBXg&BpS(iHd{Ymm;CFLio&t8gy z4RsYruHIO~3zt44MmfFM9IrR{(dObRP7&!+IJnpZ;pReW#tQAK%=@y$C{&&w0fe+y zZ=@6I#WueF6W=>?P)s|sCR`#;fgp?c3VW1~=xBf2t2sQ?;$R3F;_ACgZvOe*$96Bc z*6NIG{X@^c;!?u#*7=_&E)FuWa2*)&K04GBozD3_>QHBWd+vo@Lc#Ls_rZ>fZKQ)J z@%c#0JZ>Lq>5jtSC0W!;u>o zirNe=(sKTarMW@2ze=i!ZH-7~9&Ta*y8$s9h!uFUCy7hxD-f-3zm`Afs{ac#^frXz zNV)DZNk;)EQY&{``fM9|;TM1@kW!nb4}z}z>iR%@UzEl=G-biR<90IFq7S4R;;WB*GFnVR|`{ck#m9KFh%?|II`sw>a zE+atyl%S?>J@sjP=4OHw-#z=~oNYBr$}_;04>^#0X82TlJ>)|DOO=ZO{Xc34duC1z zCGRvlnY(n^kn~)q)B)R4xRk5tHF49#L^zX8J znJokmdD(X2iT{4-|5u6s{>{ts*ZWQX<0)nJp<8UEVbaB$>P7)II^*KKLO*Avz_k*t zF{Cj+I;YGjo9T=^3+sjybqRrQkt`M?DWN);%9o{4QlIwGR4K=lE6c!C=;4T0wLqwb z8!3_@78Vo9sot*Mzz4V^&L1G4s|Yi@`}o=Q>*gN?p5>V3^Ll5cgUBSnzI7I@4ybtj z-`_mwxFxVr1w`k2<&XUsz4&*yNUZ0|P_w~}{@pczUiFHO05j|0uY&{Mt~8YgW*)c^ z#*)0@UO#`Uu7cv3a2pC(p}Go`b;10}h8h~6_a1f%9cxcqQ$4lc_sm3F+sz@nIn$m!EgW$&b&7C0Vh#!&pl%sxMK8%nBx4v#b1 zwVkdV)R?g$MiGuKr+7)h-}gpa>ixXxC^IXy2`>_@bUb4a8a?)pXs|+w^kKf4{9%cs 
z`Q%RxrtI*y%t?>SN5VeE_SCnhT?iR`I-Tr%r$KCIJr6{Vk;8wzL$h)nL?%e$pg!_` zz4)h1$jeskR}Dh*g%d5k#@_D%z4qc1^K19ep&|#P7n8||rEM3V*!xhw18=_jILg4F zz7@f8IcUNy@XAYKK=^(~6*+KBT~&yD?i{uP!S(&U87P^-1dLnESBlKk0?TYqMI>@6 z#HX#zrw%L=2G=zx*N$Z z0z5IhY3SL{tN(7(-+%9`?7yp$y8r#se|MIA=h&aj%;(?m7C5IIU%&coqYP}ik)8he z$D#7K(FROu){a(QS0XIlrzsvxN>mc@&aN|ge9k-gxdyCq+EoBOq=0fSv-uY3cKme^ z{ZwDVPl5gucgPoQh$piS4Cvc(HETSTbI_xqzvg=`I}36$MbBFT>o!nsX0Ay-jYH*$ z=pkmE_XyLE@8V_W2E_yHPwfcIgr3MdA`2r^gz2;-=C`kSnC{!0 zW4x`=piff2-Nt+-Y|yk<6pmmAEzpZ4C-%yU^&1UhDI0QzQ;SVmhwXkIrO}mHx;&{* zflP#TMX#3afd-w zwD}|B6t*)Yo(x+8<^lBEBdvb(6P^j-PfLasa-oLsP8{#mAcgLi^7abNxI{Q2^8xvF zdT+8N#XHPOJEr)rqzC<6mG=J0`GB#d+_p2|Ilhg_DuniX6q98?EuyhYr(ZSc1YuzK zbW_>z2Xc;jh0$Uu_!aIT-S~RTv5TTEBYn^@;-4JZSM=|U34l2vEM}6r_Jd>$#p_p) zAC9lz4BX@Htrzf+@H!NTT~EYX&iHpduAppMm?BGSzYfApQ)e#!iv39BDU8uM;H zw@vS}r?VU%zM{6&7{Wy4&%efN-FUN??Pt(zIDL}~?F7+> zS0{i>s8K7fq^@acJL3Q@`THfdQ=@d;#@Lxxi*w3wUa9+i(5AUiz9V8Y= zxkxzzhWu~K5HctCHUAcW7w__Ya;9;goY69qp5CtPh&;g0s^qj61CriqiSNy0`044G zX^M|r{<#Z;>MGw>rJerE5;B5)`I;30-${AC-d`h%%LT=!y??e+yiij6-12?0=E+^S z>fuolj~qY{xzgM;{~N;;{eT!%6Irk5p;XQ%H=)ksGIdt=t6qM-OIp1#^Vl1n>^FlR z%hwNC^J1)SRE?X($(N55V?CBdU(nuPK#bLV9Z^V&-Oq2uh%jfCZa5`(C-raf3%2lr z*NQVL|Hg}^Ty88!bZ0F24-aVK)s{jgb#8>pU1NJW0bTYlM(GZ_fOvaJ{vIwZQ|GVj z27}$##bjXZ9)>si2Xx+u_m|0)+N`|TpdH|akDrO z+UMq|d`@NlJ;f*EpgqFvO{I8=JP{$RRFulMSfJbJF?lPj2MVll&c;-e*- zN!Cbz+LkpC4zj){#UDd}a}z(#gV3t?p$kJ#({I^!XdXs^>Qhk1y2SrI58cWH8u>+( z@&9ShhB;`zkZ0v?hZlMZqD#R8&%@1!3u^FFw1W&Ci>JRpx^&NECfgs)8@{3QuY;Hw z#=tmWaI78hP99np*LtsHW>zkWvR0_c>q+eAbUL>x?&Z~Ih0FMh)*7u|?r#IBs(5RJ zKjv4Gb5`udDS(~`C$lU(TJLpb9l_5{z5hEh)vgu|;5Fo+S5062wZh8oGRLn=0;-Qn^19eOS7XUgJ9}L2XhwhWgXl4zQ!Ll7am5JX^3D~1i_OpY zZ1pj;{VL)3@tci=GPK8!mJ3pXUH0d?{t&37qxd3urv%2E9>4~;@<&2j#7x_|JUL)a zh8Lt!KL<>xFB+W2{7GqfL%9cA{phL(sueVd3PCBKU2Qjz^}^zvgSX3ql6_l=J08%E?!Ai^D*Ze*2ERE%0r z-tu|MbNtuR;r4^wptw7GLo{y9fM;n-Vo=S7u#rJn|GLuLw7F7RC3;-1+i!Nj^7nhIis6QTpPh4Y(D{bcJ^RGEJ7-9o(l z1K_4?-7pZ9dt8kDwD`o$usiFeZENBdu)XO&I>vvw+Ra7w4?zY7 
z{-j2hjdXCC;@L+MceXmmnd~Uh0+?mQ4m1}pVZSelPsATP@Z+>XhJ1xX}0ZW`@Zy;W}9~~%a4{yvp6VQL%@`HjWQj)3t zo^1D}9I#u7h=&cVz)CpzG2R5EHC9ncVFOpr$084*4DP;ai_p7%=&AEB5X|MA;GC&-hrsGkm(%GN$0Xp~w{hg$CO9iYZ_ z8icRboV|E{_{zz9vGq1l1jjeN@%KJ7%L0$nL*C*B*`|~m`&lw<)V5h!!6TsAA4oh= znN#+1$XT}wlbRM;eZ3x_^!G`@0Z-j_m(Z;X8F!MTC7sLTxd?mZdFmDq(np%u%_R8r zs#b1)22@3<-5IWqQZwY>T7lcG_}kar{*hLrNk%nKVwUA5I(9>2OA-tv=DY+uVkWi2 z-wi{tVn!r6h1gj*K)jsuR|T*li=~xasbDOJ|*Vf z*&A#msp(cgdF;P8;(kx!+@%z=wuzgc{XAy`+Zjh+n3DVmraq3qS~N*THRviFm_jD4 zW+*G_^+o6fi=ye|F3tj}p}~6j>Z|5v$(Ch3GJl}Ba6ay!Ei3+z4t^7wZvEY4`;n?X zTIAN?yKg~Yvf8yKK2 z_h4kS-`8kc_6u63Q$n)r9(Cfk5vJFetVZ7TIL$B}U;FS#`PnKWSlDVAB}Wg&eme%1 zOlT_GHg~{V%r<#*ud8-^hqP54Pl+>|7-nFz=NyQ;nztw_kW)%T_tRwUcWI^^rB#TA z6S(eep6_Op3l=InWO{leqFqNToh#6-9+APDb4ce1BaIHf(ToQ6bHxPt$-d9MBM4ZK zFTD5Eh*)udiD4Lo8>?Nkg}t#p`_q09$e9k92#oKrOAs8%?9By0Ia@KvsVi zucX;g&lSWaHC9(aFjtNOYnk@N$TJJ<$!m1*pXkL|v6i5tF-(G=RNgc1epv(+%7quF z_K1mDGY)>|m+P;zh=rmuM|w~5ybmsx{z@3wVxtH5IKH{2?&}VG-O2K)Bya-REt@iV zg?r@8AEw*fze@M4{hUM?r+IdeiF(wO+*|Gj0T`n+{*_M*H{pBWAI3S(I#-rFihS1p zm#+cGmS0QikGCT0AblNwCc^uRDlai>{Yh}M41Hu~=mqAip8rJr&P4QM}47A;z+@j7n?rq%Wn_axCp$8%qcx2_C>7z_{p{| z<74cRhfYRb|9I+CMds^+hfeh714}ZU9AACPhLMUXC7GlOe|oOuwh}P}wSz_rR}zf~ z3#Ok-S_KXrIetdq_S36Jj;z0(;$pw^;=sYbFa3`n%0v#{fv@|fA|a9vuY%&IiDrr} zkbvFNC5s<)NeK0=Vuzln2RoHxZhUxrvpkI*n!lk%yH)EjC*rMuE-!XHQ?z+>&963y zs%&cIs|^JglKRGYq#9=G)58$8c8%nivG&T}rW_nmET^$$$L^t#>??h zq-BER44T1cSW04_tWF*s53%g_3M`Zf3atQ_`hSI}qP>di-!&Qee`pn6`yKT4ai6Aw zb=cQ7^7DZUYlzx9%*3EG6|X43+pM7RZ5%|O!No@V>$@iEoyUw!0=O) zyL+!ng99c`T?;a{?0#iMA0;CTUTl<$#OsZu?QyZiE!?2pR7eI_d{h4k4QRGf!4}ae zhSC|!gRc6#X`Xwi1XeiBcSpxar^Y{QuO&3J`hxjcW-bI!d+A&qCafy0C;JTXF?(f}Hkb$_vx6AL zwRYPE8V9yAllG?Za_@QT%X=>NP)OMFjp=2YVC2Ryc4J~tm}ezEVP(}@abs2s2OZR- zMJGg2T4~+!(R=105(p>tbnjliIU!qF4R%YlJwG?KAlFLO{#driFXSj&& zy}XySwK%p^+o=A09e35?TYCx- zLn}LGF}nH`Sxmg%A%9=KB^lSxv@5mEZQ}UEwH-yI9anBr5ABWQP{UC=sDr zcd36`pA!+BvdNfx?2KtnL_flMX-v_1-p5K>$90C&^igb7(ydz>yW4KL zCb*x7lDM0bD{nSlzB9MldENhVgRdmBgc=kYYup3@hp^xI?U$#T0-Mf@L#9c{KYlHv 
z$AjG)_0x)zY&73kNk?faJBifw?z(6NnkUWfeHkvYtV1VTSxIuFOizOf{WZI(iZ>uF zPKc0UJ478?p`G=5NN8(E%=D>al;$(jg|(|KcrmuZ{G#;FMia?`dOf1egWFTMN63jh zYtpJ~EFu$Y5F7-pdEAFcBE7%}-%zg2-+G_U0H_QuVjU)bWisj!l zo9#_6bt-(8bq>@Yt(DFR;!9@r4-{*ZmOS4faR{>a*(VH(iye1u46{dq3g{aWA)GF<VRJETQ}hYZqaaVB`aT$q58!shzFsYN99- ziZH(}dqs2mO+&{TH`9u})sH+UPxwYvB90MpUS|NRZItKVD_f8?=H61f38($eWH~CN zO%Fg1WUV&_UZ+!C=LibBv{I~j%y@B61{VCMUB38EI(3{>-rR8h1kgPHN2mPr%1f|> zKoPW`H6RgcI(Ls4xfxJu5Il1&sjh6czk(WcU)XRgEh^d?f+!h`h59zJ1p*(!r#YyF zRyWjQ=MyWE&#sDg^=j9p)hN1{Cc-_cCz@oVth21A41D8K-O&Zzy_j-sO?~_g37AL7 z%8X}7V7!N5^k+ky1+APJ#otTuSXx}u-gJV`sL{C7vLHIKz7N!Qp$<5KouqQS4{CA4 zbqC22m?f^LkIgEv7M&io=|$9Ctc~D+WR=N96DPbIkK#b|=7g248m!!Tx$%m5DFaW2 zLDVMI^2!*#B5d_#a7Bo1(hXvi*22OR&~ws+`yncZO-*MP;Qor6>!`4r&=%2C+#UR% z#=UIv)^e&*sGP4nUhBAy1%COu);W9l78T$4An%PYbqu3RKo2p+4gK12!v;<u{Q#(OpX2MX(ra+`{zQ>`=5;;gY zlDh(yxUw7>&~`R_1Nbn~jk>?6d!je0X2(VH(P5sNpB|k|<}p`;(gtlqqITQHItpks zvJPrYVfCwDzsAGHBu%z^%5~bN#blqh%6=IaL>TcG(2FREKBSGtTUkUqmo7?P#NleW zgcBhN*KXz${A#`L4v2p1F_PR{ zbh8^ugTULx_;&?J;Gdo~}|rb}GN=?(w^TVZ5_wnG@8;o>6nLE|p z>#r1j;=kOuYi>842!$*^;zUV+`?4i0j_L0<8<#6dXfg&mmy15G5o38nW)%}#7McUpB|98?{a#|X9~ksaqtn$7~Ik6fVX^lnqniUYn~X z1BtagPAT3m+dWo&gRNNYM5a7~SN#7p${XGLZ7^heL+^h(hWpKvS^f4yPk}NaMQ5N- zN@jnxd6AAA-+fjo$5s%D^;Y*X+Rlvhg+^{H#SdPH&pqyj^B3OHVnZdnqbKk^JPsbR zu6VD(6Wa8sy#d$6`HJ6NCb=8GtJeWo#|Y|Q7IQ%IUogeJ*r19IiZ6AF+>13AYEL1Y z)Z)nCYA!``Z&0h(s_;L9%3Rwg8@04p5rI2v*?K`nro9&F*ewaUR+^#y7`InX?-R!L1dl=M@-i$RMo4B@&wLeF(;(*z;JbX?C8cq|}ov=X)~?v8YIP(|(UicwQDb@x<{BC-`mw@R4zL zisFfZyy6PmeX_Fj7DKSkJ_a`ou@|PI=ws89PdLaWMpqt&&W|TZUgJKy;M5o0S38J0 z0Uv?dp8=a!=uF7^m_uJYlj+b?RS*~>Z-=!cAtS1o|>E}+4b#!Z8_R;sy6vFzmu z)1P^~%+%RcQ@w(slCSzRv`y-u9|zyad1f&zpySltotp@!&l!wSui&A(K{K|hjZ)zD z{$q1~>x4#~-3H(jzG&~@IT6Bfu+ziIOB-eu+F5ny$E*hsALQLywA}sDp%wPRv3-_@ zrxHiku57H=hOS4}Zo{GW4?V-OLx17Id?A@7C?#=C205SQiUuPexLycPmVc!DQ<#s> 
z0~6FEo4Vuw713$!m$L78IKdUDRP{v<85i2c5E=Rto9eJWU-kU4tTB^YJx{PsVMf=gb1O4SkLIG+9MqWEs>Fb!@X3kb`$JAkw+VL#>ldpVs;rl$y#7&Aay=xiNUlEp+$VUlPjqTI+^W>tN%e(9TMvGlxDsN~WKSFsc3k4P zY~~>qZ2{eR8mkF$)-10@r#sxGW+LkSM1OwFq4xNs^zPu>W%*y0(k7Rx9AyG0JKO#Efvx4sb+H1 zns)iA%{Gg-*#@miUPh$`Fn`sNo5%iE=ZyfMSbuPvzx*$Bz!@$n;0)7biD$89WdoM8;F z#p-rO>v$QtPPfowPX}fAZ1wwK?S`lB^?2~I$7Zfbn#Y(C^{(}dKI=NrmmzJ9hiHD!>GeS*DR6M70sH{v+d|eN zhhE0G!fG^#7;AYnXo&?nIBpt3&!-mYr7KE(66AwUH3X$UbuaJ06v9%RsU`)Kva1`Sl?k=sD8O9qX7pt!`c@X zQpYwH&NT;NCvv0*J7TGH;X_lJDk2Z9~x5Gte7uXXcW;$78BWoBwgSUoUbTAy|_SF-+x7C$wG>-Z< z2Ux2+G@8$E-bdy&NH_%t1J6+T^T}G)=dAe!nPqaU%YiWw|MCKCYzCI1o5c7NtT8Tb zK?Y-i-QeXu$Sr8-(B4%=Cqzf$V(RuXaBX&mm4nx9T9YX0-VpjoLN7e=OjUnxc(1fWw)7oNf*#rCdgh(e1q(3cJvu259~tP)5lK< zj`Ncj>wV?AipBggMQ-_T z>3mrAto3u-&cbI3X*P>+;=ysPtD^9R;bg{`hs?wl4S@I1B6_+wMyVo$h2DwwSJF3Z zb1!sPqL-oEf`7lPX2ExWcMP(DCOw~j$`QK0oAI~!xzb;JnfT?_r}N(s#`_}R4Uv4i zFX}9(OlyU_ca2;%mtGxAypin8iqDhZDrDOdIac$1s)j?(NJG!u$=_lZDF*KXK>c+O zJ|(ys9SW7z^IW_r`)5K+^<12JUIcrbJ7UCH-&GC^Q}GU(cG>1|on@~y8-@Pnrq>TJ z`p1)o+RHxz--H>6)W`w?*q_!@_}_Qt-+kTQmm}g~5zu)WD@)~mJ=^wq4rUa^PqLxV zB;P;Eg7lP1ZpZOeyi>E})GOM(t+fyl>&w2lh7&KaS}CA(1=Y;6Z$(;-P;C|^p%-gS z6D~qF9Ve%lu)?&DLKT^=Xr>GGS0B0Phu1Cfr0tSMsrR~72}58pu5_WjH4D3;tg@$3 zt*ENg!;p88I~%h4j6@T=Aq0ihbW_*lZQ3~wA@hqZHI6jlX`1C=axjOwlXDGRUY&;}H5qS*ue&-=@ zW7T6VG!+&Ychq;EL94=dcBb$WNoghO z=_YPAjA3**T_ty<8uGWI?k{WiWPWLV1Alv51S-3C&6~Y)OIvvk7)Lh^A?ZaZB~^{E zP}bmv=8RPoWvps}8eD%RXVYyjvHMN2;a0ZaOkrW?JaijR00ReVG43Bpk!$1r`X$_7rXy;$4KpnVpa?2(k<9mH5R zB@xrB{!GBX0!|($@#=@Vp8MbD--Cz9h5+ogg@Q}L4i253z`BSJ$i0^F4w3nVtbeBXC1ZnZLGr=`m5Vg>)L?$A|1dX z@8E7Bk2~}{+*K64*?Dc5Hn)=rNgUTu8KL+!UHT{u?SRQZtIWoh8tSPU**1rQ^0N@q z=vx3C?0HWM(S!ys83*DrmpJ$A-UXfpX?@XgnrLk`-RmF}pS`nwe|ZF`sG4J5MVhbF zHAGUa016>DoqZsRC#j!ObiOhspAY!+_Vn0PC!C)*PUn5<@l#e9d_u{xz(k6n~EQxladeiGN?1&Q_v?uoykwm{B zZ>?oI-0ytvKWtk&i8L}$fQ0k#WlmWxY<$Xmw4uamStD+G#WPLx&`<5=2q-59Uo{b& zrdnH;$`mjF%o_T0_dR)koy8Yq#)TB^tXCuq%IwS`{X@4(kQ^D`gTm5lz;nBP{d*IW 
zE%1fdPyOTtx8RDLRp4P;q8`LEFt)kgZK}O8E}R>IUD+9w&K)Z@Gg7eJqN1GT?iuy` zk9Fd5YbGI9-9T;Y3b&Bx&zprcnx54eh&7+9*fa=1K11#ndbiC)i72|fN*L^Mgz6)z zlR$C4T6+-QW+95JjEg_{7STQw6D(x7Y*`$knfd^PkS^R1k)7GCM22ZfD&&}8MKbw8 znaQ~;Gm#d)-IRsZkV*m_HBn985fg>OFtipv+0dly;_By)(259TKb!+Wn`nr5SB`Ph z+j{tz+K&`5=jN2)QrE=lQL{jjIDZf19QWs7j|!o`9FmDXaAelY%GLjCqlw&aG(2n9 zJbbEER!8(Qh~90&KEN)DN7suhwejS9j9wpqFM=;VYlwW>&GKZ5emp~8Jq+v2tq&nM z)hTb@Rz0N(YbIYxyx-n8l)(r2>4KIZ*HMh&Td#P$9u<0JcwD9W;zR3}l)U!b6FD{W zrrr*#_$S|&j$g@ov^r`{@vzvO_1LlKB8Wg2x8v_GNW%L5>M7-RzbE@aSIB)i#{A?nxF+mv^HdpdW0u`;mFQe=Tc+?rMh z2%$&=LpuPhlt*svad2n)jLSmpwlP|IGQO<^6{CUoV3vViQ2>#2XZT~!k8P;#&0$n+sM+sWz)v!+zUnNzF15S_j1TzWg~vm2p=A-E zD|L%a`dY(+%(%&T!alZHSstYVkD!ES`D6Zw9!P>BygNeXV*z{Xi0ta-Z8lp~D&GuK zDI5($!Lk@)@UVXQvv>C}k@B9!wPK>mzvh;PvdH;$K4e8I;>@sE6yyp0_$3Lr-#~^l zBf`nLW~&BIGPgQ2B~imc^@Q>U#=(MBtymSqD#q)4+(>m*G-78OY^l7B5G}C1>cl;y z@w<_TykMUCX<9Uj_}#WLZgq8slQMvk=uqaYZ#!yao>tipvcgqvHTD|$Dl_c$mIv6E z2ek6yCYtC|dl)F~z2J?l;gmRuEro&pG_y)P+9Dq=(a;M(0VBgG!CS#@0`oDzMHPsI zH4j_u-M>2jM0BbcWIJ^7unsh&G;Iy$(@j;F{l#m_zjOFP4;~v&J^IK4Ov@M`= zg#qTV-B_k&p<|j>Nl~=Q^T+T)Qw4q{NzAc+n*6WkM;{jYas#m9}kbM3QE-^%3 z!YZ6{pY51;oAUND->~aL>Q6q&P)#tYNgAOzJ{+OhCWK{+(_y1&u@&+eVX|pxp5?@Z zDY5wJ4xTN3vBYl8DYE|#F{E3r;vG*M=~o?WxCrkGLXGWHw*8@*HM6P6TBF%w(G^Y> ziYFrU0~Nl{h4JoW8u_NDyQ4Gfc>FQ{Ftw$U05Q8gXL9*}=FHUG*K@&_2^oSF5-ou>+*pWe zq_w-naZyX$jznQ!6x@BZ_+JQy@% zP@g)cLeETG5-NNrF1@FiA227i+Dpw+om`vs`}yRFE*-x8oS5jTA~(Cc;;;*Y*CdC6 zrdY+6x~cum8&i52FSiYMvnfF$kkvw?HVt59K984+`oPOPculR{5P}!pi5{^;^@6~p zwG}6z$_lp)M~~}OkM=cdO5vHZ(5q2`J%(M-dp)mEw% z%4bIzu&`b*nrHu{=*O#A9}yl0S@N%X(Q%Jz&3Bsm{dN4WqRm0s^Cs)bxz?&egZaP> zL|_S`SFiFf7;n!-*3Wc0bz{XE6hc>Hqg4$2V%HTaj#rotj?Sg8ObcnU*{4PV(>!H& z(XwgcJVBWjTB=4W^bv+se;h&*ktI|ZkhG>F1uUb!j=nU;qxF_I6Ie4W$7ngZJ=?xo z%gyMDX0S~EPNz@#@wY0rDYC1gE0w*JiB(c!7}boV>FKSP*0X!HJ^t6ErqC45fs#*6 z5*I7N6kY>v@TuUIEp7dz37m`09`FTB=Fx=UBp7DaO3nbW@k4Z+K>2LG>yID>%o3A! 
zT4j66TmL1BtQE*X_Q~AG0_oE^ejZtw*PH>MUiQ)UI?*D=xGR1|PH-wZ9xG=u{aT;3u@vi{ z%+4$8k+wyQ3>dYwI%PM!lLW6??L7U;`mtboKyaynVlBbRyZ+P9O$|_cs2JNc$o9I% zMO6H?b~6z1uxL)AVY4X5wXDC`tRfq;^$&QDh?Wm+vZT+gWr}45Xjb~L^lWx**K(?V zbNbBoaEZn-=hLm9g>7Dd^k;9!!^ zjDB81WtV7QFWhmPusvc~gnIUA-Emo=aVBP>_+x>SwREU^RuXT@0zm&Zn)Vk5K(=B9 zo=2@&J8}>4D`j$N9#V$*xP=b;DY?;HUGWN-joLLq`NtUuY^zU{CXpxZfhcDNkG_^@ z=rzr6IrwIqu%^)U^2!N|*Qa*2P!a2Plf~8nD_VV1rcSbyF0dy(7EO`Bgw|ycLt@{`^%_85RrLX@^l1n}1eNny1m+)4H5Ii$vU< z;NO151hg-?06Ve&qB|ZuBysX5Lx}>fgbUX4bAk90SdOZ$cCb53csEvAKB-s9UzYsl(HIEDRCuH+ zGw1+&gi>1Vh2a$-Bt~t(Ibs#B5XTsO!X(%4Ufi{}{6fz!Gt=sFJ@fQIVF7-c<@0ZS zaQsH5iy=?Hl~$?@E`e1vn{6wq7W89-ZBEX^Zby6%C1A{KTlNQ9xsArrne+UO{@<{y zQ3=tshSjKFI^9x`&0|6XN1{T-u$j?6%*mezu2+dyQ{Y{=;X*XU4A@zF! zl2#*&xa8cpFRf;T$vcLqH4?QH+c}Pw1EQF+&BcG%sP|<6TIuA2$NbkpJpAk#Gd@g&)Bz6-c`mgLr5jdyJ=o&zmn?ZL0b?TQFv!yuoT0OPEJ zMS0z*dT~K3*u(PXFfNre-I25gwI3LOg`$!Qqp5H@`WELuTHKdzzxeLO(dq2c(>vr7<<7SJRfTi4fqj`0d8_83J46ebszOe!Ncf(DA_R7i zz9pFo+PW2=DyjO7hjFK=TD#m0KYr%1(IHyULahsWaCI4ZzQvU2782%(%Zeywe_}?6Vp|`p4l1-_WS?X`|fb6 z-~ZtTQL-u=$*f3aB|9UtL`3#>jIy)0l%%XgAzK;QTgWJdGLEbhLb4qr9V7d>k6n4| zd;Om0`d-)bJoWkKe2)8l-><#z_Zgq9$*?}g!i(%;v;9~?Fz*5fV*#2`{B&<;I+wzp z1*{evxg!C7mGWI}tHTC=mf*_@j}SGf1y>r$gB9FqdoUG=GsG+iVi^@p+M#ToKEmsk z>c!?u9uZn4!IL0F+=$KdEh)}WD$0~2p1P@&q*nBH8S8-~F&)F69}x+c zN{RP3(dlPt1?l zWaFz7Gdu%?*9Y2_x}2N)JIaUcE1vfE+CMy54x(CeP_!+=Q7IiSzT3pXwaUQ^Re)jg zNmELwP(fP{3?z#8Cz=KU?2}n6K-o&(u6;)1+J}Ms=Ps@fC5d%;1?jr@Eo%mj;Lue{ z2mHWiY``B47bJtv3uFy^Wa!MCd#@TGWatj`2aotx*GY-E#QZ#@t;Go;ZEGxgX7flD zazs~0-9!iRM6~rFKXb>xBcX@0Rhe9tg=}%t_{2k=l4sQuQ2m1!1V=DHRLamKyiBdN zl7E3Fvvd@Cu}~J&vzz#q?F9cwz0>uFia9-=iSK+FE2lCIiH`U^S@H2QCVm}>vPxmEPFVVLR5?jzur&;R80$~P%zK)1!lCY)1b zb;Us;?JU<(Y~oRWn`ecVU(3i>hNf^bUNdePNa2>$UM-jHy%Jpm)aaJ>hCFqENUe~9 z9CF%|Xz;R-Y&rw;-PP^fiI0SQX>-2PYp-}eb=-)fT zL96&E!rwejkzr#K)}wm@r~13Z-VvTHX`jqfgcmmYoA~T|tZ`_~E=c0y4nGk(=K(=f zBAR;d+h8OB$oA?3&*@n~YbMq!=XrU_fJ|iRVh}=Ws(10Z0lajBa#()%%fRT$loz*t zX0E)SR(QuXr{;;o>St>!+;f5d&2PCXRlNBKX2N!pvt{*Qb_jownDNmnT-<&bG<=C~ 
zbv{1%!rs`}X|%O6Q}oIvbz$W0$0^gjqdO1zPrguu&zVc?dndlm=QBjW66}t=vi|>n zp6b0Hci2I59RyOckV4)5Z(r)zBV5{Y6?+~t!*Mr{qNyizf8dDz#0XxwS*zfiX=TU7 z{JMbJIP~rs4Kq4{L^WJ*yoAKoOsv99?VEY^CukEHF&DOh{Q;a+UZgx7^nnN@{$j$0c0*K)<58<)`8k znGNP>Ywww9%|hoaUUM*gF)3fk{P?(l+gs1c!7x|Dxik8BA)qy`k+oi3W?xF!kafnWcU~v0fb8kcFO9Up#;2b`^6_DVy=)cFUZ#d>4h_FcSPU;~TOj zQZyM?F>D%o(?+^M=-)nAEC8G17ghXZO~YwPK0iGppI__DBVjTk+r2UiMleJWB(_1D zudndH!~w$Bs|%%i>0y3e9x}4S^b1J#XFl?D3x|J| z|1~*TksKFNq9zo3;?U{}cMcX-1#1;*m(Brp{OWT*iAhcEj3tf>j}EcK(=y zRp`*#gjVOx5zxTVl4dklGxG66>c8;Ccwr2(3kkQQBMtZdhWDnxMk9cry6oeiTwRgT zk4?WeYTj10uyjw}+@-v~9l2m&x#Q?(P+tsMQhpSD$}oYc0fNpT@&m zCME{O#;#FN^k`}@Ccm`)T=etEfvrey;)6jux(k4@NXoIMP#I6L<29{E_gcm+wlX{Y z^adgCkZNZ_+on(*RMiHo*A)$Kujb_$2(B6ZY~81k*822JpjO1}4N$I(>z5=R4+T}P zmg)`Dh9#~cBvE|LZQo}>D1y}60W)0^_-svvl*{)VR6>Ja$a`ef2Ga1kXB@Ak z<2J1Y>=niomU#Vqqknet!?9TstD>2Oz9yc-{E{fv_w!v-bYdl-RW7MufF$0LpxM@rd> zil={EOw%!2w+)V%c+Ee-Nak1GuS*CfNF^a^;!9f~T|3iL>?=ru66zv8haKm%J^k`1 zd;rwm?a>)QsRV1A0(8om9yK<#7^i>aS$=za_zR|9))K5~Gl9f8UwU($I4BIgJE&59 ziT|{atKU>!5t1FZ|DYDUN+B4%I~aPT;uIE7Tb~68u$miS=BNch#Dq?g8q+2gEqBU9 zUHWGke;6OvO^KR{KgI$aVo_3*)idvPa-Rm|es0+?eR&hxwLh)tV#+gMrjxFzd959< z>;8eMRzm!t+pyd>X`oD;gMOyIS8*9;mfhdo_b zhS$0?7}2TOvocYeT$kT_;j;lfYUBXEV-y<5F(=^{B9@jXlb4;Psvd&HliEhG{Qu)G2Sm+~b1gGN797m*?NHZg1xp z03`*luy1O@@A^!DiS+2!c`WXNqe0jnnE*LQDfF9X|AR?ubn{dPr&SSNa@Y`>-#(PJ z9R_CG$uM4bQ!Rd5vckg@%!Kht$Fwzl{sS#6+_Q8vi32yG(=7}{?w>FQD;1rm26dV? 
z3C3XUZauUe5St`}(E)bQ@*a%a{OUJJL5To>UBaJhK(1IhAsW<{G*uXTwn1p^!IRjI z`-jEerFU32O1Dd1HHB?e{cRpQ7z0Fri0a|vZT_~hrF$+DNT2c$GE+O_IH&;fQzf>Z zO(Xd`DjFCAAqI8MDx9pCGQ@M6PTMO`mNVsXnZrOt@D=7MV=jPX-eWlaraRbHu-s{Z zucvq`a_tZlEi$Gt!TAz-+is{8s~dj=BQ{T-oLe*5zp)TQp8C6hKZMGj>-p~X$ZGpy z?y5}ey)FQC9BVirL%9QU(prsq`+aVg(EyU}i!r(iDWAQVjBJT<7Zf#DSxk(P^O*8H z1;~A-FmjG)liXPbxH|#ZmGf9BoB7*s2d7*CnqcFQ!nv8n{I+5hA~8&w!UlmhQ1QJW z$)?zqw6(ah4Lew00C+?kSwBLOu_X_Pf17|{^SVvn{>W%D0Jdh1541fA0y1`Hzy4p; zt;HQ2$IC6MwhgeWBIb#iIQdy8Dz-DFVL@P{XuVoUHSx&-3ocQY*6xTHt1ckWaoewX zko%RkGFe*Pi`!aG{d^V_G45F#2SpD@9$DhI){Q{``dFaa(HNCKX{-rFM(r0Em7n`*x+7xl#K4hK zWbVz^fZWFhqwuo0V^~qbevKGPG*&m>cM_=BDHSxNZ#fPeHCIEWIr=xs{VL!-A%H+$ zl&?l1_enkuAm*)8Trb;f&)G6opc(k(jERsJ5|r(LM5D$onXaXgP&V%h@Tv}$+n<7g zeILfQ?7A)w3GfQO+Lr)3JSVlc)W?2fOo}NG)m#_Xq|MZ3JC+Ht0I-X1kVBe8qEo;| zDINOA3Z*)$=ceq%xH=23yzLHImDMP~itiB*9zu!6sz9)w0vlcY@NBIZ@Fy8zH^G7c z0oSk{5z~GL6i9re%oK7TWlFG5^R2uCI}4)cMWE2Is~7R1PiwSQSZS zFp!K0TrXt*Nha1^0G{2^&mhyBRZ@tY2J-O&=lNMEerTIw;;m7%Jpay!ITC}~r&zTz z)IPU`fQ>4CEbI9fOGXJaBlWIG4mQ+A+CKp+JjbQ3Z^`?9)zTGhaR0UJ_UDSSOPC7x z$6~j%3w|S2IVc%$IUB~(L5aqy?Kmh2Hj>Tihm1GHF$K{V{$2e$BDQuV z%%UA~A4LhUk4<>0@Nc&AS8MF9g9*1iF z*oQ_I*r@aqK{{Jt+y*9P9B>@!E0HOrGlU5MS?%UKOadvZD<^2nf{pGz=3rSzb*p$` zR%nL-@zMtLOnW6T{;0606_n46UMG!$k+sPV!=O~nAaJK;xoj>N+ zazaPHs0g9PY1rgzd6n)sI}Xv@ybA7R>E!>+y zyQ3Gv^?3w9!==nig-qI$F>!-qi{f80g$dyG_&OQ1U7*g>)&Q|!YBfBU5o*P%#GD0L z;=qIdX9|m#?-_5r7hqi_{uLSjpC~eZT{awFf}iE9p1)l>9x?IbIE>>me%(2%G48yq zg&RWeC2!#Ew>>U;^oHuoo04;PUkJyExhIKA(#UxiKTzFs^KhK|(|wZ9#EtN9PoF$# zcI}?DyJryhwM)X1#!m~zBTjzC!Zner6r7HndrN1+Za2;o=~U-}a1ucKpB#6JnsE?7 zgtkuAIlF8PAnRp87DO8ahYE66vy@XHc}u^2@+f2yR0WzFETs{He+h*(K*VfwG8r;= zVKoAg=e6{%?>mCygNcOo@VFqisrLt!d%-%ho$H)J1RxLKF}X+xb(qjCu;6`alMvK` zRpKaP8XMi)J2FhpV-2dD$lXbgciyE6Z~Du9PCvbD>}C!N2xGD_6*Q)irkg0Ntc&J1$iQsBP=BtVl7@6P;PAdvc=mgZVI3M+I-sY$ow?{Fezn#lM(U?m3o~Jjkpm3)V1Pc~y zV~;~ESS5}%=piAWq!dB{U6u$$LcG|ueCMw5IdE72)0bCJ+Xx8)bzS5x`b*Q82UL0c zGO+m}1>-$%1SI)#dv_*|b{r@z-G^=?)PttGCQhOxWGK3V28}#GN{Z|`6BGrtKLe!7 
zJEO*SmYjNVAYZ;uv!Q6S%iR;~gCBDY(wMB6T%p}y753RrxKR5Dxq=0=qoNW}3s%)} z5LA-Wfy0quSb(?W1u^xV03qz4W=uJ-B{_PSHVq!F=^+q83WalcfP&_~?lv!L2ih^YsKpQ-Z1vdu% zws(fn;A)#D1(P8ZZFcX$2m2fq=J`vfcn=`q0p(cY?N0#}K+FY!AMFqPpcbsszWxaS zM=5IQ*5$FYBO>jBs;uF9tD(L9|Axr_DGWzsi#zLe3BhG2~}Bu z6sBN3R!$`iHvY2FX~t31{`Ga$xJOP|*n)Va#qfrVYd%ca-#ofo)*94M>%u{ohz!ng z`USB~Ggsz?g=HzMe3mqyQ+>A8b6}r?6z;$k#z1>mIg>_I(0(rm16u zd?rS^yLHm%$(o!zs@wXj7c9y-r#hs8;ZpJ@`OZ2&4xts&H@^SnLx#id7_Ztefmga3$Z^#4zGiCXuLI4VXsJockK}$nSN9VQ zifEs|SgijlH4PQE&?Px}pmUX4;NVSW`(!Noa>A>z#X!P$IHoNEk>$r0BW&iCQScWYd$9!bEVnBCKy} z?r6~<#bRYgmHpw;j7(^_+9FI$cKqwtub%I9Z=n?9G@yg^ZD{bTl?wxgwmoOCZ&*5^ zQ4LjC5`*CD*m>@8N2ELUzX-~*XD>SYY|w8u=1FprYRSEBX_t-xs%^vze#DEdle40L zg)4jcK?uX0ow3J=N`{JiA0qJ`3_(RjWiXL_`uTxj4h?+;1)Ij~y59q^N<1I_M0w=Y ze8rrJ6n)^?rEkVzO9cykU7Yw|fWYqFLn{SP4#Tox2>jsdI{y6~(dV=ktJywUHo7X8 zWklGd-uDDW4&J)uxb+&UQ2!p+F*-E%&GgJNSSilw#F>mr_^k9Rzz*Y{go=w>aP=dB zISCm)IM@HE&eGLQ=g3>wa_pO6(e)MX>A*udKz1%yY0XLnYy#Z2W0GQ^$3aSJ^^!mv z9_}FKKj$H|_@#tJBw_kiUJEhqkuL4}N;xIj`f7*17C;l04H77}_VlFTew@yQZ}Dt` zFaw5+t3ta5Vuayt`Qa?axP8u-g-;oFC=r$ZA!xAs;A!q@lfO*7odp)38GTIszzE6W z3*6=ut@N_m-wp7Fo%)&#R$n0b#WegXv>32KVWiXE)>LktlD~^(v;-g;Ri?(y!tBlj zoT;h9?leee`VliUYn2}VmuL@DV2Q+o0_OIA(f0!}5TXFMbE?)s#zk01K|z|s!}2e) zBG^DUNisx_|LUl>P-hx9A-AEvUSTvv+W#V6-f12?lTQSRsc_d>^%L|lcRsB0G`1}=??Qj1X$1ZP02}? 
zXf?W*{-t6zC|}0qI^Sfw_NIzIgq?=>(Yaehy{JH0!3~n`q>ptcG zR~ZD=Ga-e)3@3&HW1wFMGlkURa!fWhy=1*}?BdP|Y@lgQIHZ2!o(HxRxRbwJKsW#r z@H5&E#J6MPilF}h33#CTM(a$3w0aq$JX_Ai^+0Pm zct|$~G8{UH8AJU%-}smL<2E2)xt}aIQtLmU{?~^4*M{3hSnV&-jIF;Hfad?YmK;!2 z*YJ*=ivgg1g*9dQ`3@D7ad!U(7CcmY=?=AE)n;d4ZXA?Dmo@%1hyFE()_ucPMf=wr z`okA(#uM8AnnUZE(4SQL*BsiaN59eMzt32l6(w`uhL< zNuz&Z^S|k}Kcxbf=->3(FGl27CHoJi*Zxn1%FnQ-bbaz`4YDb`DLVf$xAt6&9 zRcH7&-t}+1YukwB-*^|4%G~}p-t}iZy75ifzwxeLz3*1X{IA8kl^Po|v-;t`*W&%x zUW4rY{A;iMYp-qR`2Mxmwo5zLtl7Wz+72%7-z~S_YLi>4TmM>Wo5lV=sPwO;wnb8Y zr_;atR*)v=UrX%|RoaxE|9MO8<3aa{^;v*l5BRU$y>h3&SWNU5b7z85J^e4kQn-_t zA*SPb$xwdVeF6-{_!=VVw@ki)>m)chI7(Wt?I%*fy^^-VG(ipI6cc)OA<8WOO#bh8 z7vY#Gv-S$;cjd0U3kom$%2pVNl9J~5Xg%DPwI3Fyxj73&mF^=1vTS*T+huMNZf9$& z`Fez@;%8(~L&356ro&srd=&I*+8lUR#^ATD+1xk*Q1<#jyET9dSJvi@Skl%&nsdk3 z&m;s<9s0^Qp??6FmaiUrkQ#O!J>$hQon82}Bm9X?5+b;QfpT*Nho8=Dy_%Z`NSQC@ ztn@qn8$k4dCef23wZJX2h>wEHta%r^3)8gRq?GL9f=XyTGOCi{9pFV{2JT5ouID|z zBNSl^E;_|*f|LdugWRw}Fzi*&mFTt=hpl#y2J@chv&h)3FXcFJBr$22gVrOv(l@WD zh>zcHO#QTl$#G&DRw4Js1Q|hFJG&WqT8}eIxo*QvFD1HeRZ`9zw79V4SFK>h;MhUx z-McnW`t5@d8D=gqF=^{HF!yv)a4v%|Zn~-t`gKr)+W@$S2L?)#k_rVpzN3OsacL;? 
zD8#@xB1UWLRn&USsB%Pb<8Mc7ro-;y;4t5iV_OIP${t|HHx9`(+bn+UIq)8)B@kBI zY5(oY5fFlw?E^>7wbOcNsml-d2Q_4lA1ZVw4cd}wM`P{+6Wn?N*;A}I1E%LXJHo=Z z0O5)~Ju~BGOJ2n{)&lD(eO7BA_}N^-4BRd)o7iLAxqGsj#Zf|X$ccPQ?~wHZIH-G6 zsuBvf)tBvNoM75`sFyCIsI+vs7uF+B>iz!S2fZK0jB$v*hH!x&WPrQ_!nkhcVhQ6i zj4n<~@`I0dxwGM3SD4Zg!9{4Xqs(6frBUC#Pri9WG6>88Dz%B%LiR=Vz#aGC_ealf z%<4QUDC_L(v;r4O+C^|>?3I(p!4;t8@8F+&sDE_vK@4&2Xg$WYK7G6w)X>9zLmu-= zHe3Y-7g`UCzM}`U@-Lj(M0X$NouT#bZ-#ABa*GOsbyZ+wk7ayuaQ$8H^_fCny1JGt zU_DW(sJUA<8q1xZK7C5JaUS^kaeRDyxf#~urrG#Yx~Ql~&}!^U0fNx1Oqil%y3IJs zVtH|frE6%2LKxSs&LCS&7}wj>Pfej2ai7RlTY*qX!>0LVyLstiiRtv98Bi#7m#Jo@ z2wt1vH2Om`U#t*4W3SP5f@YR2TVD(C12mSa4as;R8^;I0D;3V%2o+ynkVEupQsifK zeLWu4LKdtC&oH~SNG<5}++ZlXVcP5Yu_6SQy;4kE`8}eyf~DUcp6Q&N@OOKCtt{ro zYbJcV#VMrL#E^{JoC$8X1a-?>l$>OGr<|JBYg;$BWhSo5sxr$-w-IO-~TJ!H`PHFyh2FxV)gm;x6wx5 z6H-tZ-%opC{8jKfSQJlKt!uVefv*46gvL@?Sa9!b0}Qj^(UY>J zrKO>cj*etNiE$^T)D*?%p*Z`2cDNJ0%~pzj%@{X%PFu?vCx`4kY`JIOS=Zwp-`#=J z;i&joDknL>DJ%Z~4iG3Da(8n<9|*T%j5E&1&Iihv4d|X;C|L26GP`x8xP2Ib`Gq^> z1J@OD5Atz2;xx>Umo6I%%zltr613a`>p}42Lgvcz4NOp!XC(sW!Tqdc80I0?Bksxy z&BjzWrr+!iAe5mGBpV+Yakk}o#}8!lz1}&8k9)`Src!V^O;IeKL`yYs-Agnc-$M)2 zSSn!S57|n9Ot|Qtx{I>1-(!5okxWifi%j^Hn|+{ZUQbfek|l0;FsIs-ck|c*+Aq^( zMPy5I8pzVea&zi9c0O6X&!ROGQFCDf=-F|Ty$MN4$a_Stdd(d+8l^XYM>eG>F2LOO z)B*Y9Y>$y4LF$-R5zRqrlKhex{Ra;t?j^SZOO3$1Rl0QAT}Cg#dfIJmZCz)-6j;5i z_LfX3W=)bG<;eiPJS z#i=aAxJT;73vD%I(}AAX z4WNTf;E72|n#96CgOkVn*p?8F$4LU zy_2)Tnd1E;qoe$G=R(ikdeNU8@+WlQ8pU((1(8ZOcU6dU!1)-)D1zHK1D%_Hu zb9T$Zl0!IopThV+h>&pvqkKSL%`J~_#*BHzw0AkcJ= z11FSH2-_Qp_+o4`MWoG^tAFl3fcxkEWAV@f;xUO0qKn`_h1Q?R;|FM8-^x^e-3?SS zn<72!Ya)i`x;}#Z!28{3x>R|vG(UgF2G~4yCiSN}g_ZY+BJ00#nEJcg1M~A8L@4uZ zB^mF~08?sdMt)O@Y$b)M5u&#^%vLi+@{%lH#LdUROESvjqLY$XJc!av5+}NY8uB?W zptjy4A_tcP^>YrU{wmj7yhk6%(Vk>q3B)WY`+!A2K(N%LoB8w0ha{8UW(iskJ0?#+Ek9xqv>*80U6wI-LTb1#*_i&keT6D*SkgA)7QQfwLn=7iTw?{#t znBU_&Qbw`wrxMb}MGd!mn&|D1j$8q*P^l$WkMFbKlJfWu5+L&w@P)fTPd+e@DFv35 zV;{wuG7Civbcv0{>~V5(l6fXYUoA9R=#NYY(###MKOFV*UAn{Mb7~h64P>>Q%)A;? 
zXYpXx40CDA2ynJt;I#nHS-T5qJ%6GA%CvDghlQ0V`Lz@jx?sXTer6hYgZ{sbqf4$xQ<(qjk-Mye{+}kpKn!0=Nav#qnzL?MwND>9JQmek|v?QMm4% zSU^sn#xw*qwHt z>1sA6-6VQvs5xNwH0;=eMr{aCBi-rinhb}n?Gi{nXGxSZqA@z?yCR<#^5rS=ow&?2~9 zl*0DY$5XPM6Y2&K1L&Cx4M@T%8|f9jG68xuRLw3hbXjI@4mc5Z5aibUK)K;|e_*S) zse$xYi+sS=34Gyz2rk)EgtZ2eOxLH{t+OCI8$DKDUQWm{p}Its4cz@gIQR7Nil1)4 zY=+Y3hJ$pXEFK$h(+u;ji@d~nz~Zp~9kq(K!^D^0Eb}X1ZhF8M>WxPfJSsbiDI)1V z=;L!{T6PGvduC$fqGM<{&@M3XKxRR)k*wD~Ehz~lf?MiZ&^cVVR04D@iu2R*dM*>C zfSk(K_36rTMiz700QG1A3%WVe$>&ibmkgl)E{;fTHX*q9dYY)HHN==t@{m zrHMZOAcah=6FTHfe?l+5$aT5IvCyH&*%qGgaCpaDhm&Bld`D42>Eapo z>`|KB#1Ae}dOigoic1gV5O!8r+fr zUaB6NUAu9GcI_d$y$iEuni%M(%NVxeo-N=0`LXQmArS17c1W3EuRLiWN^{aehwm|^ z26J$!;BoKr7etqc>Q%cM?|j!#?BVle!!ayQjHob9)uN?);GUv7Q$});XhG+6i^i!V zkx#4pn5!+uO@w~j?(Xi+mCuZ{O&OYBk}%SsL`00`ie1ZzR%bSs2~%R6LU?G?+;O~l zo11Vqu2ksl@%c(NG=AFQUdsXoJI7sJHt8^&Ex@|B(u2Z|NzwDqgaEfKfB5~QDJ{qe z)xPXIyJrO(UL~6Jprcn*UE}1g9Y20tXTXg3%($LAcs);~X?xyiw?k)S&1;X)nG>-B zbZsoghIyns*3Pmrj8m?3j%s%*N#Qkclf!RGL!}if1f1$ot&O#4i|J=dip}Q0%rbjFCSAY-oqLy_mYy54p0mw5uHr&NYh}??QSJ% zn&%{@q`)~vr*vQE=fk00%%tA)UR6kqLezt*I2~9d2PNv+Tb>`PD8dI`Q@(hG(%q9E z;xz6$s-kjpksmq(72y3{*cY|$=5S*mmMXd}H~d7@Iu7;jXE^Hgy<4jwt#a#z;BHU= z9gox&+xqQ_)c(TGnmu5h!5is!q$@76e&Tf(xUS+oCFt(&+~`h z--Q(~CuHpM&`W)cyRxGqrA%(`w`AsPnk95LL}pU z%)HA6nd_HP4I_7+u`8M+G~l#qcvhiXYPDGt4ToNBQ7gj6K=l zOqp3_Ay%otZThRor$+%ltRdNcg+y$7*J4!PHOk(@4NUtAXJ6Y zAi{W$D$4F*)qD0=<`h*$9k!44`j<%px+5d0vduaQ-RizRQ{}Y*IVbDG*eHf09lAaa z2Z=9VVYqrq@zjxN-ToO?bTwuFLI7Jo?)_VOA`fi8)YsPws#nPLg$T!dH$kAyqCWB* zGwQl1DtI`cixpj%mX>C*I5!nWQGk?usrF#-dkD{`n2+5C+Lt5_^ae9)v~ltB7BaB2 z81$cd7%Lq)z@GuH^z)_QGIP#2ZpTqgnwsnC?dAMgU+vnF6%#prE{_C$;GS^l=)o(Z zgr%M%c!|@UGYkeLT)21URoQTiHY_rrpf>YGGw;$lZ_3L+9tH%Bv~z)`qWNzozloj>rNqKQ}jXWC4m%vS&Sd85NY< zGENi__#iRN%M3L<>{vZFh2lCXof?!C^GPJ$)nS?I?xN)d;#L9&!DPkc($m4!sH6Lt zq>+wlZ2m2C*NF?x#OKLs^XK*Abot_}R)Johk<&8#i1|L|qLCl-9L6m^c}A{rLHK#fJsxr-uxNeK|p1tUwoxgn(hY!G5PS=m}WacM2l+4 zr=U9}^+{jh2HV_=cgto53+)YV)6u?>W3PMcn=L-yKAUjj`l=Y`S0-oxRwS2 
zC;Odwq7c+gA2T}3%5JAACYv+FE{B_3HKqiHBDW=3v=?9Ay7Yd?tXu4x~6th@$8RPDtN2e?qqAJ#18JaTi~9p8Q5v=(t@1T|8s zk3ZyL+vsE`56OJBs=TpFj@Re3w`a06uZWTzg_|hlOYTJ)l)NY{Eqzqor%6l4$!emN zJY6br@2v@Uhwa0Ybn;GDJw+$yGcz;866US0_1EgZ7BXHGATFry&LSj-n`Agi62vw2 z7~Sg<5jEsueWP*hNO7(@^YSZq<6HZ;+N*w_U3VO<`W4~NcQ(^f31D60OW%SOHPz1B ze=1Jkl4z!uTxTY{@-46n0-qV@$HvFIn1ZW|!@TOAnbg)S6dC0@Di^3Pp{|c#HeQq> z&eX{HR(Mrf{SJPXMxR5c60p_dp-EG~nVI+lR0htWr%ye+bDzp_&_9gJyt6H7_9=^2 z$&LBx(Y(O=*vm$pH+^`x&6vAp%;EhaYPnAxBM2WNitNBWUFP6+Z?C5y|_Z7yX+P>iAnGwFVGD%S9&Rn&@Fr?K^Sk|L#<5`yNKiTMS`cqp*m zxV{u?vlr6Rit1`^o-!iNoJsqH%)lRZX|+woFR%A})INP9VW!Sj==6~T^Nx{*dmTAn zwAS2`NX_oQ0&wTzlt0nkcE4!xffxRP(&z^~X?p$X?C8`uxuF>P;hgtzswn@5Rbh>! zfz^GH7ZjrMox7F^4VK2~BdPg<-CRHu@QlGNoc^445A;^%T{XZNsEM@_Ze>Ph#H9sFilU?L*K#Fvsj z;E+er`uvUn__?lX2=K9dCg?1$`{=P47|2^sc6l z&nVT;qz7fWsolIkmE}e~-31syP27Q~p@==@c-xB#QB#B*LmwX2c`aTj9ha)}ZI%xh zIe+D1N1KtkCvxV;V`msh@*+seS)IjOKEO58U!t4Sv)u?eKTr25))ZxnGubb^Ri!z> z^dWJ57GO**HDyS(dS;O#bapUg!q|sxI``8pi5S@sL=D+FG@t70pG8KLvn5T9jp+9; zalh1f&eV}Qaj9Q5c5zNU)4@f-%-Sv7{+{_^Y`H2WX?}+zu_Z-@aA3<>>z?H5|S~QO}dy(y~Wv*&&4AKS!VFceg33FPahyxk}`*Q@p zu#&jz&L5n(FpkdU0I$46-0T@K{K$GxMw;UOFU$||25hZ;U*yLxvYr8L&;C9P`+3;t zSCb@_v(YO7UZu$di1n=!jORVLd3= zGF+JWOIml840zebahev81#S(Le&eB#+5zBYpXOtG-&ygp1VC#S*_pr4(*=9#*tb}l z=(lF+JNfwfn)-xQ+bTRjiOY{4GTeK^`&` zL|9$M@pmrzRD`@@_Uk2TDT*cactblyTsJ2HQ;|d zwlqHVrE7_a&xBH%dh3%zEk!D#0!pchZXjTa{7}Pnq8S*CGRuO6E0r8M)*<8bT~dQb z+rDSN+xHMO9cOg& zrPKnI96BcxRypI+etE*>X3sP3(hBtJCm6#jZhRxvpOl#5gh{ji?iWn@fnMR1p53{e zXP$Dt3PA7OySEwyB$bDGtqX&4JEoZl?tEppjZj4yi}vbY2I}8HgX^jBu*gTH(E4SP zVsd`YkNf-R>(0<22+F@RFr(hUwz98=WcPyLV*kg83IW>9#9^C2H$)L6LGe#00zuvZ zjPcn+m*(#X5=X#&X5U$xs4&{dr8c%J9qNCUW7UKSbwONb$MN(1AO|qO&L5v~7>kE> zJa2JyBhKvDpD;ocuzaBWd+dY;JWHy7eyOj5HJ*|jR2b}~c%mk-?d2!i zIGuh%P+}RhrzUQF^eq(>zwA=g^35t=6(}D1lzzCsP1mQD!@JRh3OwcERfl@S-*&s| zvJPyi%%(VZplZ&;FCb5B zL!X||dFPlo&ce96m*FUUq^muLnvpV@ih$lg<$9)aIGI?0OUe6U0kuQQk+0}er_f@( z-oR5ooys#ZVe+z9B*Ka^JCD^oG#-&^X_7^7CQd*|bbw2RVak%&0r-@08A@&)A$}>N 
zqS97`B>Sxekh36ztNUzH!(X>Nnf$4$B-eiLl|m&6hG2^=scNPf-?8_LagoDG^Nkun z*SW*8;7ht(ryG5knfy~jg!R&G@njQOoUIwHJQJwAT*rGZur+-8lb2R6}XwSU4 zA_+I#K}qL6tCgIgAKaI%%k<6lmnWUO1Oq}%@5fvwG!V$9GJv^tg4&#IhWBXQQ(m@D zH!DJxZDC}5FVfS^XF#nw?B=kA)jhWUTwf2~AUU13qsh%lr8?!Fmh4o(P?57SPFm)( zRqPv4y=5=?(nvVQlg}wv_tvPQ2&2szV#-c4)Y1G%CQ<8RFXv>SITm_)dSNJ4r4;2) z#WtuG3E}WS4M3=eppxlsT>Ih{ITHi^(F9=Zr(EcGPj?r5Zt#=6i6VFFoLN+uE;wc- z9e+DT?xQUtn=)hCM!9^pB7~5~=gKi%kxo#=p{9*pHpx@WMk33(%umFgGETkfI$l$V zWIQ?!WaNVf(lR+AmsasfAHO=I%?V-`jw{j0&E9Fq_CsQyF7{c@x~W~$Vw_4-4V0&- zGo6~?C!Ubwo$S!|ne|tcQYR!|e&j`9nPMmiFV2^HsjlDElhd+SbW)Ny6VdptjFo+v z5%*4?9~%4~-xjX)dtl<*Z=CSJts&B%H5RFSOnl4aH@gxd03;%xZ;J$?=0UdYWngq8)wHZvzqYB$e)L`itbMU^)?zU7rv(!+ z56!{y7X&spt=hip>3)9EtaV;y|2~7Uktca?2n5W>YX#+abEIdcJwL9$)GjjNv~Cg^=7ee9NGw|}y^%;d*)AjsuTj5| zk&UV|iej$_@7CpXt&LddY<=N!hS{)dhUE(CJl_Y=y!M3A&(n4Wexki9HO`5sJF%zQ zQb7X2VY=|`y|>-a&`{?1lwq8&4YymcM!MHo-F}Ai69O40gR3deUAW-vUml(*0_sjq zF)p4pRXXKgL9m^H9oQZ{@FW+(!D5#(_2$$=X6E4Pb_Tz6yK63Ls`l4J=gL`+-3lix z-@-l`Runc-Q`l5KjKbrW>dV$O7}Kgvj|JB4x2f5`;y|-6(xir6J|c!kKYwoG`bfvj zqJTl3`+cBsOUgu817jCd&XN-P#10OmDtn`X`x(NnFy;n%wHT^PgofrnsEU+!R$0P= z*Vv??4Zb_RX6rXkvtjU~fWhjUjdFvm`PrA5aqlc1jevi%k$kB|{Au^CsS5}v5W>tk zCHJG6co!EJ@2N8z7M;F2qf^woSkS_DBf=}?WGtxBY47 zD_$pIhani_T*G6W8svT1k|{!!81+SX$xd7#3pO1)rS1!wD&vK= zqpD8h@a6MQy>nT<{7D3nMAT zK9#bgld}c7#Cq`nJ0iUv7kUU74Mog#wtBc#GdHW2iirw@$9(l4juJDYkNx-)Wn$O? 
zf}OMMZfvQVAlrZg!#RV^W4YMI!d857X*t9d1sptnkl%(+?iFz9_Lz zo$t-)+06I*4&SA*IB-VhM~Xl$DQ8j20~ZdLo5^<^RgICw%GqRC7cJD!~ES7mqwT|nTekyl(%EozyQ zB360wotK*(_l^u#(Q=MM1u`8c{i><;RiKZ0{WB-k-Q3ywAB&^7tRlwe%?Xc6IReQmYvJ0lbrpMV_upS~Ba_C%; zdpRlF_Jy%S7tTT!5SG+DjFJ!+55Ln2BBgnS(&nyq#Xw{`G*v^!@#$aW$rUs}ZafUr<@%_o**rAY#&AHNE{cRKf-kQ2$$v3aY#o(94VY8Tg{YCxxQ@Kr)GEpWtMIfbh^7Qmv`t&fiwh5%#A3Ze=YvSC0TCLx{KfU|7F;4A^ zjAzw!Ihbd4kqY6wx}az1ay3T=1Kj|qhp+`d;GHOL)Yp>d zi3f99oDb>R71)@xbFIh`Y!wUjx1=3F1*^B+tSFGGBILEm=SF!+9y`TjT%UlC_kV$X zfI%%}kr<1ZM@F>n))!Q~vp{J+3Jr!T?7A0b(aS=j*p+j(A){@EQDp7I5ng0Xys82v z)8%`L*wIvU4aUnSV>=Jd1ga7@pU=w3ckMRYD3O0alVQub!sB_>w7Nwv;wQ{xA*R+ zt7IEA&Q_t`G^jV_G|fyW znZ%Zg7@BapCRu!r@%qS07bmMDo{DF!Ct`_5Oj++!U1w~J$PvW8_#Nv$(d5s8X*JQ_ za`OJtB)ONL$Mo9Isk67rQoME~Ce|2kz~>CECILf%cXN37CW_fjW#nB1;L2!jxDWnA z9%@F7DfO=92Hulb)e)o!wBMO0lKFfr=MA)1-Hs`POz@1iOmks;$tV4*aZQyE2Bn$d zD0ruyPo{=!qie6W;fL4b^D;X9T0Av6MdE~q0=Rt^gK1S!30_|K#Pn1uU3P1otAE6j<1R--2JKFtLk616Z z4$QSZx6_0&At~iXb6wfBCWR z^EtPrg{53$WLp`${Qmen89SP<=Kw3Z@(vAme)s-ouVjrEg^3FU(nJNKkRl=?dB&~f zl|TBb$xORS7UM~68Yl!u_g93V)bk&cvWCl6+KTW$4;a(Cg#r?(9pjRhU}IHO2eT4x zD*<)^!)k{TkaAoE^AKHcx|n-R;5AA6THqqZN<;zp*#eecRb-Bij&;iUW*r2<&mz0j~6sB4|-9HuR@0?|MVsXw;180tKA9qvYh5=O#Qa zUtx?BxFVXsQ)iXI)_=7!v*&&*Q6J?^#JP4;7Ep#cZ_PV5H)jimH1bf!Bl9p~Q?`oF zW_7;SYSyYa`&P2!wJR<@=@W}sGfFNE~s&~dh49`yf*VDDpRNA)I$P_u8HbC?^!xaN0?{Sm1MysAi7-V z(4Y1rxv`nPrODuF`xK4sgV(s7NY!ARY7Am|QrZTC^QCcEpNcafYS>4o;fMBRqqZa; z1W|#$R>FLxuYNxn_Qgjnv-SE(epG>nYI2>zGRI6(Z!1WeN*#FU7@2SQ@_N?ph#F9A z#1vUogjv(sE7YGl#O^0sDYswlV2CJE;V63g00}DF0}hdfckKeHnvI&fGIT^*BU5l3 zMn8Y93zkk9Ux`Q3S3{mc7Sq?G1V^#ng!W6-MM| z*6;4QvIEb~=2BL+m323Z2EkzZ{fK=RAHfD2Ro$e3qX?NfTmhSZzB`n!gXF+HR|J~M zwV#^`zj&Zo*bx-Y2pju}0_^M~?chXMc_Ny+-;1ZoM=#N3rB~-@$Kw-5MRIces{a4O zzF~k+jO^!X^o8~qvj)({q{QPg`||UvwW*5n;sN5DNq4*SXp19Ic0V8gbn;=#6NyB} z1kD-BDMbP?0R4?CC0$=;BCDLctpsOdD#i4@W!HSSxf0I#&N&?Dx{x+|2N=5XGvtjL`R@CGW=c}K%;vKR^} z)o@t0LTpW_d`%=0u}hwL1fX#Ql+vlbwG|$d_krZ20IJ!^ac_8`wo^-5f`VCWCm_td 
z139$?$&x%Hgo$rUiGy-z&K=gBQHNIbMw4u%Y>`s3`e%GIWWFD_!cCn%xf-ixs)pt| zWER7}MJSs_111ESpRaGY2O>GD-RtT6C%Zmneq|S*IjD*Douvk z=Qh(~L9j)6 zCn9Cjls|yj+B;xI1>sgDUu`5*Ux9#gACB*5E`u;na4}TB`tZAdrtS9V^q$rxKJWSg z+QI_|(e{3|+gu&gpXhPd;@cS6(#Eb!C*8MKqzPq6mGY*KExH}vw|SqmoI7O~8N!|G zlu(5*kRA%B1PCtOndL95OUFZk5?=`i7aTZ9H z{z-*~SfvUgMH=MRt_8yF{)hnS6>ft1+_c`fRy4j1zjb;rPkki+JrxQD2^8$qpK6ZM z28utYpi#+vvA=1zs4v66X)1>k;dTSyuy(Xbmle%+6iH=Ienh~W(u#7dSBZRS+{bI5 z#p%w^=vR7Gj{z6+FdU8b$u2IiLUQryS5C%U0WQt;>a}a#i7g-`G_LG1yBq9YGQ6Js z;#dzyLoD)QYp2O_g2IF1&s{}r@s8oDj}|oZVQD>;ri0_}=t!>P&z9b$<7sSLE$W4$t5PoV(d@H}OG{&s4z&Px0EDQ$1sCmn*a%Z#s#sa5)fa(b zi;grl{h$n)D~DsaA~dG5=Akh~40kAbCd0f7Ps7ReCK{NSCXa!cb9O+r9pQo4ROp(I+JP zk&P`!^{7)YCNs1Ng8;*TnUh7ppV|6$OsvnNE-* zhEN$!zWvIUUwNfYlT8R-dOb%Vu zn?^Yf8pbH)s25J(g3E*0n{}{GxyvSjd-_q2tUvgfE&<4HSF9rS|z??g&SJ`4sT?KEI)G6HCFmKC*#-cp z5;uJjWNrMU)j45!5rXBxxnT9Q$8FTbURmO67?=p?PKk6OBXt+nw@#nXZGHi$M=@Q_ zCL!F@cs33oE_+5YT+WizKA)13@{R}|(bHy@t=5mBw~b1u#2SL)jD3PWrs< z1|=Suu5s;VGfQL|2(M$I1fG}Fvt3}9c%>K4yjuWC2;e3`KOj(5#*uPq1J%59B3~*Z z>$dCbL%0LRuPhX;x2X>DDl@%`SSfSKpf*@QhEa<}DklM3G2~n-8>B*2&B9&+#kQV4 zH7%IY7v<2E({unm;q@n41rTlW0D&`W8TOLd55L6Y@qznm#*V!k?Oeib)N5}k z!LlkQ{mm=OZiw@S1c}?gR7K>HOnfkRkoMdFH;ad369uFT&``?zZk)*|_Q>$pdIr6i zj|#7kSxCAcT+ZL3;Xdxk!{d3mh1bzsW4$(O)eY_Mlw`}6&`5+)w&pU#o{W@M^y83W zLu^Xcs*!C~-=;gnR{!mpoUFxJ6`t}2Bc!UFtTG|#EHOx+9L|lekGCQArUVoHH7sHH zGc{d(3NUZGrh9h@(;^q`*Gs4k*j0^=bJSW5@e_l=Vaf52!RQjMunh zzoYFxW1u70i_dZTULsoE?@#U7RNsrsIz079m1CEc^s)(v@XFp!$o=(Ks);?ysTt0A9`iH z+0q;Q;#$Ikv8Q>_#8d|1lBN+luZunW{HvTkeNe?Aq`8c+k5g{OW>!=zn`vo?eSwOb zjt_VPIFpb9r?(^5&KOuasK)~;2I)pv%RqubY&jQTjTLA`@bN9i$w)g<8ze@n8h_d- zYXb$?46v)`Kh7b7Laq~X=!SIAsfIMi8}n#2BA$Xurvp2wyU?ppq_ba?T-q>{cOma57a%?% zle}N(5j2DGC9*?e0DEO0)gxOUJz!6Y#ZGi~@R%D{Yp8W7o$4G^a6vPbS+jlnHhVNC zwhA8EY!S@xcS^j#r-z!K8~ zG@LYf5kDdap%wx6X;!tVK{r#|?UYgqs#xh&rrcQ!M!;COLB z!tq9~S5u=C>8`Z^9cw~dYiyk#axXlHvPvrNDL zdzEymQzAl}Ql_)JW$J5gm#JyC?ghyBb($FKU6=vFNjt}Bc1A{;j<4ZajB5M(j#TUh 
zlk6m@P4qL0$MUIl0Ss$VZ)55O_m8yg71@@~IlIx|8Ye!3zJ=nN<=TgF7VBdJQqp6j zzWNJWvV~?%w$=4^TNN7o&FLrfWz7mC&{*C0i$}>8*Tx%3Z#mrM7+>Ftb(VU2R*lL) zC*r*!a$v}Ap$Ytp2BNWUB?R@l>zU6ZH`z%ODEI;G18d6Gxl&|&d|Y=$c)1s0F7+tF zqfarm-9xch47#Ec(_ueUVRP+o@k=wsK}!7L13Jrfr^IDMilH)ZtbWNMy-|fp_o<+ zr|5SEKY~$5B9nk78UWwwu2_BtG`KXxcJT|0L?j{Dsk-ig@{-!APByZf+IlJ!dkmIo z_BacKk|}sJ{FH5h?%a5a|PrVfMe@9ZD+{Z8=iLT~c{FALf?jzYUXKYb_9MN3F3($*migHyD~9 zB;Djeo_<1VN7h1ob{pP$@~2$98A02RY)0ZlObgl(ZSd1LwmN)4uZH2<8`of1JDx*vgJ{PXG-PQeN5BLM=K)eLk@iagFZ~$D8r=@5~~V zxh|&8pEAATN8;3XQ%+k?Xy{#U7M6P=h3`l0Y}lSXK)Tc1XZMT@5U6wmj@!H-i$($f zOL?w#n2N`wspy>EmWNhG`J;+_hA5-;*TrMEr7!NpkEw%h8rLx(NyoCbv|^8R|B<+r zAyIyU`pegp zLxVZ;!+>|35}wyYkgP$tU|3(S-KN%U=F{hrTBeHp7hq-9A1f5++UCDjy#J;2@oVY% zSA6?W#v4n9hPnfWgB1>-ZhS1$lmw^#=J$?ITZa@H9Ah=BU5W`Y3>O2}S909OJz05# zd=lrh<;|~!`%Wv%x$~z81uuSEi!D;T=p(7G+m1RjD*&SiQv<6ZBDl2MQK3Nl;l;(z z1%-V8&v(+UbXjl8s%&=BiwY#NdstA-BE zrA&&YKlAES4Q*{TaYLcWfPvE=yHQs(&y+|R76VqaLf^brgOKC0mQk?m=z816{w7}= zxx(*uhssy%x@J<}=5t7RIT;11p~iBX!~JX}(y*3@K1N8pqH9fsZ*f5lak?{U7zjVJ z{Rc=sr$&A;gbeX>AVxr1Omy|Os5DEl=&XqOTMW_9&SQ-ksha66SBv}T<#tv)@=(kcGrduD!y$9HvT<58L z4_89}W)qYEAkm71Vjt`yE1npu6Ml=9r)WwbsWkpGlEaJ7DT0|FBxd-(q=&d87RYYV zkuI9-mEz^_U0AzDx}j@SlMT##m@`4gRNFvDdH#%D_N0hl7nNP+%uug>Hyo{?WZUlSl}dtS82;=XuIy7S(o>JL&0P8S9Q zvHgi2cVWl#4oTPQxha&h2q454_4WHVwS4NU;^~MfVTwV4%9eJcBnwTGh~PE zxP7lXLHvvZLMKb3DP~m}P(wf4NNBz;(2m*-ziCBAx;XjODLqm!eQlUX*P39q^Ly=i zG(6L~$0;|YHZ-0fWo;Y6ZKMl%Ex4)w3%`ZF1Kk{?@zDR&(VK+0UMkon9w_`kUq&$9 z#!DqColbR{SgKtXF%3dGH=={z2v!z!Em8ig5hx9i4Y$7g=mNOQ?Q6nZCL&DK203!- z%VEWJ?>RV8j3K09rChK)Jrym=Di@q|JDJZ?M0RP|*?{#C)c1j?*qonfSOKYY>+}u{ zDeVV@U%GZ9f{VawLeueCOU$PT!DH#Iyt~`sjPfqP+b_oK7(8YUTthctXw3;`?KErP z6snc?nEG)Si$@Wj9pY3D0Hqio(B8mcp(T`*>%?9t5jrb~PRSpzGG+SYn+r7g-eMBD zI8Q>w-vJlE*hS_{ zJ@pH#)F(m!`CfD)k8kVQ;>&f#7!iflSQlg1mH}4C*%H+9x~cwz=5uBZP$AxnCQ)wT z(yp+Np%rpuZ$_emKw%uxj-m)?f*6(TEwVzi6Or1tSfh5&l~VaBS`+IwBan-rDL4r* zlUHGVA%pF{#dPWq;*id{lwsAhx#O1w1vlRn(Vd-{pVxfSj-BA|;p9Qm6 
zR&ZUq9EHO{;Lwe=K=}eRXWSNPcU3(tbFJ)AcDi1>(-{LI#n5fv0!X6cF-}wtk)*We}HT6@qkxYpm({9W$5%OEN;)i2REpMwJOFFUa86ur ziFBYN!-QAs(%OI)umv<3VpMl4d57(wUSpbTm3#r%$5dHUf&g`9{E{h1TF+IS2o8v?4rffI`x`GTso@d}O;D#K8h6Oj9V>j@CQ zcCttox_rxVEeOQA>`E981T*(C=Pvq^S-BmHf=NDZpD@y`<=-$j=y7{j6*}I0DY65Lx0yoe-u+XS# zzG8fx15BAN1M_5ab#=8o&8$XGpgW&3pq_gdI0d5M(RRb>`0j>9;daDUrLRl4Y2W&n#~^k5S^y=;Rv+S-RFMAmrO1Y?&59wlGS>O_ zX&Dj3wm8STQX!sYMQOg|Qf7ndFZFc2#}R3Y8P!e|3nqdM?7B5>ookE37T20iq#M)} zU71UYFsFxI49%{!8XvDSYjN}*p@#rgEVp!#wzMNkI9FGCO+oYah;yZ?k?89?tWjEK z081jqyhnJVB05Q6X$Yj(iA4Z!>I`6tRL(6*!csoj=+SpaP9)TS;jzq%orGKfemntt zbpKU;`24RhtP@z^BiYbXhm*~}>=Ob22}IuTf$x6=4Z?C8*HytMd^YH-X+Qfih9YaQ z2o%lI_Iv-xw*WU`rM2>tX+YC00E*KufwCwXtj9CE!Vs9sk%%1a30zI@8F-7>LYK3o z($0ZsvjbHnL|ZZZqOagY?1lkIoD9Ps9%oB4PrmpCe-TiVU)+4r4FQ;UaiV{8dN5!u zHF(H+x>3DlJAG2=oA9!XY;J&!gRV;M$BH6qI^+OWQx3|~0MSgXYaM|P%fBnL4wz|s z6Duy)pMJp3CjzjP0p9Tlvk-1B3)}KnF=qRq_I2v-EbXW^t|&R;5V}(PS!G^6zBM7NJz zE65)JG?Cp6e9noWr=nsW*m#K&g`#-@ALn7%R{R>^Mf8)F;sqUsc>hQh#eb`s*WCd; z*VFNVCb!Klp(?%TA6PYu15~A*$4=||=L-f1l~)m0h+aA=7D^<_5|eI#0z*baf|^`4 z>#G(Do?0lbmsPLeI<5Nrh*QvH!5P}Mvjzy_LM;H;OGwvV$L~z*DN0Dwu>|aoii;wq zqsF4L(gDX4v^dv4gRXejf-KL1d=~9fjk>lO{%%omt^xNwA%Y{lDPyZMc-gpsE;!EK zZ~TBZ-Pg1bAryhm;%X9+yqgtgnx}02uHE_^0Lnd-PF3%yP$93zG61}&W!}NK$jnnn zh#GJh?w{>cm%NXt>fv3n`^Ua6Zc|F zw49p$z>;Pe|G{CXT?XhMQFcr-yz_u7wAskwSOnJ_X5^833i??p8+M(VHOyb`v2iEy zYE{k+w(yg7#;vXgc+N)0_SVqSem?wvMG4XQLY+POI_<95gX+tWN}Zzua4!<^*~>&myDp&QrLgtM-hFf}XhxSl zRqgS3;~k&p=#ofN8n;nELP1eqZn3V0(u$~~@x#`Apv^T$KaXJDnvgU>55V^IypIWJ zsi+2WHL{161OjN6FJ^K;_(55>KUywrD`Fb-0`Av5ObB?A&R|M}r**G^y9YM0X1pOJ zx;t&)MW44`@SJG)N;9geR)BpI1*p>@_f^ko%u@zj@1q|*1NcRGxfW?XTLCi`HdNqT zlW~W|p<=!~!8E_VO%{mldZePHeE|2x68X`0<+OXtT&{ym!Q99q7tkrBn@F-S2D%q| z9RPz8kofvRmGWa53gne(=zQ#H)*v^ihc4N@=4=Obary7UP!*J@vYNCGD2El;m8Tu$kq-`d z&_&Zf%}&hc>&orgay(oAn$Ujd z(lGovU4N%JqURucN+N!jefyw_@O03xy5t92@Y)%?bAoN|-rUn-XB$gKq1mfLMO*BU zcFWX4wc*7<&ZhMpuuqsxRuVB*QRGsD6*f-Nhzp zk?x6Sd1X)*Pr>mHdeI&FqexCj!RVFAPzA*HXL{Tvv(<_y<&ub=<;gkK%-&Ic^MQ#| 
zitc5P`ZnW=J!>M@_`6cw-R^>{b_bo+PE+(28VY`@F7(;5aAryCQ6|0O7uRm?{N=M3 zC6xiEO&mDlu8k~geb}KQ*vb}h#z-_*<>NP(nC_&K?;=B@2BE}&^{c-2x>!_8+ESrx zS`H+a&(6AnJ~U}*CgZr$zT?Y+kj+s4ZG8~jnK6ml0jCCFS?oD|#Cg+%$4N20W)&Ep zsnbTQK+&-zm`NX7s?-!rrlX6F9dUyH&? z^rF^%@6*e7r=9pJEQh}^#k9Cr0CGuw=NYoxLU=p?%|Qy7jhFHwUbwxoDl>E5mC)Wa zf2)R(^93`+iWuy#oMxM#1u)2;)3l=F5eA@VXsD$~g>ZYCIV{e;1~BaF$Yag44Y9_c z{XX3;@qG~Ga#we^fi4a5JwoULBmCM4W>9)>~GU$oHk>1iVyTm)+-wni;qa95=3>Ld(?OG=N@1NBc04Z>QM%L0KSspqK zv#oOEd2Y%?i(uJ>j!L(UuQJ1>R;5XF5pSDK<_je1YsosLAO(;7*REI6FA9TWQY6u5 z!u<68|Az|E+1j$eP!}GLjJ#v}Uk7j^UnM`^8wvndv!N$>qfXrA6^W*dKV}# zj&{VXfliu~wIMM^d8eGL`NYsKEzumq2Il7Gyrkq8;~&QnHxEVK7wj8tzOpb55+NZu zX~#?-9jDTi(?&Ju9;Ul@!A=^ZY*y?!vaPMH9YjOuf4PDQbbpV1n&^1S9H_`1(^6}8 zR{?#J4q2}8v?o}U=~w3~wA4Tkkx>EX14PwjTrd3?!H5tR0O;SzRH5k-jEy62!g8ZPMmsw#pF^{ zO3NUHQ4LpMYVE1c@mOaL(ot%e&UWA*eT_us^z$>w!WMCzt+Utc&b%}cq8Ntm5CYem z0-xv6Rt9Y-r^C4ThpM|oO9N(#=y5GuE`O;dGD_=V3Yf{l%P27)uc>ZNW3%7k@mq8} zv!&Dd`3#S4XtIvcC9OjlJrp0^{({9%s(A=8{>#hc9dMmlt*aOLL8sw2{0k10%>`$S zJj}A6kM!zOh2AUAx!u@T$-L^ESZV=y@XuRB0oq!==Ci}~K4evRnHn@v>I$xg(2e)^ zBSH7}j!64(W?w??xJQ7lbTHBxupa=;uMbo;Dic*5SnUUgB11bh_Tv4uu1n`;DcPeN z8B`TP#+pf}S&}?`k@SifK|Pzs%}fqC=Cnj{X)>WZ(QzATg(5FvrmzL<-8VzIr_0!R zHM#phcvqJ}UhF>;4jPNUG&MDq7|r+AHyWS1l|;w@0uP#uRGR*Kfafw;0MZo^P3q8j za2`P`==)4WZa~6~r>l0ZU7S64|i zX*u5vAUd7i#Gjz>Cg_3kzxhx=b5oy!w)do`|& zv+`IK=XRSGSLma6TAG>g;cq4A-h)+u`&B7Qn?LjnGkCL1(wSkJplgwrXIKKVBCd9n z&UuKEkrB;=hg0+u8;*r_0Z4≈#<;_cc?dx;P#B>BB)jh}J$G+O=qkJ< z9)ShYOh_tMW5}s}sx$9dCd&OBONvj{H=b%YEpIge$7)fod?N|%x>YFA9Ol9ii%t@qNlCd(N2gLV_GuJgB9 z(rUs}Xpr~hx9=2d?5vo(v~AVwX+Z?vU}MEOt3fuCxu+|6Y&@Y%nDi5zdO{nQP?sPY zfWN9If>~)M@~z?xstPpCql*W09C=Ba!(a2>)B|p1lEH}%o}xOATc;Cla>iIViTy&P zh_yrSl;xVCgh#{dtBFNte{un!wBS1mewU{Amla%*G;dTSc4K_Og)J&@Fi+omF}S~A z>3qT`Xw>{2xxr9twT9L(^tlb%v}x(KNeLR2V}|qCkfHPS=W`wwjw=EUY~vSGPh8USWa827?Co{nkc^}5fu088 zy*M@~N5QA~a22E*jmom(Qlv^+^iIpDfs@B-`EKL(A0|a7M*z~m{O3x);-hEVQG&rK 
zA<~W_Li74d7cb^Pf9^hlSL@by$l^BU9$|@fQrTHGn%}43ykk~v44%k?O2Heh1VFP9nIog$m6qoemsX-YM8Q65~6lW!Z3RMwos43`%-gWKmdweC@j?@`|w4C^!$e!g1-ce+=&70%^KDD4WlQ zHwYh&mHu>k2o?}MP4_G|82t#JcsRj1SuL>}Ps@CMo*32kT{8dSHb~saz_}8y{e?sP z&$S>t0kFaZvAtHVHFgYw8%);3k|rm8$Nx5Xy93}Yla4n*FAFu#G8aUNDOF_Ntis#h z160bEhxJ2XC%{1A<*YZCA8xup1Y`Uu3?1~QLAyt`v^@^bUez7e(WGMRhQs>M<3z7+ z%B4$dxc}_>hg|&6mvuz|t>It%97}S5>OeEVg5_dJHJc;>=wBvi{!wHCXOF?rX9Lj6 zN|&_U9m)Sd)$K=gmDW}&{{~v6iwt!d^NiA$_4dfZ45cq$mv$w&m9{&6)rjahGwZNt zzzhIa#)k|xU|5I6J){Z5y;dUzqdolYpLq%`E8zY4jR~;R91O4{1srV!4p89#6wL+k z)~Cu@?pV|xB#Dgxw4_|JPCPu%{$;}Y6>YmyIPt!JLKep`ESSL!X~s^o;~m`arzC#6 z7T$(FL;?JZ_{LmLqpbd*=Oz{beL4 zNP%TLOWVt0d_W(UM*r(sVn6SHpAMMn5K>2v@jpx&f=&}VTi>bv_$Dp=%^$RWvgir~ z6;ID`%{Qljxu-r;el0ESBS2sz%7*JD4?j%g{Kv}v52x&biKfN}WdEF~crrNL@;URF zP|er=1it!ZH782E7$x=E@X?1XgwaIM3%3s!0%I3L<{$f0aA2(tT?asH zIxsEJ{mpQS&@E<*j@R?O^-|VoA0`H?{$+ z;e%rx!Wb|XX4^(-Rf*u`49Srd{{CZ8dBYswsKpQHsHmL5OZ%|<|5-hTx1mqMv|A!q zJ^Ii!aU68-lmtJ%ywAZ3L!3Ges1dPF-|r3UPq%}f-3InS){^6Jbrz_AzyJ?8CoayP zg3)}%t|7MDnDH3jUyV#q3(Sb977qK5^Ai?Xz+!hDqs%&aAU|k4aBoSBsoi(i9{6!j zs~0-&4bWf;YJ&|ah8xApvLygc=QGY&CjUE)weS|&ZXT%RGaW5(BlPg7Yq4NIoQhr> z-u-M49jVyRTjwHzd?iIL%JZaha99?Cb zdVjk{Mq>Kr_S|y9#YZ=97cfQ#9K*n+fB4HEL=>1(xu7Lq&@XTQde!q&U^8JgnU*+q z9Fs)+7+hM9?Xz8*Y7?@7l{BqKUMQhW7vg&iBZ6r{pp7R|+36c>Fb$M8xt%kOWqyAk z`p)oU5+X~-{}{`mhe9!rd$~x78(v304?J{+HP3Z@sqfRL)c&;2Mt~hqUBx7c6^5&3 zuHq;+_d93~PEQ-nkGU>uCsp+pLVkQ}|EoX#z{MXXpIbke!2T@${6pmhWWkVp3Giq4 zhw|;zlM;)q(KFy!ih*@M7Z5^@^wB{dg>aR$a*=7d}JV>oYL94f&jT57P-U9(oMi zZyb8VYgYISIWv66mPc!xXZ`1(E66`2iG2+j_C*7o#0Z@f)?{(8E45%v(;1ny z-!*Mu890Y=n*kVACOE!gK7U|*U3hu?#(Q7bPYwO=Za^)K7$AWAzA`9ygk2FRftjzE zQ&azt&TFhTVHEgLk_Qv;9F@{fuU+=Ea8vT6kHNp36bMD>Mk?2KS&+-U+RN;o-ERDn z6FErwKOr92CHlWEcpz|QM4QIHD_NWljBZo>IsQ)_ODFf5?)XH0O&i5Vx$kWQ^P0!G z#NbW;P<$Mt#A2_UjDP6}|T$nD6K0$q>KRV+nC6q>4f)w!W3ERog|U%5C|kd;w9ylB;yK~j13&jpiy z4ao4@Gjg^A>-qBnw1;3pdkh)B_sRzXixq*(43`k&F+^9GFLnc}^S!l_wosB-&f|?X z9@;Dp3Z*I_q1#_%>1M`R6n`3TtQc?zD5vQKsy{#bpH8mQ1DnEZx&i-TcTvEms$^Jt 
zv+s@(CL$AX7%*^y)xhGidql$TsQ$hJ>74x~C;YEUoddGXd@fD%7rFm$0Ldx9&VBVR zUptTi9DZ`3Wak0HJANM$raJ^xpPa|waK&alT>1{!L{Yb*&{|X2d173K(A=4AP-OIY z^`*}Ke?$MroJgq3!EM7Lc))1?RkH|haE-Ho;o5-{(mi9V{m^lHQX=dTE#hB zaxA)w6-!RYV6R08G#|aF6i@hO^aOaY z(FDmL#u1m^NCb-bAVjA6x2^lvVGzgzua|d^8ZavVkl9aVOlyE^wCKd*kI>fs*Wp*l zfXa+<+8qD3pZvJL-eoX@tCI-!BgA_!#<~l@S;arPtw(sorBFMxK$fl#k{ltB1FO5q zB7XC6%TFiz*LxC3fT@z)q&{_oI1c6!dlT3knG2L1|A_&{6plg6mOO7@V;hZ+NwO)u09avq_w;HzK;_n*aqqj7$Y;$L3HqGRd^ z>yuf>|K$R-42%@P?GUsqG?G89`d{zL3f7{{m4E#hd;n)aM<^JqQb&m6$5#DYIQZ-0 z`yrD$d$i=1PzJ02CAo*`c{j8| z3Gd0)EVnO=VPjlo@`gnh^MR8YONVb)u9@%}j~;qHq7WSuiMp?a9+7rPq1WoQ8DbKv>(+%R^0 zX}1Wu*T|GFL<-bwy+Qn(VciR~%!|nMMSi~XmuW)D&|dx%)ga~%fg}9uk|pnZ<0cWZ zT3;!C(!HXuF$8vQLnnPfVVFw_yz@#%s8@r z=^u&y`K0S$O1_vac?Z)b5ZBXsQ@n`Thnq<_F+mN8?f-luT~V(R{^oTaJ;R$tHihAUd>!=d;8o(a5+xpNxvMmK=c^j3e_L2oMdGE_dbx65 zm?v=D&jcT2z<^2x)RTj{TYQvy7{Of9KPSU30`=r6z;9s>;TBq;^1$z+{<=Ww9^pm$ zU{v}XZ}EC~2j^Lyl;M^A+uCt}#hLRssr3jzp)L>1g-|})hE!aS0&^3ulsCJlPw8dI zoOz*j<-mzw9|WF@iJ=m8Jk#r;5$=;?^y>$}0el(^^l5zOiNoUkN6-haWf%S1{D zHe6!3KF$bk>8cI0{a;qXv~;p1mN@HP>jxjb`2Bfc0q3T5@{sC&zqj~5foSlpWOjOE z)d%N4^EI5fYYM&ue%$0WtL}}re3`GUZoD1Z)Rr3b^WUNk@<3Z!cXLDc{f_qWscY3r zuf-l~Jloj6_f-bH-BP7NPpi*+HHuNSx1OD}+jns0=ycZc+su(W#U0>1Cr#-=bx-yf z*Y_2h-THDjTt~sS$10GA;_h&L>{?&iku%r)1BpHRYBh_b(R*jC7~F%^omkGK=S}A= zAAt?44vA5c28m;E=hN4BUHSLM(?57#`#SE3dT@S#2Pg%q17tF}YV3G~RAO&|Yj|Ap z%+2O7ze-f;efi+WzCrT2hn5llD*c1K9Kt~TCvOTQ{F=U#d`wo1v-wA=r{Ma&Y<}d4 zXYH@sQg{$qBfcmhv`xvkBc*Ujw>`gg=<+Xn_HphtcjD|B&(fsLIghE}-pCYJiG93> zozzEWh<3DfpV1NjNTI&rQ6~N#ZQ|coV4FQBFVG3!ky|9@Dv9Ofkn_ua+y_B!y z61a9_vx1d)j3q7jyUxgY%cRx7y~wTF`#L)t%(%?&>&LfLFXx>4)Hq%#M2IW;b#$V= zb24n%-Sw@a{}laACHn6h@?TLA6G-~)v(FQ0viBv_jE-b@{t?!nul|W-^l(WK@kwm? 
zpx{%h&nsz)vTGb*`DIN(@551JP6kx(3LnAN(gcKPbUBQj1v+;=e^pLZ?FmY;6;>%F!sA6Z4| z$!QxLG0Z1%bHC7hXxzSd*X38!1AxW_ePR6oLzef0&BT7XZ88gQnF=r-_?&X=oL9fV zrIB=tn``?Li-}r+cXD{*8g5N=Cv@Ya`)mcm@bsAm35mjYNw;irizIagtV)li5_q}k z7Ca(39>cg@Ta;(Jxy+c>v#Yln@$F->n>Z(4l|{zpnd2q1RY_TAe)VNw2V0{JX8pWTQ=G-_P;wKJ z9o)|o8v{){ULLNet+m40IvItDVr84<#hvz%e|2)2dj7^5Ip1S%&s$_7v;88mF}Vb= za{>h3zRKB{=dD!epKVNxeaNA<|H%aqt}@#sWO|zN?!tWmSusjPg6;$3S0((!{7*&D z@$f1-l`TzW{Ka+-Ee7(2EhH~fJ!sCip)HU997GZjcgJ1Y8iCQ4|>Wn%=2Q0(6{gb;71M~PPGyc{G@2j0X zl2!}XKO6gPfCF)6Xd2r|?dF+^`rGxpaL_JK;`r_tCv4&w#l~wRA?W z(>@2wYrXj;D-tWEDp@YkP~GgfU`kbajmw`gJ1Hg#1DmnOgYjJf)a#zgo06ABIwa5J zNmliA5(4d9^x9{Vh&0b3GEbH;;&+{JH6enEeEC%KN$=UsTW`cIbO&W#jj73cx+gbf zn(ErT&3Du7bW!GdrE2N2%tbaZxuYphkGF&s(nvD)yjdsvb}Bjrfoh^x{JiiW;Sm;g z!ym-VcvoT_Fx+yj#*Sry@lC9>ynGjImx)VkzZ+X!>QbxI~Dlv+=?-rDP7qg5n6I~Ro$ z)ToQQU?N*Bu)?H5rfa!_BcSiwnn+Lj!3 zv$wyA&l`KOO}c_bs+~!r_jUT4lDqbw?7Akr`_;ftnp(Q-$(T7T9kU>5!pBUh&JHT} ze8kFzxpwtsE1+jRtOGw~J{NL4f#s-(bPX(%)?MntdLxW_f|*;qxr6E*zqv3FOOOpX zS+CZq)>zi+I4iazqi|4=S6}ON(qS)!)tU;UFqaAj)bEt%yttQi&blQ*V^L3MB;#fn zry9dewuu=01w&!1my&ro1(JFNMun+~)Ta^Ix8v13MQK_IZu;hCjeOhHC6ZJVCb`c} zg0tr3@f>%Nw9m{5SO&Io0x> z9(IU%iOErMova9-`)Ve?OtCj;)DSDxl3K5hO zOf5|?&AW+|1FdV*)B$x5o-&KWnwZWgNh4n{;!MWVsxh-ejoLLOvqq$0R-rXy1Xh{u z+jk3~d20NhDYw~s97e0_hmW7%&(?w^L8ieR8Fy4pHI0VfMrg0lP)L6Jq93KqkKv-l z^lMZG9NT>e$)j(Pr!&>*L>|UWm4W>(wM>KaaWvMz$#QOzKWeLDHP%VMUyVO_JFOa8UOV}>3K8+A$Nf=^3k zyG@hbg zOy3gwPZ4U>**sH8>-p-4eKEO|R%G*cL=c*Uv6Z7CLjP!lZoz{j>RaHOs&!@&?3K7nEhWhLwuGOdu8cb!|v{o2OC3?`}G3^Sj05Myiz(>sP0o zrS?V-={zY|=BJD<{w0b1xeWoI3$C$Z60O}LT&hqu-E)+5!TdJ8zME(FyP`_YlicYd zi-vJb4;RqGR8u&J+@>(hb}(z+uBJPed~LRydQ+t}!Ij;kcKxYLz1p1@@)%aFY?v3D zjby6lR?U#%!&d!u`jhS|cSPCoAkk_=Eo;w_$AcuA?u*GWPc2tZH3z*?ElJ=%=gteS z_*j`;dhPj~%cIq@`)JQ#k51Ow>u2)Z!nl ze%HBl6P~VLt@l;X9e1AaTPzhi5}^tJ<`xF!`AL%aBdi2gh=>$3) z*;>D-{}^~ad5rDJW%h)Nu78a6pLhlyK>CvGF61RYcjJ4>Gbek>$3=W!1nNlHJ^cd| z62F+Y{JQsqWYbPHYnvf=_AXnSOra_cs>tqxN-7&>hb>{Z1MkU&DQ}|>dgNO}M(@in 
zH^nNF;Y@VNId6#Fqsbgm%lVYDl&02@nbSduLmo1sEecatPvJtCL?sia;-p1o7&TFCC~r4YG?QCqBn z*BVv%_0+RgGFJOz5<=K|nmN>T-1M}Z;+&1u9_EcLBytF$>q-BQY!8RGHvr+|VOk3e zYi?wo6aPiv7)PC>Xv4UtZao(@z|70QKQqJq8l>7)hY zXpx@I%Y>-d?v+e03w~nFZtr>LKdXo$ zE_76W8I?=Y7aw9pL{<_sVf;$&)sEb&sn=<|ud+&28pD%v6$hvb&jO#73%ILR!0^7&|-Kh2^N4n|C0a017F&w;<-ZFo5bOFI%$TSq0rxzi~@!?F?XENuJ@bm z8$J;M<2$exH6xI&%|@@XwoS23`|N~-yQnK#<&h)9i-~GKX$jwEV216wRpdxM$-Qd$ zcI5aYiG~8ZAvxBmlT=0%-1flNXPU<*-u9qpSsSyRdT)kW^SE34oODJ-J69IgUXWG2 z=onuq`1H8}1$s(c04Q_M?>xYnf6n5!BHs*02V0Ny&D>H3j*Hil`*Gx&RiExX`S5w+ zQPdyqrLO4YLZvD14Qq1NF6uWq%yp`&wp=bXu+Fxu7co~CAg?bQQ{3^5NTqtx;-zLTISn2wC-?hFvGx_9nJKq&BFcZ)vbdfj);CiZ7d?(lwG1^& z`OD4T4%HaCwj6DH*|_;B^_zFqapk`&FD~nR;94?Ljm5`VBBkoFHFNPPXTIOZ2!%Cu zPiei~)|J)ciuu-k^28ioowbw-Tq-Qn`X&7Rmen}{gd+4Q|DJ=QDCLVDWJnDw{`8dm zn$LXr-Kl6cu4!ZS!Yz(_3qqMFp1uo@kOFP5l}*ddJGX6%^641y(b}s3Q$EeKG#@xxVu{RxdAz?MPkR-IH=bEA|aGY7K}o^ zAdwKan%ZnhtW~|XTD3l6T}N5-M0u&7t=+J?`{7gSfgrX$8S)9+E!I1@HySQ^`7}?nCkv%BHdy&*r@fqgY>M5qN5N&gJc$+L7{KeqT=hJ# zmmkgo4)MJ?%3ez%XCFegNZvQDZ|_XD5#tz??g9 z#itWlFxk?yw&qkQmubIG8H!`tVpP9%k$qn!;=0DZRa2|s6N9-~w_i<(MVCM$9L{B7 z|I6Y;Yuo5Lh=vJhr=MA-_4{3wgW=A$&T8uq|2{c_+h~mFgxpnzvXxtvpR~HPh`&ym zef6o|&==>z9kST^;%9dlWXQ9UB|=fO2f)wQ+x)fLtnb zp3Qk(3OOY6C7FdqNEH?Et<-0zX|LOQysu=RK7gwcCvq~+$fC=h>0IdRFX1&!XU_DH$3%yr(0uI;8SLRJEp zIovu&Lzp`FDp%0vI3jk5X03q4XC?wqQHOp}KN3L|e7a%AU)h>$tBoX3o1-bOz_?C~ zE1otKi}D zf0G#h`KJ5WK{l|{6+&~5#C>B?Ys1sKIQIlIc_x><#NUxKDA7Mb1nb z3S?7Vnekl3Q+mIg6!T@Sj2ztV%-5K;S_D%^aC4uZWh+Ayi!qJzSeql;ejkfLiJ;VE zB-TNVzgyEgGKvfJG@6X0F#Z+A7>T=(pDTy?aCOM;9W}JqcLsr8wu5wp|N7{{C*R=}p%A z8JrT-cVm-Udi|7A4RDuST^z;F_D%Uj`8&`uhhrEd#iz(@&Mq3q7iLXmdAD>mSs(H> z(H6J+I=1pY+i&OWN$o5+Fg<2Dtxk$vZ+p2~TV9Kp9pTZ;JbfOOeOdFPYs3ji_$A}> zUljSEreFB=yv{G4m&^4eZ0esop7Mf)-^NE@Cirx{TlLBb8UlryxL@(AV*e{oxe%gA z_a;bbFde~z|8ibm-m(QZwm8#5RF(uSO=rv<@qU~%o>B4dK-KU9*#@Joo;nQqN}*Wv zN5t{mi)wN0)JApu7=xX?Pu_2L;@VL)Q2g)c$zQ#hv}= zrcEyQdF7ueW*gVjYijtc1GG<;+y2KsqUwk)@`$$&-T$G=86}!>my18*$e8qbV;c}P 
zR1y5azsLlBQbL9mplGmHf7IafzFV~-il0`kBiQ#TpiDX>Bi91;mH1S=4rUT`)auD6 z;m6MuQ@=3U%%bqIoDt=TPCIcn8P&$sol0{&T>jNUD*=Px8;_C;s7)y`urpBML(3V8 z+kAC3IWShJvM5@A(dpSF`}qe`qbuN7U#jiCIp?$Osns^&-aZhOqlBpp7yTqbV|5Qh*o=?TV`W8t z?=X(MpN7aUzNEJDmVO!4RMY{o*z#QNyr^%jxx4XSHTo1cXw?VKsT@gvRMVNGYfd^I zCeUlB=kPM`HIrex{6)tI+|8Avz#VCbhBUuB=5m^odET|c%?Hhm!(7RWBAH)WhkCkx z^q9F_zA-T2-Z@^e33n#9(MWu#sSI^>)|!#>(y!F>+#%jUKVsS~s(AMwyxXtu>ZgJ< z$nxBGC-d#?2@<+>zSsUKG2~3SxU!(aER_A!M$1`%W~^;tIfG|lyp|^ zWsPJ_o?Ec@`6YzMFAwj-wR$>&Y zb2Qkn?FH4i;1Nz%xNiAD0q)(p`6Zn0LAe@h|5^rT*T9*@)R&!HU6$tdR|#2MOwH+! zKw;~ZVmFs`$yHqRm|xCI1L-4LIbWhq$IlSjI9F-7n6!UCc%f4W!4o8}ZvGl(Px%a}c_`KZ>{=t+Dli-*L?I;Bx|I_gw~O25TGf(ybw8cg zJ1@GZQl4vLkg4bj_=wNZ3{TrWsHFsx*agtsU6kVNo~x_ciJ%HU)=9ypjxwl>E#J4I zSq&kiC(}pJJ$_Y>cY~;&%y(WGK#fVfUZEVki3JkkoXCJxU%eAPRauy=nU{s0d_$na z_d#Io=ffm1M(&({)jtkZ-$G5EJG7|wG`UL`a;wwI&42xPL+AB>c-b$vi1DGLDRaG6 z|HcDV%WeQO!oT?$no!Q4v=++Qi&Ts>brnla(xV#+_Iu?<%`=iF{eZPjlY9_VC@b9Q zX>hRb+(YS%pU%GELfm^MS?)P^jBfV^*SP7vVnymMY9nO0l62$d?3T0w+h8^H5vr~- zBF*pocfty#siRh8ap{=dW3p%OPk^5Y8W3&+?{T`$;_s4_zrr4P)lc)CKs857n1}Y- z4UwRit@vd&orcnDxBl_o9$kA{SI1{jjw)Ky(R#^~ldb{CIF0K|uiBd*t0;4n$1J1F z0LFCSrLW83*JQNeGV1lU)nY>@c@;w4oH&{8I;PMZgd8!dsJt1ZDUxSS%2)_BMH?$|WK7v#MA}y(kHthBrhu>laPTPe1=V9{9x_)cwC&AjAS&$5}Z`32A+i3*M2?`zc z8!zHe7+9cILmkLaD>Yo9qNdgFT9H>mYhFwIJDDE-1H_wFV4#Uv`}T>%IQt6(E%X#>S7P+hWkmbAz}#b;)iC ze%sxc_XJ+?w|HYPu>L&^!R!B`0x|{qHvHs*PQYOZ_*y$y+DGi~%3gBbZ8=x4{86iU zd90FI@w=7y+dhXl$_N;WR<4S_#+YM9M-PM6CeIi69N%~p?E)6WPC)rm_XcNYXWNhG z?S%fJ1;Ai1H3FZv!}80*8y=ZH_|x0@Xk8%pbF0_xq&DrbD`ts4{$35Cs2#&+;5}S^ z<#2CZEXtsg8?M_qKcseI1M26xdd*4$AxP~Nu;Y4nri$O3R-NVMF}pN#J63IeJ{953 zhy?@nd1bzi^r(XN1V^tehDWCF1@z2Z4f(RJM^kd9O`CVKAHC1kIF8M&eqtLrVZLZv zKsLXZB;GvG)a=k*sYo`r03rFs?i&3Pm(IvNYLBx-R#ozF}^_y(q6n+HI;Px@*=Gv6t4 z92#JpYAAcZodjzsMecLRjGCH8fsvjEvdB?~I5aeq&zXJ&5gHnrK>y@(xP(|np9zt( z`t1S(E?#tHGPFIiKs`8OJ>-^}`bFN$ZDh3_ZyZH*H+(;Md+o$USW{i+*f1v}Jo70z zs~*?xYwMRS8dVdnMne@`*Ov7ru(<+uErM@|R_#Z-R2Jv4Yf4!pK}{rqO-EA0!0B1q 
z?Yg>*Ar>#-8||+ukl~5rR}q)33fG$$?v3hl6IFXSMGUkq{l+)OO}F5<-)v;4Rcn8@~6{Rz%W>5-&0UjTed4;oEcBGw!uvGe$Z8u)Ahn>joAAWF3(s(PWk>B==cLkROF?~t-jUvro=EA_8oIKV?)2^dtlsJe2G`WCFN)& z4yl_hKY0ACXn!uy(UEPeR~Z052%pZ*%}6&WWi*SS|3hI z+>Z1+LV{o2*X?XVlu0g%IN4GA!N8g%}GMe^CZ;w}aMshRmg7WT%?@(40IheF~!Meht( zM-#xyS0df@r<>Gf!w2a_7BeyJ4K0% zVV5y?`v}&q^q#gifwy_bNNJ5cwrCxfkre8$Yy*~A9r&RbF z4_hU=A78~ad)vs$*>bo9J!}m&ipV&5dhTPh5ccQR+3<=C&{nDyzdB#^ z_2HmO(yuxld7^}Hx}fNr)duu}VV_Q{Yys0z;Z$|$!wvkswZeB`6f0$ZP1TfcQLr(P z5*X`_-q@E(UW=VoDV@?C&2-K~LSCMt9F(sUl{!kxQk>=A~vc045i$@@Q+#O3%DRV+WfZ3vLdiv32)pyykk!J{KncP>>@PLyc{zz3l&$8 zOycy=9Ac$_4h0CG)g8fqRNQ|F!X0_wYv`p%IVQAGRL>tNp@xrQOZKf;8}C)F?+n$~ z*H=<8ia*z=4xJWN22HSxur>|#^?iS^@nY0QsOSu-4fbFR*}aI2z6i}MbrFx>OuDHB zK+A$2tj%s`A_t;IQ@MYv4kNM*3-&g16MR zzu5UYMH~^fD7LB<-qMD}@U{mEWsB0T*L=0bo!)osw-iE$LEXOE_2Jw}@zgOlWz=B; zye+hD5oANX>DKCTTqkGc7jnsg7_h&K(mpq=ZnVQZhiP~?(e7`sLO~L!^mu~ zXS_`I4&D${lu&hb@m@P~Ei!ALuWV8fdr3FINb%gI8!4k%*DjF4DLxe>QmGLxGThCz z%pPzngs=IT*>lX;m*u)Ec_;T;YZ7}krMGxipqc4W9jl4(H5-hR9~6GfHO_g&vV}?P z*|)o_jaqES#%;6)QD-V**nm^K*TV9bSskyxSL+Yvnxk1WH)AVF$%{Sk2!phrs7_%U zv@2Vz`lKG~Hr`hJ6)xLU_2a|w?_mw2`T?~!$>Yl{K{%228a5vanrRy}l@PVB*%4c1Chw$# z$ssbzt2c9YS(_j5)j&SWbMkV>Y)@Y z+u0t^0KYo@r>7l?r=2AU6w6;T)ED+1 z5S!UdWxSrMN z&-7d}e7-F$`Husm*Dey<8qk8KV0JKI8UFh}{H#P7!8aDoyRwPxV<|EEe!B%lMMZrD zIVF(G`;?! 
zm)-EyHsj~ja%siGf_wrAU0UTEFR6`d)D`&5MS|Wqj;PSx8Wzsxnwz_)bt+FDr9q1I z-EJcZQsQDt!-bJu@3U-JT~O<#JmzXt+f$afDjwV})Uu?#`gEpb!h7sdecTl<34yUw zC;O3E_a525Iunls52&B0z=}yW{`B9U&CiF%Zwnq!+EU}J3A)7g1T!~9mU@H~`Mgls z@ed=;%QvTeYsc>G$bhjQx+W#Hg;Pe+;w>lNAV;`||ea9Fh}2SGKt z4%BfI*CqzQp=`%Sn;TbvT!s^D((WKr;P4)B8k1qb$oV}@hp%yin+$7XV{HughbTkj zNPW7oDR8h5HWJTwl6B7k`N1{t3V)`{Gto*@)ju)yQPSh=5P%A;sOcz=X*6Vtisa?# zYb)@ze94*8-7;b^~P{3W*_CX-g9rlVZw8BKf`P{Q42!qkqViL02{{O zu!DiAsi~HgavT$?j?So$IP+s8+T^WkFl=7KA2M-P&ZE8{M;Z7d=1n@q<>5ly9gdlN z7}RO~dyfIpxfr`Dr1x!4r!9=w?LK}eHnZN@+37;gsy}AU%`qe553zUD+jLZFK%kou zlNi4hDa!N&ubr-YOs-EHa9#12C@Cr5jl1IF?;Wo)v&Q7=)iJtvfQ48WM1+MeycL^` zZhG>Tm`S+2bbPoT&ydH}@QdV{|E7SJ!x{(=Je-m0#!8c2$WN znBw9dDEPyfbY6YnN=7N8=}+ZXC)rkD$*kG>uVDWV4EE>etbm=fo9-l^rF>oO@P%^M|``^RiT`b`qv*&vH#p4J5w%D30{x7_}xKsO?oyFq~klMiji#&#s)zIwu zi!ac~NnZCS-*CPI-OhtdwE!5$a47cTz$mp}g=2;apx`llRU*lRS!aH+89U;|)bv}a zjUlc8i2e8E!%kp8K8fewzj%7CGrB|t02nFBF(WTWu0!rn1(|ZtG2f+QZFJE2+7rL( zh#qtegs-{iVfJlMuS`+c63;Zq)P$X9*T%41ke<`*spW!&ibFdI)>eh;@O{yrV*5W* zUO;DB?|2B$nDOjy0D-?^~(|wU2ys+NeKps0KYcQb=#FeZ4Le-5u_$7n40{ z!(OeHDEhfNfo{qDb}nWL(5$p~t#(w8Y+iLB9F|+2TS*HgIuW1$k5O%95dd$b54;sSCNn1d|;r35g=hQt5^8I*b0wIPLCPUu9`ExmfHtt zU#nT&-{!@5#y^5Z>3KKet1yEqndG#`nMuJ4j0%HH!MnZ9B}*)Db@EYH1g)wR*Xj0R zZDe~YT(-F4lVl3KX1*fV`I^%;t77fSR5*r$6`_^xDIW{yFdR4vi4~LmlE06|k>wfM z+)vL%p}Z?wUzo=^G`|gGa{JzM%`>b7nrW-1AJ;E2{Em!RH2mWX)0N(W^q+&}q#Z4L zAq&+0*E1q1OH93cFePC$IfBVn4u>|rd2I+tHPZ;bsi5H7aJ@#|&a>?*T{bDt<{#Po zC(_l5`nbwdBNm~)wrm00y{`*v^_}vR7VOKu)8O8)Z5AaOHdQC36Sk*E4KCb#fK$>w zp@0k;__SvsbZ;5lN?8%1S(Q|1=HOi=PwxnvwzVHywL z_4o)Pmp-^nX!shgvqUb44di65u4EJ0}L6I{H!Zw{`;e=6$-*{4R@n;+AS zu*v{Yv@7j2($F53$VSxIHB9jNZ^?sAN!6_vn} zmo6_!AV}I9BzUr~J>00I-TBpL+WBN8AvBR^X63ma_&sU$b&MG3m2)%&oQI}?LO_YG zsn~SQqwtf{uG1QYceE>*Jjb6K^(?4-t+@I&N_nS=W~t*_bdSuTsXQCHw~!0OMWhVo z6KAwWj;Ka4YK=hKooQuo*R(dv5gKJrdO#t256mR5^70NzG#m|n<2HS8w|4L6T`K%p zSfk}28d(}g@8$;w2g}^pkpvs8l-DnsZ!V7^o0^(f zI@HK3=sj1!n0Ek+zcC#xb(idRhdvTc#Eyb_GD_sDIbp&$V1@wwg{~TXH9+PQ1OX&j 
zX=?^oia88co!KgR8tXNJuZp_)z0d;d42fKH-e8V^c@1ywd!Lt+ymrQs*X851I{16@ z%rS))dCi}Mi)|$`IJe_umBPI32N?TGmcT5k{a2U3fX|hV3dfI2*M0!Q(fo(uoPFfv z(M9~)ek?j=!5L_M*W5C6URcb9#9(Z6oy-;T$PiT-qKv(FQs1do2_>mP7lXT&V=^Ek{xT(5hW30$@R((dNb z3t#jSAF4@#IV!{{V}{QGu)AyC`QYEc-|u4_(hV!sT*9cVZc0>T9eD(kwX?Le1R%{( zCxW7q3{|8+d*Q+cgZT5$rx8WA66CaGtjX)EpRc@QI0VyyCHqZNWktESkV9kjD)!iN zN1Tw$m=ngYRTQ1|cDvg9U}wkeQGV{)SJ5s^1WZ}vl4c}T?qqAHa|1yP4yxJTDiS9zyZgiO0|&~D92%j z6%)ryHV92G#+*V5#|Fi!Jr>S*8$Yss53BBT8gmh0v|m79%wg43>}%;n;G{||(^K|S z##+5LWo3Q=mDglhax&PPtj@n|moI$ggAUt^zR*FrdG`3wo1R>jXpN2QK;i?z7l)X`a9>Bg^9~0UJuJ>E0*=?eqNHK}PCXmv%OF7ndx;bZF#wl&tpOJI zzzATBN_c@v#^-{Z$0JM55_I4M%bb}lmB3!hXMN_JCb6-7ut6rW4c zYhiHg5ae#xCck(+mE^EqRCZF4W5zbVI|*jRx@GXiJlpd}F^!}@BofBCb>MM_eAPwA z2YINz`RcbJRfn9`D0qdbkFijzaf!3-xCfR???F!L2}wc33=# zvyy~FYl5j_>JhMrnwues!Na{TpKcqVs6mm09-7$;Nq|fRgA0R- zMc86NJ`rbC(yDPXC)*x)%4RNAFws)F+i8r?&Ls{!%mi@&CH0(IEJtz&tk6o(ZA5_= zNK@vXi+f|NAAs}2=l{Y>s}ekVb}$7IeOQR0>)UbDx-T@Yh)Fc7*|q(NayBE^4-bJC zm#>+QBxF}$8>++oMvs~N8V1k!T+N8;LU+D#wA8DRgK+24(W-m3x6F1N(bl!dHEJq1 zmKBu>5}u*KO&N~as7qeW5P9n&?I=t$RA}B^u(F#yw)0y>870IRyYbU4c7mKv2%iK7 z;Qnt4;Srbsq;Jc4@$Id|c26SrIzcf$c?vLq{*bAeyHVBv@PC*VwFg_*wQh{k09JZM z++*uUL#uUqyz@xDg!AGfmvL9~N98=8LnY_}wY>$GPyx>DKC$02&HwTb4w%E-kwZ0-Vbk>o`&|#}$^-`|}m#?MXyFb0@4gtOKp=w#QOAiobF-sup zT5JfUa+!bImpA4M%tb55V9>kb2}ao=n_qzPjJGbi%0+4ryUi=d4je)q(h3wnN=nHw zquAL5M%#fgw1h)}?@9oHvMX||@X~Ky0pIHU9v?dQ@XRyHkbu9am0C^G0>GU&@@n5J zrLZk;b8JmYrm%G*8 zaJn@Pke>XEJLNfYiRWeIE7xDNb#X}Wn{3zB7Nqp*>$>vCi@11c+wn(>ETbCI76wuG zoO64VlEyX1_a2OBCTbAlTd2Yu@lLE0yD<5L#eX>?|MZw>z?&05e+X zyf)ZsK(BX%VU|}VSDv#r-bIbst2I@=$mSeRfET%gdGyRngA*l7%#i_o7cN|Q2yVqA z$W-?JcbL+$JQz8u)C;I zVWSnt`-Zwef)KLp8&<5L4j=83E|n`aw+XO29A~%<-S!5KcmYd~U6gn$A@-Rdc;T1h zJ57C`Ykr1vF2E=IYGzc#XUZrGkIaE)N&_liS5qI9F26F?+OJk$VVOJLfGRIS1JQ?5 zs(#~VF&8!9)6gr(xtI-<9O{pmMEiZ6DxR)KoQ-~f`I5Y5B%WJ>vN@qeCMOsattQPK z_xOX5gtu+{fDd}=neVRLBYWKy?^)YxhFIqx?L!+Hgpr*YP3zo+Wm{oRGKsjrdO>!u%H(nbzmN`;%zRDozz*~+jB2jqr>qa)s(rt&*|K|4Xm2B5zW 
zu<9GGfIV#kH6{bMlWLx*1@`k8t72*PRLPnrT~S~MNUOf}YXhpRUPd14%KZG*GA5t$ zn8)X1!Hrc0akYxgZfB&flpLqB!VS%kA}k(LP05K5<$f=q%Q)2V#Mfcpr|Wch9-Ej{TvzG7qj6phzh{&ob@{Ts{lnhN+yQ;H0fj6_-()I-WQBB0F=2tA9Ab#tz& z*%TU<{GkO<>kZbecjj`@#aqiBd2@o~9MA8%6^-l*3XklQ-o0yL1%nSj2#WpG@PzAM zoF@P8jh(ZuLSw#e2;8dutJ8eWzcV zn}f0)5~SP3ur`^tO%*6Q5pZMUx>;Z{=woQa@ge8s(d~0T;iXauu$i-cKqr&?)NA$% zVzJe7sFMzLx3Huwsk0tODq+cykiif_U{4tA~`4hhR5~P@t(Qo<9&r+;Ub_IX)PCI z{t=tUyYQvquZ&e+dDOk~VWd)cem_37u|c z3l`ukr=hI;D4Vxtr~{+U{b6OvhMw|Pxxm#mN11c3qtAoOFR0n8h{br-pQ27O*;rp# zTb@no%IB!yJFmjkPT-S@&v$cy9xrt!ZVuV#ncX%L_ifi{sVLGjp=4d%ywHXoHRoe5mlo*XD6A*c2tt7esLTBFy*7$AnKjI8WVCS z5v%vDfxh#tAu%a1)uM_;gL(ZCFlT>A>5Rc+s2*nSIqzqg(xReu!AMij=u!}n(Fbm)r|r#jI~4$sxir~cyZ1Tm+VK$1<}S9+7DNE zIgK6anybB?j!S}9q^B(TA#nS5;stgrO|Tm-{*d&@b*YwuoSIF~Lk8AH$^Zz;LiC^# z08cMR@1}ILmN*|^rBQDatzFm4F+lr_-N^bv)<-p$?Jzlj)b$G$jzVjS(hd5)HrmihYd2!0x8*HM!O-{}MX zE^E(GjX7FNSNHicAdkBp;xHVk2P_l!pxX>U>0;cq@LP_?r_~)R)m`Z}bnnjr%S=46 z<^YhVfNByA=Lu(>$A3TwVE`-vDMf`D?gkZ5L1Id1YRAi@wy6bmOyWlBHx*r3e^1C_pz5UcJ@^cL_&BgR?q8MT|>! 
zT7#HQ*Hrk#Gri6AYYUm!jXx&+%H(3J?^QanHrfLK)asg%WL_n5pil1bir7e9V^NAN zJ@r|HvH7Q*EP;c5Q$(tJ_Y1H`3%D~Q|9NL9Q)md!BEHrOeXJ|<_*8!LYb3IoZvBmc zV2e`4sp~_7*;BgO!#wdChqu8nkyLZlp(v&f#g?(VP30Ej#(6w$gC5nI`MjJ*H3jp4 z&?PXy*Iz7Z9vZ~u0J&`}}?q9d~&sT|2 zAYsy%tCFSwm3_a7Ep&}BFA$GX)s-wIT#(y~9|noa4UQQNkBP(F?GXWxvQD{FEXK4YOH7v#d7nzQ?6Q^FsP6oai!>mdv40$3Z*msWtJT9Jr<2w}i} zHxIE~nFerPN9}rowlSU(Fjc4%=o;{wiL5gZ{U!{Q(dqgC5w@o-bi@M5B%`P;3``RgcR)Hx$g-#`bQut}4%*nFg0+C*|upSMVkRl03~d z$7j0bf%24JI0L1NS%@00m(fD zPCoZ#`%QL?|Heae7X{rcJVC_#Q=HG>0~un)}MkG$zkM9JCYNY5vrY*gdCyHv^y zvOoJsm{*=nMT%9lEw+@B74gyDanzxO(VaNQy;WxO8@tT5fAYO|;~D~k;ea^Ww6e3- z>CCFefZ_LWki1mJo&w6B1aekh-Z{Jk0_U07rTH5H@2G$Ue$3r2ULdAq_@TTM$4sDz zh=@#8b@hQvOqYc9k}Z}OnZfQa+Jdj*JLOjMlQp4H<-N&#Lb6+j{(n4i{*3gT0ppo+ z<4G=E>oE%3yKLoIFLeqIZw3t_!wiaEW+=wxw6B@W$d-)zG{~JSdl3u0k0jBp>eYp) zbR$3N?}$^(O1ub@l9G9jGg6_%R2e8AJx77KiFwxSLz+XDSL-CXn%eF&%uznW~-wH!E8@4)zWU z?mC@3{pQNOw{btvE1oMpvl2Gr$%>Cwa76v+$dksbH*Ru4?I3~oj?b^~Pl7ToglMIQ z0^OTM0d{*fa61P!pA58a7$yMSzlu+H-@b$`sk!F=BC=d#VzV1;F01kSa{TS|o?>c$ zIcCFFPf@RpLM;Ex5 z=gFuR$Tq@iO$GSqRzyCgZHUd`!UQ5uoC&4yAtf|#E4?h=1>A_a8b`72;75b;L0XP@ zA^|=EZiUt39%57ZpJj=^zp}Unveb{X$iORKmTuD%M;EUAuC+Y=NdmtL*12u`L1%7% z9*_db4q#3KI6c4-e#}9mI>99=EFz+_38<4D7L1CJy4o!zsH@LH1x85A&lc+qB*Fp1 zKotEJ7!e_E!o$O}A7`pn3NJ$U;Hw-LLVA>AfvGuDx%ZE#N{%C8ZHdN{=^#b1lX2{D zV+J$v?Ht7?a_R#{}iFXQ?kbm_NLRg`np z$?%*B8nR90VC0HSfuYW)Dc$AQKQ$umT9e9fGVD}ql56qNX+BUtn2{=^HvHsmH`?pj zwA*18dZ1eG7XI{m(aWT!# zcJsb*<{jf=x{KpW)h?fo!7$U|c3+mCwcz}+6jge*-#SZym{ul!1TNo9|0l-%yA}R4 zj}-{u*-G*atQ4wa4ai$PKs)As=)KWJ_Ao3vcN4eXVW+9TnZ)Yfxt{s)393%*RtVKi zNoSQ?pDzd1Hf6Jrlbluk#!A&QM&>bjZBqXJl)Sz#^-(g9)gSdCPyKWuc6UyahA;MK z4zpc-Id#p)#%z8xb9p{XJpG}QMon6!dG-Fbg>z;1^x+}mG5Sg2#D1v5!cs+%DzphP zE1g*E6$devCim@DPnj?88(#|V9hG%-bewS6+gkN@ov7mJ!~Q5xN=MSp%d2a0!6sVvXpkE4sAxNSnKD+w^JswM^a%beiEiVQmIKIy_w% zT2PugI=iK^gJTMgaF~MC_!TMKVINqBK9iwED*NL)Sf5qCaM5I(LB2=uUhzcA^0$n6 zbH-mc&^!pcrPFWC9Tkp^ds4#v8EICpuQsKtCKoGM{eBKp{}^h)<@(I-S#0>Js92i| z%WYTDzR?;f_LSb??lMqU7`W 
z0{;fN6IzcF*^QJbsKLw~w%YMyXEk+oB_S=V1t7WOlZ3#)FccC|t1M@fXw0*jXH|#u`W> zmE?lho#{8vAPJ8v`_~fWf+}655A~3Q_T43Y4yR9$DJNLlp>fDI4ul|Dgrw_@`I5J= z`98TI=I1S-ddTqVE1Q!wNH)vtlY8!`WKx%U-9dvA2OhwN>TlT69*adSJ-vPBv6vlt zTGBfEizfPWT>{NjU^DT=ygBmuqArBATcCLl6y%$g=-0ZFu2(#J>E!YF354{-v}pA^ z_zi`is=L%!176(7Mxxd)DgQT5H=?~nbaam1ZU_T>aE6A9B}RyM33zx=5`UxG=W^A0lCloGKMaaG&2oy-H;%paeJ7M9h(=J(}-S}KaWySooDm+Z>E z#>%nMP01ou#gMdv_K~GFNA<{q&rTp z(izQib|kBLk(ByXx)PzUtx>IPEK=tyX=Z(`)6hc_5NXC9V5b1%5UYYoixsogtc$3xtFn;$nwoSR zouJmXmJDMZxVVg!yuC)>u~X>d{XrXH zZw2V~U30W`3S5Tv)z`xXbu$QQ6u4oNm->{$um}6QE)&n%c@A5mLP|f*ZZIGT%xVLl z;BKLiDU4J?LQFz}z4JHuP5hH?9eXr^qdD4tMr&+m@m{~;prvr^;q_FHMDxzpbh`rMNrb6v{{isGUi6m59BZl?8aM&_d-iE8ho<34upADbiXHlZ0p-*FM>a5J! z2n9bI40oO~>dh6~DQ`z~8aN9f4XFFo6cVupQ{dv90qdqKyp3pY60h(rL$HB_aQDau zw*SM-FW((WV%jwBC9Qbvwr3Plr<^`)BP|ErHtMEW>DQJqeIK#8gmr12>r3Bka`4y% z>o{lMGRt^*#*+%C_LIp4d1s9XDy&mB{NNe0f1F5LNT4Q z)1Yx8duy)&tlt`S#@Z&7Z_mcsD{Mkjnht;KsIqf&P(|4s)+RLRBt@qPZ?8|@(w;}ZI^kamR%m<42E7)f+ zmuF8-=Rrt+c~sZ#vnUKW3buuRUfa$+oElZmD2Es)Yxr%rdFGC@nV@STICpt@9($N9 z=YeUV6N2sIqvX`MQKwxc$)d33UdN~M%>vh$wAojVSZ#-2ohP`7Zh1gYvdk;Z)jhgE z%96JPUfb?oNt6AzD|m$Tnmq8tuy4bqx4}nA7uFA*Ku`DsSzTT26&22|JTCmb3xt(A zSlc&yOPTWW7L*0@5RK7>W`atxDur)zw2bx5i}qE0@yqLc7=8Dp%qQqOBO}Gu7*Br) zExic>!_$+ip=_Suxgq7UGVb{pB_6n4WuSrrXFvH*4__r=Q0GwSEP0;; zUQtmI*W|GG)TBchWG3SuYpSbrA0i~_3G+nCNFUVOL_nB^SB?{MS9q^+UNa#QUvw?2 zg`Sqrnt&J90A5%SJ^#ZKF^+tPiQc`6B)D6JT-tu$X&@3#`UM6B8=a#-bgf!`g^P3S zs^8n&yAmQ`e$6|rzzP9}v01xID=*_y^_IJph_$_CoK`}Ad`^3a!>|7KW$gyr)31ss z-3xa$y;&c```0?d63!`U+(#4!MKo5dKNdO3?~Vmv&#+>~{IGfP$B2UatXp@pjZ5q`%RNr`ThOKdpC;cmNlCJX8f=>G+1C3>@1c_Zq}CLAnXES*X2@d91!VD_(O0Mtjiks2Vtt! 
zyMab8sva^fM^_{v1ZK^N(B{3=tN1<}VFQ*_?vu}8ZMXoVt>%~ocJqXwqT&@-(4J-F zjVkuK!|cfJ%)XoP;nROUeBrA<5~Hf!hl93WGCq&`Xn}mIXsi=>TSI_PWM+*;T%*>S zub+A$N5lQ_n`e8eV;Fvo)Aj?(q}TNFr%N=^4|C?T(WnmjM&D>mR_T?!4;iJW6N2Ig zsFTLK#O*5pDoOJ@k~IE2?wH!Y#4oUTG4C|-mNndt&yR!~rf7PPg@?oWdiAhcT6q;@jj1HLulpqv>I>$l>*_@a zb=nJug5P7nPC_Y1FRz1HeLR4Rha)2+IQP&?EjVbV!RRTEJp)8>`14kooI>-MhgbtAx?Oi>-wvWB@7JhTn*`F)k-Z)mK9&xg|B*-QaMAxT~50T-pa>&n4o%%23O~8C(KRQ zpO=)Cy`eGI@YU7lhHAVI zmv!vQXRM`9f=$h{*WVz9Tb3PxO9FDCB0(G{lHg~R8=m5{-#j~XMW|)-;zkKzV)vjj z)UcSvO_%*PH&|PDi^Ja3^*7QOrt&ht`} z*123VIas-VRhG4p!=&RJwY1YF>LSKQ_z>vU9_tNmgIJ={B%>p?h}iD{5#2mgU}e2y zTL?gmey!}^Lfuz!lR(BkR7TXINT!6C4XWeUCFw^<&7x}k^xb9(1g2h&$?|?Ko+u@& zurfjkjwsYSl2-M%YAl2vto5_$PYno zh3=b~y5x7H?2dv!mu0OR|c8l3xqUMb1Z6sW$eS< z)$#ee8%veDAe0|IAa_s+-Nw+F=+nsefW{MKe>E?6wTAV6z(woGSKQp(qsq{28Ict< zK9G6=X%^zzKyMzmnzWExxxC>fIFbys(_IroLoW&;Q}tf6G%X-E#aP@#Tzd6YBzlszLy*-=YTT zCB8*dSs0u6WB8BIXt;6m+iN$uZzOu@d^-Ntlr}=)IBmt0dINKHv5{P< zHc?}P5>M%xef7EwuEf$t@rC;!4Jtpd@yfJKcpBZUtZq;4u2(7R}GT|8Qh zMC~v>smFpr7p#^4SpMod9WX^L{M;>KK_N-0FA!wLL8i1k$=yLTxZ}LX zg(RrmUR;W=0Kr}mhr<~FX_5+Sqh5%Bfz{26D*!3`C68xM5PVr)*j>KsHal>^AiNv^ z3{W-HILUgUw>S*{Kz{;y=&DWbpbagz3(;EH>!S;=Dm4lz2gvW)I5O%*FYahiOKUEsGh)uT&W8t2->5aFy8hpeC5D4nB7LaBE=Y z?3H;3WujvOn^uy>Uz8Pa{)*{-@mPPnydU-n>OGs{7jQ?A_p>@AW9K&A>ztJ0S$h)Z zI#$5ooKWPD88^+m1F5K|!`Fy}_t*(Hdjvv^Q6KqgKAcWwbZbf2R} zkv#vpe_1-@@&&m#Ffd>U-V*k$2-VTu?Y*(HfU2>KM&FeZ`EVLHJG)HV;&5KRr{?Xm z{ojD@gxBsSu$}?Oj4?w?KKzDQy8ZB3ckw^80AOb!Q_HFh3W?N9D7ZL0s1P#-&egFn zd=w19lR9b|vaQTvdf9!y5;AoOpxbZYz@Zq2(*#T2!{(zvZPulInO()cS=%l`0(iOL zoQ(fmDE&R5Hh|9+Q;>fLEoU4z_YK5C>|!H&(fq9iCiBMGgGCMZyJAM&N*j)V395Zw z&AsFK_B)Qk-}Tedz9Sn-55Yxop+(VgeZHM{cC%-v#;qfonq<$^?(s7S{OD2gMLLoo zH;LH{%k%&4?~ovflnyiwce8{m_jkVEN2ef0h)bsBRog&C({{3yex+SKHlnKaVW8u-fWP^IDm?DU${$P$gjZg`#d>_vsDwZBrh8=&*HrzV2M;)bFZnh0sVFCX z2kx8%i3GOSCYLYdTbRBpJ{X>f@Ekpmeb*z`cyM-Rtk;FJxss-Mp*U%j^NsU{vbT0_ zR?^{{=PPB)Yhcp%Jdl>C2$eR(_-`uF~= zRF=w?r4re*g;aJT5<=FouSv3n?1qT!WZx-kmNB-mGqUefma$XBU_?wz_TPKzF5P=S 
zzx(~?Uawx1c`xUE&U2pUIn1!LgLV^?KI~-!9@**cR>?28g4WD4(kXPiR(l&H73fz} zC>~oKB1#l&PW(8CWgl0b*^5C-d&pd2ey}ymeKDfr-N=ac z8tEtz;oTqxsb&NPhce+<{IO8~<4Zr5V{(#ftvMuLPYYBX5`jSbTyQa6HuwO#=F<*B zrbDKPqd5%aaVDs_k6Kq>pY+zPTT*-hBE6%1UjV0O6z;QCR6XFGedAZ~#*d5j6|u+N zzRRjT*E=eSva1BNX~$ylLh8Lj&9r%>5jv!r_Wt@}`3&*s8JMl7p37p9_NYT*I;p9h zWbu_4n#DaTj2J9;OLr!*@l{3okOND{4pCJcQrK}|^ZlyW^iq9Eh%*ejcP|aMHL{S_ zQ)h5a1b0+7B{X}`8}qfNd1x;>!{S^^EmKKpBE;Ot&BSC&zL8Zn1&zC538yjpEX{Va z+skB7QzWf?g3C&dsPIr$Sgw)s*rN-CNA@I$42Gxw`?dFKs!|52BA)i6PvCjv8wtSw+C^N7qIfFQ_<_Q z@qMrAgMDo9MJX)W&PA@Y91M0z^3pHS#RQw`%o&P@3>5aNge&`xd;1&e`|73HaDIq~ z-h#;4(haKm@c46)1#!t>n4oOSxx(4*sL4GYw&k7UWMuyHeN)JBx~(aAUP4V!rEvf3 zST-D{G(5R~T(l5z_alR(lTN^IM6LqIuhhohAk>8Iua~b(Iq}Ebk7tpO2omr=B#6X zaM6PCqWp(Wt5Qj&m%kJq%~}-qh$LCtKd}kI*Te_H2b1FI4R7_@>A*Ri3Ir`*F^h4Qy;bB(}S!UVjn_xzJB3c zK`;g1`kJkny*UGyA5IB%Slys`6Q?v#6ZidUt|3Y5*N@*_A6XKPm3Y)|N5||j>7tkK z{Er*?*9w-(gSb{jd}C4zgzI@H@RLLLOhG1VMX(RC?npABk>!0>uyGN#h2NE>&nx49 z;{WT)R8lN;d1wTm&~AaKQ5JK*;j=4`lYne~Goph(WvgMEk=kV?|HcW0=!JBQopdFIr!_CArmNjMcAT8Nz7_5*IRZFU7@v zTM4sUKD3wB=eqFzS{vzvD*lo`f4N?+F~6X8bfoEY4y`ow=#JIi>m5PdQK;iU{wJ`lEJS(_7A zK95xh{ddkdnFdz$mh{I9kKrrlKtBSQ0~YZJy+6(lR$J&Ez}sJRBq@+GCk!6dM^Vh# zA5@x$dw8CgcfeYmv-p9`{(c5aHA&d~XY#PmbniL7UiBM44snsy=e`x+Ur37<3vs4B zQWAs<^{AQ`@h|4$lv#8i4T@Tx-`zZ~OEs3Z*>l{$>$V<|^X>+F1g z&82(Df`>08*O7#8%ZQDZou4xHIEt~Xdhr&b^d5W7)!r4VHFL?#q5dGH#bl*wS~boG z6q(i_*%F3v-ZA>Gm+KgDp9Ba%$I%`aW{W0rD4q5+BLTG5qqgfw@U>0vCP}E zslKSj-Lw@>RD5RPF0>zgP$8CXt&XD66~{L>l4oGr*_J)woJJ0p4$NEDX&ka!b7%lG zUI-$j%GSmMhX3B_iu>SmCa5`h4G%CijWJ^G(NQF^A6F^>!|7vgG|Cu*Vpc6UYl-#6 z{YU41{hM5={O%-PV;T?N6`NYF<-Nbfe;0w~PJR5Pnr8-1`>uz?c|dmDHCY!T(JU*Q z${ynp&}wTG<%lk^_Ijr1 zuV-b9i|E{C1F0*EpUtT2%jF{;)*R32X{(JKT@3Xo_^QL_kKoiDB2|2r4#T0Q1El8J zrgMb~0~+nk4dP3!jHnV5O>BFqusvrV9(c;P7VqCke&6_7l8PMn#jvdKK#^s0++88f zJ&EQ)=vSL{lhCiJU-m|~_AD0VBsICF^t)5gC!J8WU!A#hJ~tkH{iwPVIXwVnS+T0Iz|h` zNY};=nr+gpe18kCn|lYU7S?!DcPhrMizcU=Cpbu>ZN{RHQEto?;bgRgu(dBJm(wV>Az~Ih? 
zwfSC-NOIk)q~^xUP@cyn_Ob?6@v@tSf}i>o7kgs9ty(8rqkUx!>K+up?Yo_Q%-|oh z;{#eHl2A0*iM__lQ$F2|Oz55&iuz(JK*%oz_xRj>9@p?lKE zWP(FJVGKE0bJRpu9gjA#*DdfUJXA1ChI3AH!u;81b zA2wb%a4Aq42HO1qv##2KrCN$z|q$7_@|J>~|HP5QP){G|w;BA(~D<`2e|QzP1HKc_upBzpX! zvhv-#lU3VUuI3p~Rh=6qVs)8qcY2BMN1+lMbmHB0PrVB9FL}Py=o`q^u{yS=C{Y>9 z0L8=M{D}oLTN>|5@`E8hhfYGML?UZ>NwK&3^HDv{*OX(%-TUeN{>Y##Sy%eUWHhdU zkGVzon1{hvDgOIo>Jc8%BF<{)%H82fDJwp`z z#AA**lzfdJaFc6F7=vv0Z`LXKd@v1ho(AoYb5`ubG`ofFNLG%7Iu+q>p}~PKu93y1 zljGX%?1kKO>pA=+jU1@eZ%V^7^qN1eGpJ^3j0x7|+$b%C!y0{!kVbhr!#ooCFfU!! z3;eBLsQIV2>PURLJS)EQfMyDB#>Jc9L|5Dc`$MZ1k@9PH@@skd^Qb0~fYUAHcz?1E za8&G|4L45z?FfU1W-_k%jYaS2wls+}wq-=%O)jN5Et#b9O_Y+q0n2-jMY>pQ@ozUW zzb?=_!!}BiMn%K5*pwUIBw#E!k(Sw0J3FUgObZv)T{hE%1dQI%jHexV1=}WHu^O@9 ztXzP`^P7e!bM6fA=uXU-dC{f%D$uNo@4TlXwrnbCt)0q~7Dag-4SH$r9gd%v>>O>H z32bhM)%rrAF8AZ#LOyB=E5g@A$doo1&-)ne`v@++wlyvrq*Fen7I!=TZQ}V{L#KPjh+2(YK!SEitW+IU4 zK=*{aD;8tx$LCNHmz?P2DtA|*xk6sfchj(A=upNdG6RpZo01dDKP+@EC+a<(T*%g3 z*mSDaSG#3KWZfiv-M@fpD-9@?#KTlP*4|xhYkWBVQa9R*AA_gEPQBr}sJXS*4nD@S zdvVAAOz~td{s)8sw%t^pkU?R)SJUb!wp<7FhPeLuw%U>(4>GPV2L~ZOc6{)Vv?XrD zF=Y(x0avocJ{%Ek{qnflyxgG4H2*nT+S#w4@jbFxXQ?PI2clqVgIX7N99ZU=k8d=g z#Z6q&4)e&xJ@qFRkEcDR-{2kM%no=JTD`p+VIF8v8X`Wv_L-#Q;@n(!j^D8j@Y^(H`wrbKRPS zsL%&l^RpGe&g{sN-;xIkAXk9aRsT$p|9RAZ-QuK+gk#`*Uc8C|)aPE`Zv3l07Xlbp z?$OhSLrG$`@iU1@iFaeRnC~tes^!@us!&3C;|mR zkcysMB_2*YCiZ2gt%|{L=8=jZ|IL(r=s4rU$wXX;MmakJ?GD|!EOs63t&y4032#u`xw8GVKNftbuCnmeZ(e67thEF)d&eN|2Dj1Q(k$DT0{;51QnZ5m-Zhqs3 zP}`~+$nB)i2RcqhGD%K_T&YH|Kud!MJWZr5SzqWTSPZ;opZPEW^nkLs5zaqXVx*?M zX^4i!Zu+&;#5+7Z_IME=_Thj(TaLT*@)JK9~gaRr6G>oxFE(!I8y z{cC&wwgk;d1o-8GKrbGo-J{f$C;lBns5yT`13BKr#=!GbixZjnObM&KXW5DwxG+q6M`T{m&|6~gS5Qud{d#ms|k@?74onLCBiXAyke+l6Px6^{03QRmEXk9;Z=d9lo|-VNu*Y# zOiZD0b9+kPdAY{!r&bWKsaAwd)x)S%|Lqg~f^q!((v}&l1YbbEj`8v7wb$sBBb~0s zf2WG~kNMq!8|mFD6Mew-yxaA*5xlJy9Zb5B_jz0-YvOf?bMiK#J;Yh3+yr~3T462? 
z3Ez*2AAIL@(Kz zeZ4PHT^aiC$L{%U8Ujw+kwb`cyF_D5|9bo1r#HV*PK+XECL5Gv7Az5Q38TX3x1e8z zILqd>yf*MnY^yaKDyKdnt_6xm!#;4LCL29{E+mwzrSAp zysUnXVD~T4>7ijD+l61_>i9R3U6pZLeb9z;g2u%1Q_+DnqALMvsGqO@u!y!vtX`BQ zB{t29b0z+Q&ej6ml+%1aEk2aP@Y?kWsn-f)+{eq#@N@3;D={eVZF7X@-hCaHH4O3e zVef~t)#S3|;2SiLx;GifTJ$+eY0lKFTHw>ftUfTBX2!y3tEBizf|mDXP3CKSYojos zR@t?_tR98cvcs1=VvM=`hj0hxM&Sc~qYPHFa4l`4=zeJqZ3nhQd+#vN;Lzxa7dRt| z+<0RJz(CW}gd;8!q*C;M9C1Ey#9<5l_LgAnrbr28bcON{c+~aF#E7AZ$*fDq1*`pD zmr*0Py;sv+AZ^3};l(;|vo5Qua?PeNx9GZ={7a+|AOe^`Nt^{z$y}?>kJVK##3JzG z3L8o<#4^41WX%i+6&7~EXg!R8Z^pEY`1e8iZ_){S%g?Eh>9APW3{41s#Vr!DWaAi zeasd0+_I!&T1LFw=?RM@wkBOpYJ7BF8b77x!1C#Br?#YUI@J4k8P9+|^i|;~s!Yw4 zH*1FIbp<;}hUP&C{8Gc3Z$?;z|Np905z1$s=tr4^q-g5wj}?;#D`tiQT}pmp<_v&4 zo0I{D?HhqJN3O3TL1hhWFF64XiFPrQO4d;7cj&%V25tEZn$rA{NX*((+uWd2qW9(n zVLceDQ|)ns`n>YeQC7#2A*I|>O!B&7R&RnM0GZ1Tty?SC{J&x-<9lS>0uPH~mPlRd zq)ZgPW4TsGRMV9A3b}M1?Ikvd;v29W1LKA~0ByL>8Nx5!yec6kT=@O&8r?acqh0E| zd-c9Aq?0p}=P9prE?aR%Qo|@VW3T3!Ty8AU5lxTmY0@K zGJG)y?ntNSM2uY_?CK7cX?kamQ;dy+>cMq;XGrp&8PeZZ(}Dof+^yj?3Ii2%Z%5LJ zrgxx;`*CGuMF|#2PxK@Iu;45$Eq#i-o4s+3xy@(xQAZM9Uj5s58B#!kRg2YVLiA4M zKY`5(c1`sC5{~J^pQq+QI2{M5Qk7;BcjLn>9zLv1mxG*N z3Y`x#PyH6+VVyqHR?BUMVr6roxe9|;oP~v%vNP@mw2}()cM8-bC2rVSF-*8@#k(g7 z>nrr}WNJ3~a%~9ItEFQ676wkKy-lQ#wBiFT_SLlPH~A5`_o;txV9{ap9X!VhN`OV2$^luUk>MO z=yAoA48gflzLZ>C5mB{0i5)c9lu!EYg2|42sjS^FLM(R5&hzKm-h~7){^7L zMC8ljQq=$80^s3?nq!?^(V?Ex0TV43=-Nl>{0;Yxi${#(3%>s5!gwE(xs6NVNAoGd zzw0TpNN~TXW>|h9aHskexTLf#k9a^N>;%g^S*D;i|F5XTbwG=Iwh^?r zb3eI0yhW6k@LWr&5jc0mU8FI7gJ`Phs2QZl)nYk1bcv`HPLqP}`Vp7kyeCL#s{Twj z&d4+?4eb*twaxnEa$&teykqQTZnFsut&) zek^A$i^dU&UHhppzlXVx>u)87ey%k?_(Id$V$4tZVXarCp#s~dYc=`3y=ZW`#@vcZ zcZvFjvc%A3Wjv~YC#pu-!zEk>C1SE^_9k=C#WjY!9=AE$%gl({9Ci%$TzK*2x!d8g zbd(Y^f9tI1Yv%UGa>a@4rfnaM#*+$WIf+l7g#lcx`<%YbfZemVdL$qayd(O4efnLH z8upLR^}kPeIDuH?zEke$Kv?e!Dnr5C(RqK6l=AdX!z{tL2}dw$qdGV4D0}UqMo({x zfVchhM)-?nUdqR}QjyHL-Y(TC!kLMLj_6Zx%q{K^Kf6xskGW`(0lXNVlHbi70|fb2 zc>P-4q*Vwzf^g}6thJvWGw6kGQZl!HieXF$)pjAO(t+DIk30(3=0Da))g{)<9mu6{ 
zrMF|0W|Gi>2&pqS@G=~yE|*8Fur+!&IlP%MPgQKcg?Z!x6hl;)RjA!ZOY-+$ik>wu zAJ1qaHMA=$-`5J~*?6xshBJKc$jKPFwZo(|T(RzT&{j|Su4Ge`H_{<@f+mYL?Aer8 zJbjvuR;~-GJUOyPZ(5;l=B4P>#F;8SAAXx+H=E;OtSc(Ni5Y<3UOW=4S){nr#a}y> z|4LhLn-&qWc^d2O0u~T#4cxNFZJPi~Fx7rUqlYuV>^oh>h)RL}IAFcbDBmkw+PDnb z#RKj~aqG;lk5h48ccC+ZXkA|M#}9_91$DEf^i-d|voD*;P6%UhiH^`am0MFNe0!$YGzJD!cc z%p9xB!KtqRc@gm{-Xo#Jgl}~`duZoGSn&3twxTD{1X0E3aEgM+Q6H*Oo>n>vnKKka zvXGReKTAATJ%HeAMqp6`UPK|z*)_gf)a2D`hY<_PkFKhtBu69ZF%6kC#u6B<8l2S9 z;od9!Z0!S=&Rn4iFUjz$N#2%8Ac@AlD@!{ac24E>3ew=!Tj3k}w1o5V9xPj@{6F)? zzeNjhrY#7I%TdZ}00M`xotCDc0I`2>xFB79`#mlurk9`g^0K;)h1PuZCv%;CHUucy zT)t+{m%HOQtzol59}V+VnEe224iUF`yR3mt)E=Dl5E~nt%*>W@1^qz)q(bLJ4znCS z1l-B~;NAs#{YbGvmEN#^V4R>!ym4bz3iN41vz9gmN1HeQylJKilp@cTHz_UJ%<7kK zFvC8tk2$yp!qBpTJD^*Og(}n(oyYAf8s_d;Q}cZ)TBUrnsMlm8Gv*wFIYrrYO8IfG za7nBLL@K}GgQ3MHS*&HW)9p{CnsQYtDQQWK>%>9h8`@!UFQ$=^CCl5B^M{uBoZ;a3 z&pO*m*w8qGuQbbVk-U^-u_Yu4;`^R-NX;}ZSLd66m7`rX)9|b|if9b@q;I)D{&mf99%oN|8h|e&A z3fNJzrJHDfE7gJV!Js&MKn+{uyoJTR^OtN;yv8a&8dU#{d|Ric7GPP%~hQ56^E2B@^Pk& zAJ{EP(CHA@;JQHwB;gwBlDMwHkdliv8lkuGVtZ0mQjdIkmwT@0o`@}3LVKG;eo0^Fx6w=z-|fm762F@P zjZT|p=iGbOGM&x5W{`|faKazbtHZ}KIn@Q>)$WHk+axiyE}2KGm|VvJdF~MmX~qds zpKj+Niqn=jJ01&BYTC<$ob|!?A7K1zPLT0u1k?W;`272UKSiK$(F;F{AjF&{UHf(= zkT&q=#z`-%A-HZzvV~@;*~*{Io&mu{N=Iqu@FJGrbQ&bMSR?qz#ok^u$Sf%*r2XdN z^@lHsfwU9Vk+kQ+Gfs~a0_lqA9lI^0--E-Vk3-#Y)mqG!I?zj@-kxXw@9m`3MrWk9Wt72P?Z|}Lq z?rUOchKi*hofuOww}EKVZtp6m@uWjW4xCp|l5maB7WF0fr=C4e6c~fX%Z(IY#vkrb zGgg07&Sg?pYYRGfl&9cck@8fo@l1ZEmlUEE5o8LfgJ{oBzbK*~t)rI=kmb_J@!WgI zC&_cG2uNMvv>c}hWhZ?SSls5Nq+f|T|BN$)%UuTuX!9@cr-1LyQG1O%Nd4KaTS(qI zp^rsvS5JIaSf8%P58!Y(VW4)<(T)HUn>vypKqaFACNb?47MP|H4M^>+daVI-(>!A@ z5R5p_w9YQwwAYb_C_#Z0;#0K)ruX{N_Ebv45a1tQ6E~dN zsxG$7JOG%c89XQnlZJh5p23c$VR|jq){tE1Yp}uoA?hvXeCv*LT&V5Lz4p4f3?6$l z&8BBKHl<51X_B>8h)(%H_RX1@$f8mjw@Q>`;bFb}&knE>sd=Wv1wswl%-+oJGez>> z1>C)J3iHM&%L`(yL~$B; z3R&-L2v{PBg8H}A#*Y3uj0}Z$wIGM$E8fPr-!%`MA4X5o8J1#!0PHY^V$RUb*P@Rs z`>N*`wiR==KYu}hz#Bg6E*~B4nJrc|2Q&Q|8KXJoL|cr5^wvh>IP!XHLp&{`VUq7@ 
zd?v)>+9m4`GoUV%ly>lL6?5t@&E-_D>RQYY!B-7rvz@VZ>`b&T^%XrCSWOV1QvC_?(Q~<1n`c1furMl%j?3z)$TxQ{uHeXCS?KJMqBx5 zaCZ4K5E*aX+yN5-4x+(IRgB9xw9CkwA=_@~~a0zzWRk>C$YS zwSh4Sc2A65`ZKZa6Z5;%nLtj=l@Kch`U}FXE5w?X;Qt;@YDM~e6(Us}=NH|iR+al= z`DBVAn`!K5J@Yqxf#ML6`@Ka3iH;ERx30AkUPeUQt;}Qy1x~4fG@`1#;oJT$4Q?=? zmE~wE*7;=5s;Rzo*RIn5zqAyxbX1U{8g($);t=Q~Cz%}TNpsw8yyi==D@}{Rg0mij z$#+L-68G(s)qZHBdCPUD`X@2=&)4Yp z-0$@z0*Kf7JBYVCX~MOo$J~amuK%ed5;m_e+{yG`V{n)X~8yZH`Lsm7J5$5fb2sA8JU zbl)&Vu~`1V7X;{EK*T&|Q_mzt7R0(gmSm2+o$O^gqUgCz{fP6;)~bz0NYI5b(oavR zOfDS+d$LqfZ`F}HM4DPL)eIFJq+vcp8Bu+ipsO`cPf_%q!F|2wkEXllsLc9#PW&qk zhoeLNYRFq*%hR%eztLR9+RX1gxpvR`|LEXaGlMsekF(4f3$zii3J@4&h|6YSbsrv2 zSrCK^j|x(MSKRtmz#nlf1e($NI3?`9L4Tket9F>Zkgt|`=mm!6NfsF+xI!hJ!OtjX zPda+)=o=a~XfkxpZfZXa7v&7iij6JN;o7nWTC6%3dJD!2@*fm9jF|TRSi|4^^w^&+ z#VlB|w)wdpSeiX~)mH0tC5`WM-;+7RnZ($TL^B-QGEEIyIjutIH-r2+l~ct^5`DAl zQ;NtWNWA-Fd0g&@xUBYQ+Z}O|GCrc+fa1coRx!m$yBJGKL8$a6OS2`+Az~CEIlSrZ zF!zbAi6~sueT#+;{%vByd*P(QEj3WtSct z&x5}&|0#n7U}tz>x7-e4>NXtmd}09kGp71PDjt?*#DObsPa(UHK<_qYAlH1cq?d52 zz*Uyq?#9I?>9p?`U|Zx79(($|Jh{f)0pE>KA7@APkvV7l(GXL)AG5 z7Y&fROcf%~&a%Po33bUQnXMTle%K2D)vb>gdccN(SGVFmNS+};L$zRry9je(=~CGr zvAxB_lDmIejX85j-$Y>JIM*MwHR!EgA%xo~`s5yQ#e;%o3$n*ANW2oBYAdba$TTe} ziyy`zt|p$WI`0GWvZ8z(+4>&!@ktyOV8l3as|@=j_6E!KQM&D2r3_~|p8DCW**MNV zQA$U&9pYr~Gav3%MKF&oRxW|u#!Dj*kK2xYC?54xS1i9}CwJ6N*UW(h?{Iwf7{uf- zP2YEB#(QC>w?$>3a?QHzyMNcCM!P|0*K0nv@^DwQNL;r?F^m}sXyweVIce_pMx~9? 
zELDT&M$uvGA|ig-7QLk4G#mbac<0$4V0}fxtBG{}+1UGMx;7aLR(M-tRBwe;-GN)R zh?3~t4$$vf0UP3je8ETBHV5lXJ#iX{Ypm3HWYN)1| z6Az#$0;2O64Ihp(t#W-AlF!uqlrDZIe_IRZsYKJMwReY?66X#F0CUWKhQtl+Ei7O&gJ%GT{?Y^cLrV$O8`-g3Lr%9@BSW!Qq^rYx?+&o=DCAiiZNa5zpEy^D79k{Q|h6^$Q)+L-e z>aFascrK!8?@Br^A0NB_N1gx*=~0`exa$R&1>l`CO~EpdrctO}B)IrSqX+M?O)%O) z3Q9{$Gj(nlZu$aUJB*2wABgRa6BLQDfX5qhUt|PmOcTNdg=^%&6pz`3&)u=r?69l3 zK03r3*Ma<3))#)Z^=sf-!?VV0+2GF(sVFL1fEfdM^puPlTg*owuFM1=TNB0Ho=r&w zn9ig^|1y!&t#UD~&@94h2AAPB1N3|8uIn?Bpbalo4d(Ui415HNLGOhWn-WpU&y3^G za)&E~+1q&1wivS!hp7WzAzi}K(nmMI0g4dB<0SGkZ*+L?hYJnVe5(V3Oz7AIcn_qN zxR{poFsCcgnU`G;)cb*re?G=bxv#%J9lHPB5#AfQ{4=pjYo3yKldAXPD=E zjZ5stR8HHq7K3H7R@SL_x!ah-_>3cF`FEk~)F`@1nc8my@roI&UKo(-)tn{#t{nvC zU4ysY+1>KYlHeoVF8fzM7Ar23WUvTUIvlvJq zh3`i+@C6@BUwfACyNj-F+B)oiRu{DMBz*q6I)JB`l%#F0=2DeikN(!cZ_#%r>*}r? z@_;*+N)M-DVs6ookYdi=-w`+yHOc^Hb1CYZu$(*u{`VPD8@;W((R3y^0(d9%ud4*s z4^&`GfSFCH@6q9O^^{XD7!kx@Fpla3E=WwiKNRbW?(3!8$LtB}aOC!c4Yi*ew=tNG zOU%r;MoZnfabgsWK3vy2uSogFUf(vo`y=&iCLU6{R876LDeMIA;NPy?FD(qX#dB;R zvvlFR$#_QsN|V`tJj8?RoBzj0P-9KG)mKknPTn325XC0dhuFgWN@_4_k9o*~0-FZ4 zc_|4>jSLf7-d+}S{&(f#YW1pk%6q%z9<_?k)79)`DlrGnlI2;ZEA_tPw{|=pc52pG zHLlH`mz?7vJVBrN2bXuQbNY^GaW0?bUK^if9N(1;lsLl6%*f&QV({v2X{^u5je-B&fE%Q;<`C22F<&tWWSm`D*0s0xsj6 zis$#w0k5EReGb99T=6;m@j3Un7MOwPJb*&c++YMNsgGq_P`#DfKdQ`?Ihq|b@LeCq zAv;l^nSTvdW}m)gf4yU0_q_bkgw0d#ma}oUt$r5ULhH(DrHT=@)&iB?sDkXGi2fuZ3(+>zm^-k}8TS$MemQey?yU@Q>u{~^ zVEIXG(+k5Z>=|7WFGGA=G8%oC)6#Y;d6=STkIeX_v&y6!Lr?xtw_U@tdW&443xC zm``;qiHC8sL8~s&RZGr*gmS$97D)0U*#)$b_t|6!FGh{MW8Y2}I8O zuHLkNk0P}Boi;yrPo7GbUh_~Y0E60MfZW_N+~dw=8GGRHfCLW*(p2-_F6<&=JGN8= z9=CgmxsBDP*ubr;06?mhppm#^PyMoi+P4THrB!`pRwQ?rpN!dfeTP>ORu3%7*PdgU z(E6N;-cTeE+OHwY%AzGQ(M6e74eKhW(|{XOj11KJ1^=l2(c1G~9aS*0?wX3pvrccI zwA}v;TE@QNo@(e9f3ARm{KpTSQl4_epx#z((f=@2ab%aZRUt!-DSuc!kiuBTM~#rU zvcfii$k%48(EfHav`zF><3<8K(y*l>*=Fw3saq6070!zC?Q^1ATmamb zSRLqH6JnY)D(;J_W;1s=_u5^Y^q8{;a(Zo6M=}+GFW^~8{~phpn(_J5N?X0m$@-SY 
zZ8nnxs{LZGi94k@@AVq|>n8UlpHFeSM$yXavcpBtjSg{&ze0W%quXBz^#2O|KY91!YfCB8O*>Gar0;Mk=~>{#tbw|retaf7t=>)}nbYECgZr;|})+4?ID zABo6$I0^F_c&)b|mCyNIo0rZ;_?W`wif**8%SBu~eZeUny({zL{#2nTu=_IORcycX zEk38yMR~dcot~O1z6T7*OuiuWfR9vs+$K$9$JSo0;9?-KkE)uK$Q|}!mQ(AlFpof4 z!}yn~OXl2jhm`{nHee?| zXNDcaQ9GEt`j!#A!NzF6;5=amFXH!Ag-P2u7^?Fd4ATlbpKv3ZGv2r|bx;nER&_FI z_i1;e4dyOGM6opwuy?QAIK40!&oksyC0=_Du;sjWzXANw_N1Tw(YzTCsTuj7@#lY6 zz@-F7IlF4=^hO^bNArCoLt*$Re3`w(`!4YYn4@O}P(-in4>fSXMLL@vXb3OSub~So3fisnw&Faw`N{(vw0;}2cvfiSR^NB!3 ze=uvjr?%;wJM0xmA#IfXG6!BKYZdd@U8g;Td^6qB(UyWJ3IuGUsFWD5otJ=gu{<&| z!p|xDsA60Km}8t_mUGH1G)93-h%!E~@9|upiQCE;vq{6?-YneD9sZC)61cWa5-Zo@ z^)#`Lpis9$LT{$_b1HMqE+pgaVOzfIZjFwS5G8h!Xhn^jE%0yY0jD4Fo<^H{>7A8m zq*RZj-p>r0fNR;Z`zch*HTuX^Z9f)2U+I1D2Nyu3m!@eD#Ye&s^|4~ni>k)SVSqJe zfzdd!;EKNGnYC$4{q#~984i+SS(#k3W&!k@jZ0o9l9Szb)+(HZtK_X zZeoU&{De4tYxiG!@~g(zBwlvkeNi_r*1AWrDS@)WKa`q1@t~OhD1}G*Dqw;%UQD#- zc?q`oq~$gU%dXb}MDqI~`QMlO$DcOqf=54shCgf+SSUeG3{u@1e)}UktI5%+0kA(dtN218JS)4`+?Sl{5YyF90{JlJEO@U_58h4`C48BLrt>#>Q$6O57T;U+Ue~54UfQ8ajIh&nJy( z?q>8H{*RWUQaGG{sJ}U)`Jv?>sS)Uqv@sk4ba@d9c*ojQT_clBLs6@K=l)zwe&%fZv8qyACzM;MT-~Lqcsj^03)O6eJuvS zJeV67BlvZK<9%U>!|oItt5)#u$gSr(t0}PuEvmm@hpGQz3p(+ zOuM}~k2}N~Ji0w$ZtU9~^kNEWo@JcuytiK$njVjL2xWA!jx>gjw=1k@X;E#3`j-^* zl+#4$;2ZL9PvJ~H4$#T$Wx}uq$|Pd;7J_SPe$-Xz)xIK2VYb;Rj;W;eFrc zz+xH=I2Qbg8i!R5;ev}zQtIz7Pr?$x=oq1&W+#uYUAs2J6*=$Vu{f23Xo5)FsO;(H z4$lHR>=g|w7V`s*&CU^8IyzM3e9fjRy@d~JXqL|PER7XC*?c-Fl?3$4ypxadOwXUab8ex)Bq5u=HAh$U~`aTjPsI!gamxA zWVD%j81A8SUf$TToLgH}X=g#;3~C*`HN!Yy1cK}HYDDis_KY0T=0S&tH7AbA6)(_H zET?{BP+DtT+5J{W14}5&I(|~*LMhPs{2r?5Px>h!7qb=Q?l4q&rzdpIk$RmJ97ugO zQqsXwW%*JYF@|1d$0K_bpEb5s#XQw^&*ppR(N^^@g7p6m=LV{-9VE;ke@sY7v{l!Oi=rba+Vy&lT9gQ-DIuEn`|^HO5FSQwoZX08K&XbW#wV7$+Xz(DAmg$q8d$ zwmveB-ruU%$0m-xH3RdD@@0wP3?q+kY!iCB^&Mj9SE1|8n)=v!d_DR`6nGE1oui81 zZV+sQR_j59r^cT7M-#JJC&CC=bBhtkC*2q&E{8%ren)G~!~NB~{hfm*$(Vp9fittZ zLHQahWWZBmZR~D#StZbd>RUzFTL6^6L!$>PVKGyJSYsb|nQnS1QA_)?V)V{3wfXYX 
zVXk#DAMJUKgk&{HhJpxb9qy59HLz|2GZ{qb&ofRgh%Y6DFv3@wvvIHOp%9jHSzbVDQcw70? zA?K$^Z|0gq6pOJ5Igqp&N-U@{Qgku9vXP~_Yn(Q`MfG#1zf||$H6`U%>#I<0UeKnx z>O5eKEWE<9O@&*9rn{u$cwDDvs`bYzBw$o(shO$yzy!=;nEc~cEbPY#QMBY!SIf)F z*!qn&>}rNZVAms)ult8+%Kbtr-U=!)Sz{(pURa-^!P|b)&hTPiYx(NtyhKB>_401l z58R=S>LZ=_HA3g%amg5tIPF41b*T5o+4^B?=KzgQzS( z6&THlgKzci*ub{bbYQu_%`(|{)7$3>FgRX~{H85va!TeaHE5wiLX3b_-Kf5FR;IC- zMZCt*GBq&*J-LF^gs`L78G!OQDr?E^q|e#h;g?bQJRe*XZ+f|$mq#ueWGje`YwKfA zS073^L_kJ%2xB`{0Zn=O%gux(F*&aId(T78JZ{YS30j$fXEg z&M1>f8}i&?0D4%}&W8(PKq*{4x<3b;>Hwk=7a(ELR|MbAZ11`YEVrT2Hc7Im_Xu`(N>PW z-t@Uv>wS6b@gd(6Jj2!}#}PdJhrY~L%INJ9w=BXuxI$u{K1OC*z)~r6#eyrPgqa6w zkpq=ZCWu4rXS}Gn+hIySTwY4Hyu=qj;|4p2qBlY>C7#vIFP4$3+dqsu)JB%njMhRI z!p3Xweeq2oYCG00$wl6i%hTwqH*y=(mg7fgLIy-A5M@<8V>zy)X9%y)e;pHp z+u)@kyHX(!0p-ek{S`1Xj+ptdt>R8h8tr3PY+U^W{2B?=_YJ~Fc0^W&qG604x3`A* zN%eJ~Z2;vxs9k_LXL8qr+_uU&-~dPHww1fG2KuFjm&`tVRw6DOIWG4!n>wF#z;^3`6{M?ev%1X_Ct5Uiy%PUMuMj!D%gk4QsTe&p5pm)3eX4iR|K?`OS2>Z zRaPKYkI4-e^re;mfZDFdfk8p2w9HK8cslS6h-d_!+ z`q1mCy+2`^4!O=u?9X(w!*F4_TH~Rn(m^f7oMML8H$sL0@4Xf+3i=(RlJoEn`o3z@ zE_D&u!P)5k;r8s-RrUB&R{@a;E#`6RNn_ntZ*TgS>qUrrVpl;H8X+jPyb7|&m>e`} zBAapY_bwQ!&PE@*_!h5xXy^inl2n}=$G8^mgbRW$-S;r?P^AQ{M3_k@{B>n4IKM89 zdVdR!o_lKW7RQ-vspvjWHK|0R>pB^GKg|ZB@v&SUSYsShc*vk-+dKJ|mclC}Ue7-h z^C)4UKq&834rEU6%l)3r%7jb0V7RL((D{&k|t(jl?)^GJpEqxw}a3dO?BPw zh)&B#MR(p-AdhN!w?;i)@*u6aw*W4EwnAY9j@0OBG^5|m5L$OtDi|eW_8D*eM5t)cY7swz ze?yxPh8$F;=pU&7*kk}|ms(|BszhTVynNArwE!pmnzN&z-@5YNKwi0dw`9bgfiSOX zhnZSmH|eZje$GSkxPbgK$0|+CIB#@ctjMVT1f}2g4+4{;hcY(qDUU}F+rCu0%^BY6 zkL5*cn^tVAU<XT9_%*0u z#tKNyiXhey;FatTIKgW8i(^=THq-}Fs6FchmFYAF;76m|DbhTdZj@UMN0Lpl!(PVp zFCBQdi{t8U6K|+7o-;1PPC28r0pa*fRI1=&G_YHrdk7l^^JNLWQp$6(c^14A)u211FQx4L+9dzzLkdJzxJyk6`_5`8H%rcQ>@hMG%@`LJ%vq$aR@Utmo zfp61kpam#}r`uv?=8pw7#3bpJ*)L~|jLqV_A?#+0cZznqW)m@u`EXz1%w2Jz7bjU( z`aix=zYcW#nnd3N?eikT%>NrB1{N>CtM!@wWQ7j6yy>9qasXc09BhS9+OPVs}h|dYu>QLWkS@EZVu$D^WDl?QDlCdPr2Yy`CZgp&Hw~TIY>; zQr5rSyL#Z9IY8q!p-cLFP&jSpLKE;+v`YRIi|K(bRF@aRV@ff>>bXOX)Q9CnvdG)o 
zo7BoALFw28-p?2P3uc!*kgPEYyDLD(kC&8|I@(deI!n2y80V`&Uac~>S!NlXbsTW> zSsse}f3&>^RFv5gEsTiTAWG9jNdh)G=OBn6shcE7l$=2ck~4^sL6jzECFiK*BpJa* z$w4yM0s=~u{7%meGjs3Uf7bujdTaHJA1;{TJEu<7u3h_7{rs*#T4GRs8f2y9BReF) z0Q$(bpV}n>Or8AEXGaUpIMbl6RT14oP)vUdFUOZ_E8s}Ta<23i{ke5T2o z{S}R1+tX&~9R5NJP84Ox!p(!moqy@it!Hn8bG}#Vu|H`&=K_QwB72QGPECBwhtJi| z1tPL!li2YsI!b2jgn0n7^r_=pJILY12BwAub^a00XHs_0t+!O>IXTuEPm5f9E;A@* z@uUhIjI4%8DCj~0s8qGyiW=eACLXFc+^N;eKPM^~eF z1&q<-1NjBYwnz2g_j`a zM9Qun%%+0_^_<}PN!->kW;*)8D3}jPp@Ebtx82c3A8IwD)nhn4r7$N2;!EBc2aF7z zn2|d&VX4pQJyp{y4aWU$V?$mH2q=Ocshqaoxos>KZOR z&OblJ!EG)zzA8&Sh3!UqcEy>ZiK)W%6jmHHOp*i$Z@z%)B%Kq{7X$;SAEbPIGyQ9w zVpGKXMX?L07^GP$eVoRNup}4m7y6#JGDI?r^>ru~3gPldvkd++20k^uwM9$W@X&^a zx8b2HHq9kvRg;NPnH$PfD~ZjSnNs+g&lGCa@^b^8*->@xuqu7JbsXoq}Sw; z(n`IlVBDT6`<6nzqC&O9t1pjt_R4b;T`fmN;h`n^H7-mr=o*l=%O?VP+6IS$P=VhV zMJ0`cclZDElMsv}+idwh8UeZ#tzFv^^~^Z9!ywBx8U@|vMjz%i2n-Gkcy74+NDprf zr|l%Jb+5Ciztr%yZ=7kV8GUqGu=&t_TA+Kjepf%hC1u~5iz&?SqkEc@qcu_o%=Zc< zY6fH4&m5gjnd?y+NAhzA=T#F}S3m7UGtCV))LU-&yx+75n4D#JmIX;R|XfN^)uaUx&iqbOq;RuE!Eb-sa>o zhHVgL&A*QQ`8M1)#pk|*h{DTLWw`j_J7ut4TqK^#S8?fi`M|TvfOhu%-{0upulA>h zKy?ye7mpT`)R+LI@U(u$N6wUDSeU$!owlPfWK3;2a|Fn(7kyp zh-T9MWO*kw2zv8z?X<||WHm-7!Oz>_Yr|&dedJvJjrqg09cmvs&jE4%?v}d{ufT>9 z_eQ^8n=_38+P-sf`|@ml@3xL_a@JTzr6VBqS`W&!FIz^*^+nzsUC?WE-zeXjs9d6` z`37b+D?=EVPMG|*meZsr-P4NP!2oh_xkgVVwOAD6xZ!qva3_N`=vPZ#fW7)lK6gfj z<(`zveJ3h;c5L?c1w^|LPHOJQmP%u0Q zdNp{|65yCFv^Sl=w@)S0Yh1hpVK5@cdoQ(r`>bx)7*$^e{_)p&fH|Uv?32!AW@Pvq zP==fr+4=ykFO#!n&Q5sHjRs?-lAbDx+0vLhkl5Ojfw0B%*bmzX(qY9)jrK_@YHCu9 zg!ks?pN=KWu%pY^F8nxON;_9eeYZ9>FfbtTX2xMI_pqeKKtw>mMYC~cwm22sskjG* zfj@T}vpuSRs&S+`cfeCFFjg_uP!)Zm4xA>Ug8Dun@EH zrGBUNvLs9Q#eca7oIq2UCtAyRu4#G2{)22;Rrpx)or7+>Sj_iM3B4yAnkkfCGqEG6 zJ0U^!=msIgLoa><@qAA9_)hU)rOC(#>soSe+Qm77ntg?W_ed*AV=+0;iE9V>k`55m zoC#s-UWQ1?WSzqDVot^n>w~rL4Rj2iR^8+mHncDsQn}^I*28w^%HA5)*O_&%?^GzG zE32DkDMlY!BYU+;?*Wwz*>S?v>{IwA8A}Bj!Kjbny)UuX&8%mjd){|Sp07gG9eHW^ z2(VV21a`V(@=spj->>+`hd>5m;40u1lw&D4vXYNA-%-;!brOft`Tlr+Oh_A$=yH}U 
z{+z#iRuOV|RV6b$ZO2^2yuR#M8oA0LlxYmE+fD12jNi5&Kp-iuBN{STdo*%g$B~((A=R^Hbfz42a za3PV;orqzXvSZTk`PQ*`xT7wFm;0aY04(1wn8(g#JdMOljz>+pGV6TpVg0JVo%>>f z{P+%&OWA{aY|=cXoW#Y>ADL6{7d3NISzUxLQM7WCvW{v|R{dy+x@Xj?E~p$*q?4$! zXdqFcvS>QRze0(~xw79}}^;1cV=XmH3ZHrt;>{L+mcVAMB;D9kqLTE8>_P&amYqTPaWXb7U z$l$f*0Da0mCq1qm=@EUd+u`B>6LiPpwDi4LN1GV_$rbzy2mSFcIWho7AZrcgXu%2d z?wFD3p0E#kmH*hPKGx;TVtSHeXjlB6X1F?V*7qRlNF5Rx13J9-x9!L8+l>E^*fL}2 z+#-?LCt$b(q__Frl%aKHK6i3=Ngf$9bgqM`19y4Mqs?!2>%wW}n{pCWm(8N&Fihei zLrM@QRVbb@SIcTc8Jp84SK1EO8s$eTm*-2O%zQhJqL}iHZXU5n^~JecD)#w#dH&{5 zeCx9H3>CI!xei8g7YcX%;?*7Y>tKH7BhX)1YCY;B<#BLysKX#&ml`&O;)axjHnMWs zdq(0QjVj~An5fFfiJNC#DQNYF1=GW?$hjPUUkY$F7KeTzoGy<`@F6h92@XIfrA?Aa^dfMJPz(XMiBHBP54TLE`)iH6?i|g zfqh23_s?`WJD9l)u4!Jm9lfJs_(6Nh19Z3!1681tMhDLBiGF;Bn-8S)k)FqeTgNQK zGUSb=Z+?im8S31au=|%5;DCoU*YW<*h$1sfzS#FWU!z*5TF(P>c<(}-!mVOJ zCx}{#Ig#96VwsFSu|&K(<8H=x+2S?!QI&7A=ooU|QwnSxwA>?--2JgJUAtBO^v2#x zi}%CDPah!FcKxJKhezHZ0t|+j)@jkoLO$y>e5vfA^H~nOcx7{#OEuAXqBJbm<>Yy8 zPWuDFbzeg$Ud)KqsacfdOX#+o;T7cSu;1LAmE^<|DtY{8QX*j~+EvywmZkc&n?HcZ zzrubr^Fd6iakI{b==YWEN4&nZrpNrY%P7*wVm;+k0=X?T=R==MRRuH48rh3_&w14V zLm7QGN>vpzrN+K3k4}EXz%a|e%7ws3jOmL~LZF;iMmMGZ%RE_OD}~M*yi7!oAk-s(+YdO{X&(<;RxsR9jWcr_7}T#Ja{Q&3aRqg+$UJGrBe{B*|ikj`)7TBZ(8i5_Y^0jLgC zv3pYXk3}(|deN~$E`|qsaYKqSn#Lg3=aRNHEWK3n>DG;NIAC^@1bk#Gb}a6dt^-4C z;l8LWfXR|br#H%+g+VIISIi2IHxclWeqYklZvR7bgS z>shN1)wNPrCAL!+kfY?Fr!LR9ub!mU$HBad9Sj&cFG)kcp&g9-3t$}CfyZPb#}8@5 zpZ^OcKD|qXm*KNlTD|d#`J7(PYSUHzgI@#aqvH^ljKU0Au60RF5pTkIREQh5wTk*& zezKtE$#r0A$xM}&9IwKS#|TnQMJSN z*^1@kizEWv04AJUq-@`@yh*y6fRlQ_HWO`OcqondSZOI9AZhSg-JRHM$#C4zj4L91 zrhK}|bxk-vxQR|qc!kdP`dj5&x97-qat{4`lqG|8B#>W)|;LrI>T?jZmM-^LC%Tf^LiC zC4;PiG3dmcGGuNVaFv5uBLagvx|AW+aYN_kie;z}pp^y?@Fw5K7>IU2(V_3n++DVr zp+SkkRQ!$~_u_{1hH27vrq1m_+1Q*d_qL$9QI}XPaeGK>;KigszI}A=oPXMbP-hy9 zEnuuXNaw7hnnR%5<0-aEt3G6_k6qGSHh8MF!k(tH1w-?}P1?b(!|_sm4@FQ3n9e2b zzH0Xb2qcJ}(HGU1>i|Jr#F8j1)1uAM1)VuMI3HDM{B1JE&=;;^OA60zMS)c?t(DCdoYd za1YS$=OvULT9Q1MKN2kk{>yrzZU(x{`K)s~^^VP=2Aui0*UL#JhN`sqxL`{*oL~FO 
zf2}*;%}7h42)Add4Ka*lp{|I^6$+~xNXolWig%Vx$-t;?4`h^xR+r3(zFca!1{PDG?rxkX@h51RsQ)j|^84TOh5#q?Vj)j?0tNoG=*WgwQR6eT z+Vf4H=Iu5YwZsiQ_L~bxFZ>9wy*gkZ9J?qfXuZF^8F!p(F+|=-1v$hc%8+uQaIsa- zxFNN}mH_{*U+PqI)9t)bloHFL=n1h|lQhDQc; zu|Ea3d^v5sbD*HGgEDeig!bUu&No7%Ua|9spyb)wdB}-e1uvpjbYYtredwaHFV`e(-&AhhvxuI7@v{vH6(M1ki?J z1}Kj1>p+|opm=K4bgZ^ER6K{j4#Y44#vTLGZ(pJT5>l5b*c`2u(AzsPiIU5jy>-o= zdoeA*7b;{%_&XT@6f_tExAm_~!B!7g`t0wwYc~8qB`GMmaXJ2s2t{06CegSNHZ2bU-fxN z{P}+3CSd{zQLOeu;>sW;@^wPDE=Ot!0j*v35q)yv9en=MD%Uf4r(y35^9%@|4hy3E zp2HxMK7uta;7e~Dr3i0~3fmBTN^9-`+l2?>SIELWsHAo8NF`*w7F4x8-EqAL+9Rkr zXVP+Yzw}Ch=7)|QJ4O8C? zZC3P5;AjE>cDbk~s*)?{m5A26@J>eLYAdl!G(%9DS?pJeTwir$=KS#)_wWQ+pH0Hb zYO0Egiq~zA&1Ju9{Lj#NaT3T?7vH(o`Z~PW0w4?gN1_^Of5Y4wCgY%`^g6C(?%-5O zB^NN5`Z8hDp}I$>X*=13OKN=G{tjL>M|z8;X*<;_7kRr|WFTS+JBRvW7@g7N3AdRY zw%p85yOVulQZ;B@b^UR-ado5z^^rX$EmIkSzaBm9cpPDYw)yskASkL*k+MbAWh}sR zzAZ3c3z2*}_Tuah$IJ#drhtN=ijDl+&}U0xZdK3WNE2@!?oFOZ^_3_w>IDv?7(I*r zI|*&@9S+7!b{xw-x7@1OpqlS~^=A(4i3r*62@;8eX5_^0j^j#(;f&oo-F+_nc~y`^ zg<-OX?on-2jFyNBWfV~{io{1WU?!-;0ps||!xpmi5eshLk$jPT^7Cu%#%xYRbnw-T zjHGR^-ZI;tcA609p-XS=^S#iTSDlB<@0Q)GM*A+sbFJ!Y%4W?%uh(z9g6`QN-q2!W z&#pYs)j59oboFoOOyGnXV4(jc88H=MGT3#7i`;}DcZ`BAJ`Tql@9(_H6m${<-8NFe zlx1aLyj~K^d=bLa`;nZsGhIYr7G-aAq%x?|OgfW02+nR}2=C_I&=P1pIs&yG5es(l zC^x6RRg(f5vcOiVy{^!80Ca5Cv^~k=ocjST=<~Y%4Bt9^4Ku9(z(LdY9F{<3QY~t% zKBi=Eg3+3+hpOI8Y%k_+u$pw3T)FA{!_;HW@8Hf%PgG?qb5N7~DeyL40N<2Ux2&$N zPVy4wT6E@|-$5Umx^*L`0k@_I@yaLz6Tm>Y`lZOTv_EBSWz`KhxvP0+S|vW|4>rK~ zK{W?KDjukeY^!J^3rdr=r`{jrK5wKy20H!D1DlZIZOb;B$>@u_NK-1Iel&!8rDqiV zNF>xDpQs{Qc9`6aP)xX9awPl?g(1+a-qz$(E0f|;FjULpp@L0)V9a8_n|GAW{Ax~TxhgXnnJmoJY##O|Us_OCz85q`eO851Ux=XJ3_7yT0 z55IZTw}8fiJy@f0ZE{$}k5-Q;O$#NSx7*eW89HzyBqP^5<+j)inh#z&=$G*Y1%9xV zgIKzQOP+-9RF~(e0KTA6io(O9baC8Ym;Y~4pUXqM$T94l7T(<5rj(2RaV}?%x+l!> z8KlN`n*&|W+8xY+uLs>GGB3wj)u;iLH)tsFZn~)hwig85y8djQcIK<`<{&_lPsqY` zK&JY$gdWmj=nII2+d;3?9#3V^;Jt4pos;^SWInODm{nfrcolQPWw;iN+9+cVI#D+S z^@+oK*Tt3(LAvwFNhWOT#?j+Ts^e~hd2zEE9%!Q}oj&v5^659R5|8^c!||xY&DV;% 
z#5{cTd;oPh?nJ|8U&)IGs=IvsQtdcpQ4|0tSA$+rd}EI69HNKJilTx|V84d|-Aj1r z=Iyc4ps-0ZGb3*0_^3M7uH+~2W0~-pcoMXx6Z57CGHu<^P6@Vz%u8ej-F)bevrVF} z0kM7)XL?&6qC`pd)+NWXb~4!xQzp7cFFhHVzq*pVQzItt&oXs4LAl&N5f`bu>~ywh zKbqNE*;><#^nTw}9fT^SWl*_g@`z-=q*Ls$DWN+iNL1;4nX_LNy&coU*&NFv9(YhnhOr~cAuCq!`RkJQv(JyT+H+HB_NLrvh~6EMo4 zj4>VbiUVAEaMgQ%2e~@hvZcW09jUJ0J>h}RkWY3bXP>cneAYUpDFDbq-^Q72CN#u$1n4u=+;7eFnIXoM78x`E63J@?g=9yrWlQ*%OI`zkf+@}@fe?BeDbZz z)R(Hdx?hXfEc?++gIlPwipr_6p)qn*$#?EN?P&7fxot6@ak&b|iq^PWrW#wbG;Ny3 z(e0M4fSS#m49w}>=D~j!x@Ibe_M`}oKZ#wWAh{! z*JDj`)?w|UUK{8%(#&XRPGcYzwJUh_E%DCxwIYu*&a$3@)BzHPubT~HkD+Px390<@enf?0K!vr20p8M@w*+`Rhkz2z4rSvAd&F9szxu+FJD@twFVh+UGn4dj%#N2=?kOJbX)mx+yo5d!&1S}&p!4HofnX+fs59k#}TcHHZ!9R zwdapO*;uN$ApbpOV*+$LG+6lNLYl(6k?6p%&23O;20ZwrMbp*;r18e_+4F z$ijowAIim9P%trC9e-Xo$n(z=?0tJWMAsw82`A ziG|c__5ila8mqA858+<;lbs{%mtN~dm?Vd3ANA>xE$=d#iNtV~E1hS1SsD%z?E->i zAeXbQ(9AF;+nPf+L9WJ=(Zz6Rk|#WPQf+bdS*%-rpu-Vdcs0#J5_bRXw?MtLRvFa8 z+8vSZgF0*J@@hY~*j?n}1?H5w zz=B$33jC~D{=lBMz|KhS0b1?Wnd=mGYu+)>bW%J zuzpF@M|p@0GDV`>>hC82n}9IPEu|^gy(m}g&56qZ{I`FT3kH3@#vcg=U*FskRZVp{ zGe0Kl<1!qV{OFWseU%{^G*6QWYmQ#}LL0K677R7EfA_FZG11Kcl(rTCWilN8njGIF z#>R^bb5hz|W*&E%m!JQVIY?RR*Zgy-22g^1yZY2^tb_7)h{nsY;Nxh<&KJX$P-R`kg{Q6jvrR%f7jXCd_F9ZTq!HC}oBKSYvMJapsQSg(MR zQ57ZGRU1LIsdecAB`yB&2jJ=je{GIi!fBOfptFpA8X0B4t25$JtzQ&;f^L=<`?z9j z?Sq?TX*u9Sh`wN^R#mb?{yk1()yh{xHMX+F?MFNOTwP7qd8tmbX$UUGTgN9^-;wJ3 zaY^ky8L<$iJOoT`m=F|`TJlyeOln-em}s}@<`te(8xG~?IH%^h2bD|ze8DVra+hSlSiaq#`sDJx{VBq z)lHPt;bsQpah4nvOs78N6GJOZxXJ8PrDylt8_|!x zeM#L>tiW6qWMBwv28ME{e)sno7?^bdf46&E2kSX3Z{dh_PS30M*sQ zE}nOfQ;QlzY{+ayGXDm+%nbVT90qpLeCCO31yWC*iMMjpzgr>Bg@5=MS;xX~7bGl& zt*=DDKn!0cF-J$$N@*;MZ2v>It_A6%lM*|Bu^aa@b6B(-Nr_2Z)b(`^@m5JN+k2So z!gfsw?RsvnGmS;l3;JwtcrKD|WqubbJdV4JUd1aEMW}scic~qw7boUiEhj1F=M$tg z+`L)dIJhH{f$-ju4XbY~YqV&Y!`x?HCA(5YIwB9RvgOgGbt+GU-(8_$JE(zHCePAb z3TSBI&N7kMlmLC*Uz~2|(2MfuNP%+UT3$n!h%_KN{lw=OCe>s`%u} z3V!6;C63EjR1_Ep1NVZcX?p4dq0VNBfsXTOtrdxVH_QV-WiHjhk73pAR?y(@GLY9P 
zcjF2Yk0q)2IEeGugL%xejO;ApgUV%d*nH%QAbG?!;wz-ES$L*e;-@@_PXti2S}WVc zvcEKCzRGi{4oor{;fW}2S0?H#xB-SUbn}qYo4B&h zI3&7HxB$arKO-ml9rdj)YVfIUFZHXmM5pWAv&sb%q{hz~1)kd^I5s_Fv@a*&id0d_ zGNh<1^_3=gJ_g+$qT=Z3-^xu{PEvaa-Hz(>y_84mP~Y>#ROOcXmJ$GY3mO0@UjCK@ z13)ngR5eozHsmY+b``VXz-Hs#@+C%tvS!*yW^dvv5{jFkU;*zj9v36M^ui zaw>rHTeWK#y_^;Ku#O}U`Jo|K{5E+=YC2*2Xys?TTYQLG_eWpl>ZJVJ0m(nC4b)Ie zHc;YFSImy91bfS5@-2A!!R45={4%{9RL1_ZhhZ+l&r+PxBb@`^AE)=`1w6^2&W_6_ zzVt38`N$`KHh)MwG?o|Ch7~Carz@a)4aDsj?D~DW39$6(&wGx4JZ^ve%nNeu(2mM4 zz#zE}uABH{zpFr@&246wH8+u1#;9ob3q{C6A;^{I)?OslT1EK0+CJnNFn7l%loI$PJG0WpdWXBL>{(}-1h%0@x;Tulk@W_9O1@2 z^xxf-(qfYm!7TG(1~|HJC(oEI+zAzvbWlODv%H5%F_eDF$tjB*8s_BfF-a9wz_1a| z5h6s1@E!MNHluSC^tqz1^bx zLe{LsmQ)dz^ZWCMk}J?<)0R`Hst81%1;!Aj)rgdG*jpj-MoRkK?-aW?z$Guvu@-Ds zioCa^T#=&>0=R$yfJt`KH4+Z(#rjDXa3R#qMl(X%KP0kTHq)x`2NNBw0(o5 z9nY3Wa4vAAYa^2STT58J%9*7s_mPs6!qQpKzNCR3SE#BCyX26AMkc(3e$AlPf$7gz zWtiqvALukWR41`K562Xe?wiFpwlGY&)Ec^cKKZ5!v3htNm$A-M;Eh?VU@afRAbnf% zrh>bXikZP`NT;KsS+Sl)>@*qA6QZow06NFNveqltp7^*PTp)5peL>xEcYyB}KvT=u z#LLSkI^y?8^FL&V@>o?;*j;m~TfpQ+Y5u}t^NQo9%Qg>W7OIYb;c|NL4l<+=Gddo( z++v|8EhiE3WR>ROh5T<9*xQZZR7iwJ-`&0-OGhNxnI>Oh9kk15%{ONo*>lf(_M1cJ zHhKi{`IZ^U8;dc{rasK4mUQ1UWS*GV+V255Y%k!?qW$B_!B1v998#H4Oe0z&-ELO&B3rfRM#G%`8$z&-q*9-!Fk7*&y zEonzr9WyV5<7ii?L?qNKe{=LvR33e6*G=%+AS0&q9FZLSCtx zSq3L#H=VKVqNlKZyDnX&4lm8l3Y(i{|5%CCO(U|)tG*YX&3@y4&;OCM-^=2G^SRZ| zl09_r2pOcbrS@|$=ZCXkj%!4&54Z7?;4}p1Te5?pg`xfCh=@loYL;v|rB#}Df2CqK z8U6omlVN*tvW3oVam@A6RIsmtUPd|WUS;=-UQ|##cHawXt_myIRU0#b?TfD`uAzeO z>B(|lWF2Q3%v>Vwbo$mkT_y|O(4p;{HQ3Z^4dFqIh{57mhz@8Li~G>MEq6L0z%=q^LA0F4z#q2?fcnQEjR3&w%5>oP#nxg9 zD1T2w{$jJhAvTc(F6jkCFpuxIVEb7x2v&Pm`C{!M55Vq{i)(sa#A`^W5Q zuK_`JTw1=DqSZkmgUms2XQQ7dNw8fM84^=6QIjW{i0`yrA8%2Ij$?Kz(^=rUA<*hc za0J?pc4JTo5Hoa&YOfk2A(CetPj1M&d7-?(?@uY32S7%WBJQuR!ZFj)S$}WT(!h{e zS#mgSyGb2}XLYLIpLIwOVEOSJ)K5@1B7KtTSomLdkc-ZLlit>WC zp8`Emz@x@u^+EqkuKpuv{>PV;#J~)0;l8kq!p5B^|0C}Bb?G=i&W+h`ui)`V((sKiaK|V9!{;E!OWvq)wY=HxCh^&q1#B|vv=U0aNs++5E+LVJ?`-6FE 
zWAS9EyFB3mk6_6_^GZj>kd_}eFyp>*xYIG=Bjb9~hD-Z+97IR7aZPNKJz)<_5=$G7hSi1r-=R{J^a+|t(Zp~X7RqXq7BYhMnjVy z;f1()uifB|qqw{DM64i#URV|Jj`*#VHGcSNZpiDREz*Ja7P`%#%pdfSh=bTa%a&H= ztHZRjKRlE0J7|bkplF$pl_OFirrmwF=2;5hunf8}wuEvucFB324JnNJ%mt&0ZT;K^ z-akJKMwW&9tWp*gq+YXx-D-q*T;>zKImTTG4foCtAOLPo(q#Yw@QD}Lbd}kRY_tD_ zR{vooIDy2(PvJt0pOU&2ajc=C7ys)l;NZzCypAo%XI{_>v)3>)Y#4Y}!BX0U>8!V# z|HYG>m>nVS+*map%d*h5Xn<&r3@wk((Ya0;NpiNk1xBMJ^a|J7_Xl5$*`>!IuE52h zmraXZ%Z`Z~Vd{g-H_BO?`Y2>uzFm$Ha*WHUrds^c2HScxk~(VoUgu^Y+m<;kca1N{ zflBkBGwrTZZJQj=DgpdA|e6zG6WG_SAP#852IMV>xN~s^|w>2xb{1hUvDb-cCxm z-DM{&IuEiiyDpcR&|d9DLxnG~HE(}?DSYqo$t=ftj$!S^=J5k@lK;EYuBh>k)BfbL zlj#5OY5s#BQ|QHAw$qxbs(Q%-98{Ay{OzEV$j`1`uM^}c;c>Fz?#Vhi#8Jv%sNr(9 zX6)O1g6D`)WR;GD_>!{J;arl>9vNg2Qg0B_* zF5@HLC^EegNV!EyN_Sz*Qx?NlMQ&Q7>gf=cnZZD&O(wGvmG;-JFB^W)+AL;xa}$OK z#G5gv=f7;UTw1HY$B_*S^X4$mtUv+=Bwhu0yAQKcC3JsbmVbIjlPLVVD5|B8$**!3 z2vvLKvz7mAg99t+pX`$v6T8J?hJu<6TnaNKSdJx}w+$H<)vPoXM`G_NqIURAc~0*W z7j8ejoBleRcpp?mgrA&HAe;oR`l#QY%FkihY_=07-hK5t(f%wc%r0bSOsC3#x!{Ga z1$9!L&aDfM^x;0iR|~&Pe~^`1Wzc|g-S80zKSUaB~qjt=#fe>6x+U3g4I?4>Px4NQaPMoTYU>J2>&hX?0Fs~8E zIh7eF2w;;@j3$S(y^E#NU~lJj$o*X6wL8YOZQ^+HeEDCrXPag=hWohVu1NUNfCNv9 z5kQd?(XUu2A`5T@v(vb`z#j;ue<@b}6o=asvG0g)rK}+hcDU!ScNjQdiikDixx3Yb;Pf%QcE(=-{?`26{h}N7B`{G zkntIQf#-JUT0OUKspP@xDHq+7s}AwcjQOo|e~sNTruMgV5ixvqVlCptbDN+LIIJ%wu}Qce-2;GpMUhnq?K z*LGik@{))iONuBA_2m`F>6s`Uxgj!?6cS-u1P|o5Jd`pbPpLsJto0fqe3*zt87Z_1 zj9cCUT85%5-J>!+`fMyHu{?!Kf-SE`M9{&ThbmPL#x=gIL-fOYtvC{Fi5{kRHZOWr zlFe>Y$Eis%Gp2HhOuK9O3J@kB!sUP7UnnFyi10D@$pEK${9q{JyxB9_<+_t|z3}B7 z@VX^fK&d_Io<;c=zx$s8Z=e$+h(>Ad=KSN&1VodMql^94ecO;;s0 zS6HWtQVoO$2k%`hhOiR&!)6})E{5>!!j1>Ose^2nJVMs_H6 z-^@$3D{0`7vc+)P#p0ZI+=~Y4y>aN6H^$?#H7U&ELAT3(#*dIPgNc_iv`#AoGOR37IbA{)g8CUj~)|NyEw4$r{GB+;lq_CI>Dt z>cPhJ`I_!PKLz_}vd?Yr3;D|dZ0iH39L*CC_Nx~?#FvK9=hL3k@t8r6U}GD4;HF(r zIXd~%?2h_O8Md-Di(a21eEh=foD|;WewPRxqC3_y z4+IX=T<&#J!8q}b4QCpnT$Y#JM?S(+2MJeF=#vw+od4A?3ukAUWjkHrx!U)zk2g0M zv!yhvEv#BqCOBUVC(rBWfM@raEUqYh&G<(S<|cdtWfYUpi+=u<{?VW%c|{&`DcQlK 
zu+v~sIyW)5WN)xkDPDcb-XnCTpGJmr;0KDC0gt#WGaUyv6v5-|4R%kBFmE?Hhl`IM zAI(M5_Dt4QRNy!{-%aBq1k8d3Rx3Pt{BrHT&G}E^8cfYq#6gWf7tCxpa7CYMCw+qa z?jhK*<4T@`t|1)w4CTZ!X&QFQY<^{1$08RNJ}}QWY>FdSYw;O+NZBV9AoVri+PcYj znc>-oGH5Z_Axfse>e$KZkzT>klRa4y2GXSLJ`!u& zTEo0=k~s@@Z5K`*@B>lharPR*izgd8>8+LMS?QtS&l*G94H5dmryJ}Kb3cd54Gzc6 zTX`DK91Iw|E?71MvMn zpe?9h{zJ&(i-J05U4IT;+yU)A&^U!AHV6WZi3mQ%!&M@N%s3-mW z+u=SusZ5%7v`3d;9^=VNt4LC~b(X;qT+@||c(D~)3=_83R6ZWrVXDapa+`}6-}Js{ z@Xz&`Aeu9av!HcIadRj8jxwuq8IpTVv*?w@;(w8@gI_}JkFe;+HHXC-noEDrs77EJ z)tGJxW^Dd0e77y5Eu1BzZeOSMXtaTsFjWRJ?v&;x18D&ruMdGV{f>zg?BPEOnI$mX zYvXR`6uG;8hHLfdE@x3%8atf{3tqFiGGUJ!aO&<5b|`^sg@Y{Qqy)g4Dj1)PH9F2B z!;)UWHopfqJ1-mQ4JhTS;fWwfhgen-SG7B{PWlJF$E4~&e4Gst=Tr#q^l-v;wST~i zkJ@dq@@8APMNXRe$LxN_o&uZVVw$CqX z?D4)m-iIoOON=cr8n$^Z%={v+w@TO8fi=JU;H>i9ZIba}{=&Gj#L=Gjt_{6}u z3&P&!V4!}%|F`E(UJXF%Z^`EJKtm~+1_2>8DLGlJXvU!xsQMXYdP(pZ6jC1Ej&=b} zDcaTANPat(K(2MadqM%9fkIEymlksZi*&c)@5ZAvA7l#pVpG|f4#uGdu*3bYYS`f7 zk|@s)^tXN?-gCyQ*JZg9&=+o1nWTW&wZ`N|xT(AJVK^cl#8_uTJmmR!V1W^4s{H9w zQf#kp(IiN=D}V2^=7RX3L<*QM z5P)wkz7Lbuz7YtNovZGRr*z2GOEtObNGTvMUbmml{!QfiwCy*~m8p)LGt}xDI*7?6 zU~rgd4LBctRinM{&r+NZB!NbdpBgf~0*#ux8=#?o#yj(#=ujQG6;@opW6a~Bn@+y! zA-l$$vobj6SJondNK_cgM9yWmi6@lS45$CRJx=aPvq~Cr}848L+ye)?IFUA_tmmW4QqPK6hkS;@374>I$bmhmWbX`hIseYlwIQ*B1E!6!H3JUW=F$4{f+B0&5x1FIHxeGg z?0_hjGat#psAzMgr=Xz(m|8&7GszS5s}E?JYJSt%*-=CV13G{({VB$ahgin9oA0EDk5ZpRo5%C_hn3k1h0&^Ut*vt&$a`J67fBtg1U0R_tk>bp6 za%4KxDseYoI$f|iJYm<4WKx`(3yJ^YPeLhzE%BIT~NFGl&v%%Izy01-SI|Cda|q}TC53WIYER? 
z9xKH0=K~wW56HCsp3k~~<+CdANm}qg;n5%GzQ6qZ$OAP)m-EvDV+A%p%a^)FAH>dG zU;2HvC&Y(8n{8yvt?pWp!`ItzKlT7ue2IFVOzOgbb9sfY4DM%h6LtiQs*p(Dr0MY* z!%EbsxA}QbenD-(g`jBYHo*)k2wHyi+5FT`J#KAqFP&Ev58ZH*CqtLcKE}i$1PW~o zMY@ShGi=u$g#l4=NKlo=Gb3S@0JDs^92R>G?(bdx_jli6L5j>{FNN>TnyG|Bs@A5S zj|-8WN+J!SKHk?KhuNWCSwq_iF@7EzymVfObv6M-d+ zZOyr9!IFwn&_hX{7#=*AM0y`X7{-H4J%v!Q+Jm;+5{}u>MQSp#4FdxMi5q?1#nyxB z_>c)0xW9q$=K1{A5sdU00qAbpo%5}}5MCao`8nvRP?(qH2joVZ==9s?k4H5lWdJ<5 zEDv0$BWz_^Om9MoMSy8A{*s8$&uDnKN0AB1%e1{gW{o3itV9S}og*uG&yb8U>pRKH zX|17O;NF&ynnOy)hsHX!;MQs8Pw{B!a+wVIU6h07%kU@N;@)KaSwZh5Zd4heGD7xv z@nr3*3~&0{qqa%;>RLOSH+@&bcg8&YOo*%qM;LI_iLPA0Mt~4CoVn$YK}vx?rLTXL z(<-xqH6m&FX|5>@T31dHa#~J8>dKCBk*gG(ZJ|@7z zX(D$TYaoru$qNE4Ej&s;4OM`~AAt-o(VcA`M<#gM2BkEj#^~z~X>1k2<4=)Z^`@~3 zyUAS=g*mFBGUvM;GGoS-eq!o8n>U+zyzYEB`sl`Vs5?{s4~dgv|P6D%q7J-6&(2g(OR#f=sI`WIfY<3EoZ z7$*5D@{ra;kV+IA9~2@kU~TL*sE^WJzw~#$@Hfy0?l=4f`jlHeiLU|eF_0ewd#%6! z1jW;)4|H1}qvaP80%^jw;_xe6#Z%iFGy1j`r zS~xz-bbB0%)e317UFe?B!Dq0xWpBXB;YS~^v9AULuUQV-nBO1wI6pg80PQ^v0*$)m zZi3YuRl)}Iw_2cbuEWamdgX$Fw{!u`CoS@6Gb#DZ3Z_fuWSum&1}`dxiMQcS+>@E` z&m}Z=%!52k7a!qeRNZ8Ckxt22z0;faon)S~lgkzF|jAwqFHo>8)?9wMGBIUDNI7@~% zKqIZ$nq&}vAb*8LaFZ~r53=N*qhHm>BMH#o{c+QoYvqv~u$-{RW} z^D`R024122g%27GxDhgR<^mwzI$rsi2>Wwbi=?&=f32Ur=B<*p9_EA+%@1=*&$*p) z)hB{Cb}*G!Te)8KSq6Fl%p@P$WX&_;{K|B-gyHMTFu4xvtGw|&_Nc&KtJvOzbdsx; z%6CkH%L)AYLkeVy+PXxIyh$P{hTgV_o8<)eCY1FhdFGQ$Mg&ui*`~ivwPtqZaK?VVhd2Ce|@8ZAz!>YR%`@J!D;cNKPHCBxJZ^9Ix3?Hc?hO{>4dW#G< zISLWpKj$awRt=n-wGzH19t#xxirpQLkg8`(I?#~I^F6a*=p*>BuBK0ZAV_}jT@Fk2 zq~epMYA@Co)LQ8W7Dwxn(}oI$m)(ph`zkug$B8b?FrN;Rn09}J%0PIoC)82Qm3^su zH*duuh`f>|(_2lpoD=uaXnkWrqK~5W$$2DEi?8Hvj^koLY{>yHrSAGa?(^J+)gQH= zYfJDBQ2v(4ic1A9{RV;l2pa$5nHS8Y-wVMZyEg{#PdPcapphRQO}Fm!{jpl?^$`Iu z>aZnXFaAYD1F=ceDI3Ew!Gqh#RW%|S(I1^y;8QfM`%lv}iSmm94f5YnAy*+bknucOJpo#?|jzDB{ zA{BYd6#D*4##Coj0v3h~7BCzivPRVKm=J<1y(NUzhN)nb_<$4kqhZXHt!jmW>!&3b zp42LYL&JRB&60$f;n6ww=c);^X16PQ2EeMt%C#L`neJ6xNnC}?RhQC-E}{wNdy3>VvHWQ|59 
z47A+;wZ@C-p1_(ec3~TwIEXET4ZS$QyyF_>lK!TYO*4I`G*hNkImXQ8pK-G91sRmf+e{axk{N88rPLXhkV5M^=zY!L%0jDQu~_v zEgJz(up*ZSNpJl|$o?*l{15W#Kfa`d0bh9!H`25esHv;vgF}j6JzZ~jnICtis6(aU zsJ##}&bf-klRIq}i3~V{CKTBuQP3)&kp?%Y1saUg%fuv7Kq_BCj|)0>O-3&bJWlXs z!JPDQ6p(Vd;ep)I5%d<&5f~*$AL`3j;3)%oqT(4O&B=>#?f)?L=7CVRZTonMq7)HH z3L$%z$WBNY+4pViAv+=a8nUG9vWBsbecwf6-x)iV7>p&$m>A3N(tSU_@ArM*=egf+ zf0&eFKG$`f=W!nAc^yYV`Zam{I<-dz$IrL8;)M|zd0z~ArEa?IaxI_Mm#TjfOs;KA zx~*Jr`$pe`EZ$Z&O4zjrruMT5*WjJ^K&1!|{<*t!Rrm&f!l&Kp7vX(_lD9{CwWfLx zCi$SepH-{S(i_xO-N-u*PYps;OJB3G)y1!`O3qbMFkJCWulm6XMsbycBn9~2dB%-; zEMnooYm%ize{OZRy*i&U<=0|q|H@-h0iO*F!u0M#&zt97k~rp;H(|HQ{qQgR znKmdU1lO3{=sTA`A+((a8Cb7zk#?WMByho4fM2dM5KAs#MWA6}DbwCY!h<}z(1!WRYIo9gAt zVUA1pCJl08R}`*50HSudQ?i<^{Y{NEp$BJ1Iec}K@zG_I0j+QAuO|sjr)g?j^HGSZ zNBp{jQZ(a8o5n|&RK0#nCQ7}liD_PGVRmV-DQ8ur;Vo-f=k)b=$zXrA8|olX6o3s5 zjxGR4^I`afmTARtw``sNLwx?XZT0u30jlRCNRXK%1u)NWfAA|KIrqu#P5Ph0gNIBE zFCQ;AY&i(M-@~KtbPE1}3I6c1B7~CO+ykk+@M;V0ZwE8?oRIj@AA& z*vEIunS(`>)u`V+kp-j&P|F3X7ojY$#t!Z|X~IJ-Qv6zM0~h-*(cz_NiPK>q%Xt#A zBzOg`Te;E(s?PvW@Mi!~{lf|F!E+avB(03k+j8^tud`2k~O9O+V(-h~|>-#e`h`uJZB& zVNh~j99)IXVRquS&dH<7I&8&2aJXU-HV4+kDN}Rn*~1y#Fbgu#0q?c?a*E-qB_@ZN ztNyTH1e>H?zswVcS}DaEgGhP*X#@(}{X|G*&w#ZjaWp8WuAQ2G=s*;<$&!}G>`Pu? 
z*Y}&$EK+6-21BusVG^9wx%ol0SjcFM>DVkRrjhwyOVsny5)JN(jR8O+bz*THvzi6mb*lOa22p4269Ykxn8*6K> zKi)By0218quq*h31br}#wNUqF+^7JqrFEdfryv=C4>GA^!<|ci^gg^t`BwV-BPZ=^ z!3>fuUGTFkoiV4>)uFD391E+sQ=@~1T%I))&ABR8F<9uEK%^`U_zK&CPhPSa&bEX5 zWZMUuvsjW>@W}GX=cf6*3wwenBgkSzCmblwXp>6$Dib<|kGj#)o2QWe(7$r;^)SwD zV`}GDCi2O;69V(XeyZ$y<_Qkl8b`V-8~lM;*V$SXQo8GNHq+f&wIQwb#Y6CDH9J}d zTr#N-{;-%8m!7`hCml6%cpM6|M05$EM)nFzU#BD|Coj!z9?TNab8REy)Q+U3x@ST1 zcW&K#?gAvc?m^bn`S(uGMONmKK|{>Z0G-_q{vcZz4=e<-fixE^{`bRZvh#}?azv@( z*suM4eFZ=9QeHrSIA(Q^_~LB^>6Wj3n|Qg>||em>76sY|Ekvo!S5g1 z_o;IKZv|9a{yAW!uNNXT@|rvN|T^= zglgF~(h}hRb6ZncQE_ve5Z~Q-FvB2~`=;6H71%x#wR~|^4OrR5sd^Wb--JcE!R8$^ zRbD+t;}c5ypM+@|pG%_Dj!uKUj7C*s>Z+t4qb!9`t+8^c+u+-NjTsWi&CP{2#r)25 zjY21zN6JV|;aUGlvj)3_S~8*h-KhMz#T24K}9WK!?{rKnm$ zA8arueKrfaA__+o*)K*X&UfL|y!+xkqIYis{Lc#+O{s2OHlT%;G4rbWl&J3S{1NFs zU#CgPQ*^xvf}b$wKEga`foC9G_PihMJ$}>GIhFieyB7nm7If)eaWv+x&jmg=`lq;}DPTa!U$`rNCvR}?oO{|+%_5Id|n+-Z!6t$Y9`6e}n5Z-MH_ z1B3v5=xGv|i~o}v|Kmqv3ea4t4Hs~K8+TdN;jQhhglU042(AVoI1?gsph6V=5SU&* zisNy6NdHr7_JCqj93kXqF;JhOKv=K(_@#y{vmN&jS0JV^02D_zO9vGoK}BOu<96gX zOY-o6wE!)?(WC(aUdIPLPeDAuzvd)RmcD!e;Nk` zOVB0RanFa?XZfV(e~AE3j4P6k%4zjuHDFtwI|lo$0&DvnFU{tsY5wAohtclsP10JsoD>eDzVxQ5~$Qr!_?;B%yYY#xUy*LnMHuDdjE(FpEV@jDra2YR<@n2NZmXV zB?pBmLwzv{T(^$`%2&h0q%aq;r-VYeUG3+8Q~v5}6Jq0q5)Y!~8KID+{{<}I03rbg zAU&3}^yWSiNWs|hQP+}6Dbe6ZJkNG|on|gFw0EB&xUMMC^5fRwb3!4&(JVDo|1D@K zi~azO8_fXnn%?s}!@j)dL5hztt0l_ibfG8HpAD?DEm3OyA!7PNIJ_c{@tRHjwwmlz zc1~(l-dYBc^H++ly{;by-(iIEF~79Qy!*dEUp+`z?`_I@_B(=&^gBYNq(0x~3v&G? 
z=D|WUp_R31RzvjHotPoEuNBl;@pds#)$VnUcDnBAs}9!hMi0jstD<1tTKS7L3&N&9 zGs!PkE{(q}g@0p~Q2B{cU|Cu1snvy9&WIb0l!$}U0F(UrPy2FT3dnbGjSpY>-3{A1_6;Z}cGu*7 zl1pvC;?h?I2V;TP)WcxWQ9fB9^0W<-<59o>HI-Q@eekmf=|>Ifgie=W<9bKxS4gG2 z#ew|zY_VDWEH~o8VSY_jY%va(x~GmS0NPSoQr+Cb@kQ$L6R6AGiL?RAu2Xl#6|nuYgeCvid$$BK4j0P>+f8xvS1@{Im8?2KWK_mN-STuuX6U@#dot z017Sy;9V@Db(AFj-`DnO=HHv5%lI79aVSSVSpYSj(H&!mR-EU7QSgV15R>0E;J7G1 znea--UZf8`xV@kKIu`)wD)6T+uEpnIri1%9i}Xi%gGVqaXl3ibjJ3KiJz+9XW}W{S z79{&hSVtk^N$mB;ftX_VlbB*`qgel#ds<|9J(J05npnRNGd*L>uXNp!KVraMR9;|Y zy)SuRbX%0ZGqLe$*zYsglHek+fd6gH=u=fO_;v3A!ax?MPlR11*F)%O2FR5)s0GA^%xnu}cEz zX1@JDJHc!(N}u3nWg(S0<=&5vk9oM#Sk~w~Bt8Yhk6oJcb1tuaEM#n!`Uu=fXb6*- z^5aDxGg-q8Naz>qSEPy-%_zON<{^XyF$Z?FW_M<$ubO>w8+}#O$)YgACZxMY6l#@Wt+n~sJsRj5Y)Xao zHCD5G`?iw);5_HjEbqVh79fY9k1Q7;mns>JG~xU)p5&LW2y}jc6{nK{pFCBFU6<1h z4kB$5pxgEJdG<|t*$ma&bis8(gn_92O75Y(1?J%2ZLb+L2hib;l z%K8PTR6l-3&V_Cvzn0~xz6IMCgXTdzl<(ZTlKh|Xrf=gu;%oH=zdHhD|1}4dW7vHL(hzW6($j=3BL&YUG|*UKIRn#}-4S zJ-+85RZC(K)K)<%XtsA-BnvA4p^|i3c#%Qmo!eVlwTcX_AFWhhn{sAYdH@%w#uDbY zqwmbCyR}sC&cYgPZeSty_4|^CrX<_8Lt1_Q28P#8;+sypdu+F)&Y z#?Qx^IYnuNiD?%+43~pGA2vB^^*%3f_ER)9c`K+Rikh~v#>GEibNeomjyGnh1#Wu1rXqVt4M zT$_29fx4}Cb5z_q}h@1kbM`hrEW@{8jPKC zCFFFRS%}=2QKY>1mkLpqmLc;+_lmI3dLY6-7FZO#OIFU$U+lct%TeY^z9+2Fw_>zg zbFg?XtYwn@3)F{)E1hm$-6LI1Ds%KjEUDmGnhn&RX*;YraXo?>y=0N{?d7aeU@LE@ z*kK!EqlLc1K(HiJJ7+7cYp0mZEdw4?^Kyjv?;8yZug}gayTy>I8lE>$$?>~Eifv>7 z{5?FsA^C3V*jCL}n z-#n$dvj$QX3j7xz1dFPVAt5nlktYoFIYs6VgMSCn5sGK&lbdZklj@G<&zK(+@(WSC z$Nm}U26w4J+^vvDs)HT^JtY;-5L1bS^sR;?`5?cm>%$Y2Zf%?w+&duH1lki1dN5oi zG9XIr`u+mQcgYfZ#JmSXMb|;dkH4;f(0R?3g`;n#=}Oks(HY*-9CtJrl#XYVB|J16 zJUpxU4w{$BZZxRSSFvmvQ5c2BM6@4TYN`QC7$j_oM-fRfy4`+lkm4eA>p zIYK!QYkWZd^d7ieZ9SvkGg7mcIluk@{6`s^f`d)Dh1)KJ%7%s$S=3-nC2vmEtzG_n z8GAriBj2~SpBOXtbKcr$IB!^GQ!o4dx}zp&gDC zH@~KGUzRnR+S_iOUtIgvo#8^mCiA1;v)|c?*Hh#(mcTqPIbFl~To;%J_Fk&QfBW*k zuk8O^ca=o2D;he7eTTuCU;o5{(6Y7*F7x)k8F=Sue7{Xn*;fVNQxO z{-W|jcnR<{27xrg}Gf?;u%4F?uHwcF6V_g@{emy`(2sQ09dHHgM2r!n9TvOc|H@6m=8#UYM 
z$&yZW_YPY?xsrV@4OtW(BoF*9v7!YuzAH;mIeB38KUufLV^RT2OrX=V=}XsmSP(6R+;Z1t_aEsBvy zI9+AHc_4yo0(BCL>3q9bHem2^{ML#x0l%(7$atk9Z{HK%(}?PCdEv=5c%OKUL<@NC z@Zd7*jA~4iz5SQjP1!5tQI&~6r|E}HA6!dh-uI!G9#&NT%u=NkRj^N|dT`I$)3rsL zxRLW&vk5uk_xb9Ql2fhvd0)`@->Beon3;M$ix#qc4iW0zFYIv%fF?CyXwAl5!E}XO z(S)a?3R?dYDhuXu3Kal`R?1e@da$3YV{;U&4$la8Hh_q0r`S^8UK6mwist7B>WrOO zpIE{qSF=b%8A1;ir&Msmp2l@Z3_B;mV&5nB9t;AJNj;OdSufgVN!@<9T+*;D;syVOjsyy%GvFI1GCX7a> zdNC)jDf`*D13%~dI9{S5H_gW{uxk`jHBqH;o^aA1G6|O2R&bJrPs8@*_>e$Pd5Zbw z2w1!Q&8Kae#wXt<(4*bbQ3vn~8tz{k53JyH1_z0T$wwup@PnR7^|yO&iBlBEeOGKh zDJnkdPNdE}Y!a!TJoI!~_(5m-(hyGU_Dk{ zcaY511@65HRO@5z_tJ4{gr~ z2-g+kGJ4*U;$!MAM?6x|_{s(>4htvIM)wk;w{dyZ@;K_Z&BOR3ChnNYuS?{!($zlX(>HA3L+Cvpp&A^RDcpm_IdqqfQSBA5gx}v-|4_B(bLIt$ zn_R|qI(bKXHv8sSalpo}fP+vY3Jf08f`jgs0`D%=I61QtWBiR+EQBn0>yQ^=01P4> z_HC1FfN_IOBo9~3zE?c*XtZ!?#FqyjNT2FJ4xRe!(Rwz1Jh!Hp@CY?1eg_0;mpTXp zdKTpCF-ON_uDJ_eZ4f=`kI`=k(SPrP3E*|DRV6& z1w;my{5N^7;^iZ+1|@rlT*)}m>DFx7VEZ9o04)UXq8f;#>Hyuw0(NVKhCj`gE^CCUL)6$;Rh1>4W>dxhUfgV2w>8*9voi z`6wtOT1C1k6~z6tzi>B=xZ1+T(ZQr#{j~y!Cc5yf$;S7Od)&xdV%1nNi&9}Cdc?p^ zn>f&TfNG3F%ay1>vBO6NaspQdf&FaCNAKXdr2ycdFo4n7!n(4pzNd0v49)Xv?EBF(Bm1ve0DFJy=j`hgnxbf1$HZx-)$z^IBgmuu z6}>y9E-jjPZUtT=LNf>A?D!~ygIO0^R-?(@Hzk|A+v9;MpKaQPgu*6$@7_-9s@on5 z^R~+f40+|L)G8ENuG#|vq^fr$s!fBCj{6L()yi4qH2r@%*Wkdvkn_)62>zB_!_X#R zn&R{;&u~(5Rhxn zn?6WEYz<1@qKTWBMANWOw6=8Ol>~_3S2zZ_h&z5oAI*D#1URtQbXJa8EEv~b!|{QJ z3qR{}DXNPXp>abS)0lA)r?{`QHCwA}XJ_}eMvThr{yiC0vK3cPU|>9Nh97nWCcV(Uh9~crZcm~y ze(MuutMzJZM65GxR;TsIBaf5!3ay8f0LjR!^)lvUcx-BjoXjazkKu-}ThsUO2#Fs! 
ze%p0t23C+z1E&%(&&P6OhTqul?mLy{Oj6fUa1=u2xoS(Ydd<>~mBq^@HynsI2CP)L z&?EP}U&GB&GbQHYlm2=zx5Go;S{~NTRh^v@SrzVkRGfwxW*^{{8wW=;f#JyluN!^7 z(#@&_eoPoP9>H;l8RfRE3!Oc!)|P;eiKDWoPXYFN>=1*R~-etNsI_ykpr^qbmF9~8%J zZ0qUi9g!%U0h>)rtvlMR@pFqm?^{9eUGRu^|pf4)~V?`rT5TtfQU|ePb0MJsvVk>rnU~?sf6Cq zoVHDx_kFkK^K7B6oGQ4p$+j0)1;0iQEv`PDeI4>Sqa4^Me(lY1wm5a&RPyJUX66Eq zE0C_Plgg~9>4*o8!AggKCv=Obn6}B&5V5P0mj%tONV3JB<}Ee&T7uxR3&?%$2njFw z24z>7NGs~S4F7adSm6xmsn;Q}t2?0mL~z2uosB0>-%|D4Fk$)5+N9{-)6%{p7uN?t`x^d)TgI^rpdWWYstVA6j2 zIyklZnW~yYVHoo{-yn}hAwjRQLUxC7iej71Lb{}?yM(seEt=m;ldn`=IiNOD#`F6c zf8lXJ^UH6J2)1?19rM1!yx^HDBgdB7q?L=|j)!}$RF-OzH7|ZuEq;T>U$7AVfch45 zc!5RFDE`|5X<%SSi?YncZ{A^*Wrpo}j1&7IC$n8Dh^3+M&@bI|;G8~Hz8L5!_Ri|x z!l{4ZtbOLEDy0$vp4XMZ!cnC&F%0B_H@7SzQF(T?Z=JSYopmj{VL5oR^KRxPMWo z@mew`!*BPAPH|ApY#hrrf& z>qO%>I(piyhi0aXdd!KHWOc0svaH9#)kx?>;cH1!8g7I)hlCbQ7q)Z~T~ioj`XSs? ztyjR-!J+jt3;wESNR3;y-lin*yqP%whS*-?XxCZGbvpG?_Z#@(co-4=yQ|n}p-hu& zAfwWK(3l{f{h<X*U`U(68S!VN@o~P zN})K>Ek}7=c_JXt=u6d?l-TteEN|N!A?bKZO13$}Kg48iI}nj`H=j{Dh~Z->xBI7^ zv;dm8t-xwz=H!uuJZkRP1>*}`IdKveceWMv(3YCXc{xW~y!{vNbr$G*rN$tB@pYe+S=;U{aF)QWqN=^o_1)+B4eo z;Jb~Bj#2YvS(Jqw`h(yBvntLiGnT|*@0u-CH8-Cu6rzhN62T}&hU@On9>&fo3@<`d zDExuVaAhw|RT@`7e0=;^Vq#*eZdsA#EBKh1(#Ge(aJE+o98TMm*gI8C^q z?*545E(iMR*0S(xElpBx^ew{Fnap&v;X)rCVY(X>zjzZ_Z>ylGOZVQ+QM@3y0$-|b zS*6BD7Z&Si=?>;7FwIQHFUU^&vR@&+=biU3f9lOr4M`Rz1nrbQAmCLw%ctd7yelcE zQW}obnI#*)%PtU0@85=Z?bnr^Yu>+Y@BLF!WCuyHswHwk-#-)Tz@XP(3*O~Wf?5kt ztAbg-@-x+a?T(q|ZJct9TFWdil&rrYg4DFnwImI@xh!HwqBk>Hjr5%s1Oew+1&Ye7e^Gy{3ltk-d5g{Q8tvqIXeRn+1|}O8uyJ{PoZ$Oy)zZmm z4Jw-rMdm-N8P2m}SQDkdN_#it`xOHfFq$B5)*mlp*B4j@5N(_q5G!vQt_aNo3*Zda zk+pE}cGu1c8gqmSM}Q~+VO>_%OK)X(ztp#mIZ7-lHq)_-I53N>7Y@(LyJ2%5kxUYh zbyg$i#tbvKW7+*${Mj@CUIc5I@2_>&n-S6j7Z&U*tOULxHD^jLP8h0|PD>nmKFo-_ z2HhMqy_VTCAO0pe*pS_laZIS);6oXYLx~@oaa_;cX*PT%JC#x&F80fC>bvPrhYKMp zjx#~FVwYY??{Csnn!!4uKtgQ0vBW25QRKa(3M$jWhT$!`@3LKEP^(P)3moc0UtUeq zcJ)tkgG|KSOIKG{vG+@iEDM-j(e1XCl@$aKV*D$jUj23V``mrbyVC!$2n8a$*Ff0a 
zC*qcn6x(Xcy7s%}>^1AM{w_G2x50`YnHpe7Vbs#^BzP3c!+l|H?zq%s)8js3$LCB{t(Qc1t4lwN<*3{jt5Y;FxFb45Ex3-N ziBmJWEj2e%F(@mmy8qpGQs}*JY)h<#C4KPk$8d}?);WLTl_hA-I@CP2!Wu%Mr-RwD z(WkX`#eaa00;neAjk4o*u>gI;pf@&A07Yw0y_4QJUw1mJIyZe<4veRQY6(lcN*D4Y>JeOkkHGLj&vZQZWnJqq3<+- zV=NyN#DoZ?_;4UUxTr#!O;9aji(e)lt4JIznO?>xO=1ejDNN^iIj<#D#8Ng%__OCF zHy7^JX@kgQ>~%UOisObr*Mao3)9Fs5C*Q_W1>*O632?lvRRnvxC$PES`!?wwKpr{y zE|>1@(YjL!&C>7T8MiNIJ1@X47HZ741Q{|>Y^tHE8*eIPmRWw|G%6^klhYD@|CK2l zy^cir8V1=Wcdl;vZ1199P1^JLJD@gAcQ%QQZrv(g)u+0(F;=_iPByWB6o1!40&o({ zFYy2gk;(8>CA#k);Zx*!=^AEy2dV3ynf{pKI!Yrm|5HNTE)%eL-y*F|!`~>mEw#}X z%?k7Op$g%0OfJg*qF=OK`U>7C^<>QnyE*Hg2ZwXoP&wGxDBP3q6?N9_%Bo^LdRQ%4 zWVN~2dR83Eth`|BVgRZgF}DpJov&|D_s`HunsvBk9F>{xlkhA?Zrnxix)aoXuoU6sfuc%U}> zFH3kCAc6%fk@u!tN^&M?SZn*WzB#V5t;Z8&BS@=PD&DP}Fo$vzs*$`j(+1-aS-HmPu;^(gg}deW!HBY;=rJi2at_D?X1 z7eHx-Yzt#q@cLebFGc?%Fs@Bd^PQ_`@2f`2Uof@eTEu2>!o0KX+Wro_af&q!4M+C? zg4H10QPDr2Dr%~)KP#<_nAijRt=rmn9ENKwS~D2U zNT?sW>C)Piv2S{(AGZyndSq>tqj=7k!?S7|eve)T>Cx(%=y63636Tj>gkp0Bv%VXT z(z{1Zfw6^Y>R-&XNrz>FS1bHR=sqmq!$diFFI}d8#4KXe!|+q`#kAt%Rr0q=yG`I_{h#rae=&@!6qMffSU^vOT)?venac{rhG3&pS&2og|8|v-tT~km{@OuP1mUv_fK-Y8C z{oSk@b!S>p*m$B{yhN<)U9-V=SuBUdXitC{<`GGZVaKbyB#FIg@Lx*`oJ(^LVaF|5 zy}_FYlBC67osJXLdNEnmR&aJd2@Ta{2z{qrIx4Li>|49UMsBN=pIe+4cTI*bUc_I4W*TZb2nbFZdJKjSQ}o+ZI}@)k1&G)@ z;-r|eSt;4QShXUWcDni1^%MV@1vc?8(^$j-d32ZWgMaF z#MK<#|CWPjOLx}7URL`<6}#j*^_)uy&6I;WdFs@wcfnSuPOA zYwyY`2YN24yu5K9$5~YlNpk)FA*q91>IjfIYUNqc2+=J93=kSGtFN!;X-K|vC!qWq z<3zkK|NEt`O^w#08j!eo5OWK9UoUYM330%VKkz&(@I2W3nURiS+HAeowR4~p4?p{$ z1_dZ9O4ALP+MIL$EUU4>DyNQsM%)CQiIdtP@0G{Q@oMe~YT}IVmoj41#9>zFN9+$z z?3q{%V1+#E^4r525tH^mU4}lY%7qnh5KF)UNc2*mSlP@aj|Q-r|?J zA>v~{y|CchwX@W9mY1(cia&YRY|C0GG~;T=FRmi0E0Vtyz#c(?#34>0GZIu3$^WiK z-H|!gUyrQ2%7>DXvB%>q32>TVcW$I%PZ?lwT(_h|GXp1a)yU|K16*d@5dkm{o<-J^ zLypyGx`seCLf*3nu2;-kgLih(iAF*P9DtPVsXl2){V3YS1#qd`2L}g=OlD$xk&g%8 zbbap#eXPlANW9)&t!up7(O< zcVb-W2R=hLQ!_}!E+o*%7V#SG(Q%^U<+*Fpv3D=nxDOF53s>q5_VHgx(De|{pKjqI 
zK-Uap?66RCxepcN9QR$Xb88i5PMNjtIcKCUJ|(zbkSmb?ts6%^epiTx{JgPs4s#q)4RLQj{B>so5FH`aoLLWow_AHG&7*3krr%|3WXF15YeZb zSRZ@CF&V znut%&X07XD0r=~ra@i!r`b8i;x2qOI_26!|hxSe@Lp$uG!t^89wlS+-iD^Scr5I-| z$jN)sABlZ4Op+R`7L9n{PG}a#ibQXt)Vw2u2bi~Ibgqngk83sOZ$vNMxF=N_Ljj_A^4m8aGEl4K|fmI90y&x%OkM|GV3_9)y=U^ew zLdO!k4r=hFI$eZ~x2J8tYVyXPtuwGW31?eEzN)?g<{#GBd>}ImZa+HToe0doRww)9 zc?r;f4AcG+mTRBAZo@OIuHm7{|vkY7nN+-Kb?+Dl_OEnUTJt z;?kcfYxZM$8M+q5@>OXGA_Z}tW~HxeEkLKU2pFB@@vIEgxaFlf_b%MEucI#*6qvxa zV^7#&aS}q|jhWTrGKA$5HlQ2dGE2307DDq%Dc1)?fltuA+hW>~lQhyFL^UWxQU+jM zOONi&Rxg6AV+gnvFW;3eC+{=Q=2M|br=lve>+^axHa6`C2X17YwJp8A68Wae`uLI6_WS!<#dGTCMk0IF6px^TP!@0ku0OXywBZHmjY>{9LG-J5MJmmT8IuZ=DDOXM z)I%o{*X0&$gR86AIdQbqT~d94cL2KTr!u?dp>@NrMoO`Z)2q-1e&V#@`avjnRq5db zr5~6n$tuM0AZ$~8kJglr4{BR|0Of3X_2_m;XYQJKF&w@*zp!v>N&gI9C|!D>XM@_7 zz8d%qEH>k~cNcU?ShRJ3YOh1}me15@JCj&F_(2e04>B{`?VM;zk!VbgDN4?$WxG`Y zau^pQFP>BF`@(%X?K@`#zd2{T+BT(>XwO62$ir-waHycD&vU5@0WuVCep_q{o4;sU zC~h?&!LKLS%dx^g$)?u0&69qN!@ZYef>kXDsH(%| zAT2og6s9tG#`E&A@RgC%#5t$B%NKBP!pSC?1F@>DfnoNo>kQp1>oL348gZ{n<8(#@ z$Rqg*tRaD3hFELCj5=@#-!J9M0v7F+2|^aEiW@=dnLjb#KZVf?0j~gkR_HO}`5U~3 zIP`{%v8ohJY@s|BT5ou{25E5U>FYO6g5;zR=*Lwv!)-uwd42~p*Ox*cKz)Vrtju^z znDujk#*~+4S4#B(P)m|P|Llnsh&Gwu9?_Gix4I87KMTxT`LsT6HxzUtEhjJlWR)V4 za4_5*GjURx5woKA{96ZOZp;dSK{9-_Gcz-u>*v|o(!yeSe|dS?VTKyQ>c>Ld;Xa+N zIO}8OBkof*RJO29WoDnKwj$kMYX!$@)f>TG4eH5leXZhs>#>q*YrhCgs1~NSsn&cU z1%9MbqFKhp*jMn3bv1s+57ouR#lQRd`s9;#Ir;>*1rVZK;ScoRxPP#I?4#T9D0CYq z!2pWUn+|yEN0Rd>mqa5og4bOTla~J!>(rztS zjP8~TUV@IPWMqDbdn;xNo`gVnY9`fM3oK< zq`X>oq8yGL0Jeg#a&7P+IYGfJfH-!#B_sbddvP*GjZGK(SpCvCXk3*+OZ{Mrl9mM- ze^Gn96jHA;T%22a*oBtJ)p|M2`n7Nwh@_u3vxuZvwcgk@%brsGlbT@CuA17~v)}AS zhrNCYb&RNAm+3oQ)a_0oHYc$s8to7~MSXpJzXvb_hbXKr>$1yv>-3IbNgq>ut;Mr<*mI|}=*571jtpO+rf;Sq8aw{`e!I*{C8X@0VV9eSn9THjkTMv$(pYzS8u3Zv?{MJW}`G zbi5)H3a*c5>)bVa08rW9L$GHSl;TeQ{p``7W?9?#dB5dTw$9XB(sBK%Y|ABe7BGbK ziRWc2si`8Qo>|wFrt8c)@{o(Z^CKAOF;D6PY+rLD&x+R%!S`7#!l^F=`giX1DbfjD zq3>L;ZNb2Y7QOcdkPusY27WO3cp%TZ&Bt)DxgVLkbKVVm3K)Y0PlqE!QVOu47Jt9Q 
z1!9Up>E!J9!9i%uuTk(z6FzKChNt+YwRBX^Xa>!Zw<8Yv+hYj{31G0L{wd)OpbRSY z=_hh{mpKthW}sB0eW6Xvo-gbUM;>KCFIx=7ve|<_U1SK>`KY#F2=M?MF&fMy+`{qG zcWQ$PM<`hsjroz%Vj!KV=SH*~s~;0_$551TnLyli9yyBu0R0O9LnZFc-0h<)kIJrH z^u?Wo>`X&F0$u~KR8J}Q#j~6ztqkf?x!IqRw6k8@eI>6f31(~p zlPO)m3w@(JgE9XQ397zCFdf7}^knywQve;IuP)&Zjak3gy&7jDIOg`fnUVAFkPila zrZv&!A46l5HhWzfoiYR4pxWG}LKX~y+GI~>24a_kUcygn!e6f)c@bRvJpVLPta}`a zTm>W|`%h`;x{>wT@8frQ4)tG_uI9sp4H|tl=V)3k1(4RT~h9`suufYT1r3{^kpLTZLDvsZ$e%IvP z)1Z}^kM1id6>#ycza#8Xm?LGxb~)ji>E_BrgVxlPTjiJZ<%y$)o2E+*pEbt&E>krI z+N>GJD-0Jj|HK=5@wNlfULRDltd*rL!O`ohZA(b08(SIY6D+p>!#K7tzpE@I9X zAt;E`CPt_q^+TVkv#(w-Bgl&2Wdw3k)_-Jq0kijA{c|+vIGB^Ny-}1_5A+;!yjl#1 z5lR{VMaJ99In&Jl@X#U+!9$zsN#+Kv`DpflzdQ9Ew^SfLDug>$eM!p0daVvgE{dMT zpg03VvD-jxb=HjY9FusAqAgC_@d15HA$6=99TY(cNQtJomKY_e>wgSYQ5j%I7`nBntojdch;%0`gktcth$~?17iYL$?-ju$8(E2- zA2cb8WRSfLK$!SY>mqn|{YB3{Mco{18)hJ-%NM}Y05?Q0;$OulTzBIChk_WG9T1U@ z-@VDvA$$#RGgZiAvCowcuK(RZrMn5PTwFvX4NdXY)YOavaFy*5D~w8Y85DK;(P;#R z?efhX9FTevE(-@ORyfd{ya_TC%QSXG@O=h|rV_1IPd08}#DB=_xKrA4lF75XJKwER zQ!#CSuhIK!<;thW?afH-(Z=Zk*6B;0^X<*v+@<+yIyR@?!^I+wf;uwI`)^<+JFn7P|4(F1Y1RguFa_J|M?-2j=SV$ z{cGSZju2D~1fauv#uzNfzSvJyc~fR2z~eF?69d$G6@p2X`!@G#1mAPzgF#y0V9eFK zU`qj*pNz+d;|Ao>EBa1kmQI;6=-}KBaMR(^{iwiQTE6=w+CNlrROwU8=#-EC`hgI@|wLUGa2sHPkY( z*|P6x>V0k=w6J&;gCWk^sw$0$B|$y}4IDfp!?@{Ya$&+^FePUe7pGFqVtM_`J zD|Ad`o&dcz{$B~xdhwZ=Fg2)TrgQBB{dj*vcAb}o88&StBt}L06p7yst<&EMFsEYVg-o0;%|<-1ZUj=JBAw{JR|9+=7j(Bl-dC7iT2{L z245O0esSfzvH}f}8UiE|xmDfV>;ohH@tHuR-eM?SEWB*SHsf*8pD#)B4alj0rUqkT zv>(Y^7E}DtdsqL$ageN-+?n~3)ir1S*7N}rS1gy@3O{6oKQMSr`%#!h@O0f4w&*fO z?LtX=$pLjCKDOeT@(EjY-j^M{foqcjPs{pPRgy~alj`{zZ|7SQG_YNi2+cG;qDeha z{hVd|&`MhaSPA>j-EC=B?lR87h*RJbC|vUQ`JK=Z^~)vl!vdjGmjIt+^N#Lga`67a z8x#3gzWKNL>A!dG{t72aULXPAH;eT6#Iw?!F6MQ590F1~%xdg!kb4SXQADqCPF(c$ zj$fBtqPs5t{qOJsA|CnR+S3Hbm(wm7YMa2dWSX{O-V98*J4+sB>fp*blWJ3lX9irS z**#qe_Nd9X9ul~Fxl*cq&VqX9i>1QMjSgN5Ula<>)XvRgvYl{rA$G)E-4fDvR3NAu zy)vn5Bm8ZmkXr{?GVj15`E_5$`IqbUpVzC?IP;T$)v{34(LOO>jlb7n?ohjH>szc# 
zB&}TVJ|awPrhsrg3YjKe(&&01dW834zvMjBV_iB81h*A=0B6 zd9=9gR@<27cC?na2xbDjOs?am?1gE+RhB_qK<>1cc7FLcC8Ubi}0TXOCD zJxIbb8E66o2+L=$*n8q1%v^Wd^Q}MSjNLMZq9i>(A%myepf-jS?CaIqKdH=ZN=yZGM(iO~sfjU_S4Go)21zY zjgkJ|B}>Dt5_GTE{@t>^lpEjBG%U87Y#bPN!R^6AZ24lv;V8;@UiK?4j>wc!!{$Y) zB{u41E2?>crNdWA&S5HxoHjahp)=Z_08ESFfgGxz+3~XI-=2oQ5F9rYM1m#E%bNd> zvFiY*x_$owu<p~Y_Mh9|ubjNQT;9)ZU^D9j%G z*EV817Rczh1OC4BpRp;>Ha(_HkO()}x$3sRb`!wEmTCT&q7qoDW_)jU&egNY;3e-6QBF_be8xoRAO7ijF%^FXf%lm1tfvP^) zhO%FSDou|U5_KW|qduTa7B#nZRz|Sub?U_A@M33qhO!^~5>HV<`W@En;muTy4@$VM zvA*326a}+5&zb#XuF2sH&wN~89Bg55%uMBc654AQr08&IVsUr2+wJrk0jW%-aCH=N zbV(qY9JgPvw9deLq(kE6uFturaK}jG^s_jOdAwYR`@F01&do?l*Sidj0B*EkI+m}^ z1xw0D#9UG;`0=-(=HFP|jP#zl;!IUFX ztQ8bjVxZJyAnr2vX_p`rBm~zOf>7Tc$$N=d|G794@%75ja6Hqy<=$~N6F(M{l8HZ@ zp86DJr^k{)x*$T79c*Ppip}*9xiLcj`WA+ZaJkF*vmt8AWh6xW3(qrV6^HF8y@qhc zW&J2)rht-)ommdv_esm&zK$@r;UQF=q7>64_Y3Yxj4rQv?&)K3Cgd3A$%sMox)X|u z6}Iv`Afn+YAm$@OV9VP=8NV^{-_nL*EnxS0^kmwEUu6;dy>0eEhzof3?)&@bCRmnq`%2rb%` zzVfU(42EtwQ+Y!rC+F{cW(e)Ge(cdhmpOVgZ`y6#<(-c9)XG$dYm{eW{9xNL?e_6q zA6L6;9#0onGoZ!1T{3PEdIDm5@5JyF$!{gLup|F-jQe`go~DOJOJqsCE9 z`0r@>D~o>wfj7wQy*5#g-1VV(uX5Chidl$(jr&VJRfhz!6b6C z-8DIRs&f8IG{jU~Dl{$6)U05*(-3{LT?E95*B*&7Pnbyk;|0GQs>GgzU@U_7f1}Kw z+1ygl9kS=!eA0XX!gxE36pDrCE!&+A5JZZ}@p+)KK zdRI?o^(ZeHsQz$2bq5^A)%$@;gb8yT@cDyEWFtksHf4*=HYFA7&BnkZzfT)xULbDp zaRtw$r>`KDPSkwctU_XYX&5P$WPYK<*Wa0;dwg7jnXektJY?37 zcbtsh1x5J1-F2As;}^6FOIXyqpFQ<)Z4?Z6Kwh5+`(6>#a=gEdxEaW7ErPqR@%y%Z z34_ZHBQ?ysvXQf=;J>rmudn=WJGk%JyH zKN(_JPAVdeRW8fWV#Zvv!R)U|RXfNd9B9ARQmRFWx<@;y$M!{V)^*e#Q^>7E*X;+gSi|_Ih;GIhP{fb`>gWAn<45RzU<}W zN^nY;fYY4>#3ar~@8bMJ4-EPs9hjCuG9j|M6G?((4{HwmZep|z(WY|P<4GZymm?}f zJcGe-7Jn91WNW2|_v*Xg?Ur7qZ+vrO$C4lQhr>_I!7ndczHQqZ`-0I4At`+v$SU78 ze%k*g2{~6Ve5X=m`eyej7!DSrZM*jm0%j2V`?y96w^+nbrV48{nd4ffbZOq!<<)I; z3O@?jLo?hKmC=sRa1JT2Nzc9=+mzL(t)hxsc=LWYT9_8v*YzI{<}c_J7D0Cn;0urq z+jm(|!RkVh*zvDS{*!-Noi}_(PH5L9J(?f*GX3T8T^^y<$dheZC31_S7Br*GFF6KDw%5u>7#9&kE zuCc6y`&`)PQ*xJt)IVP)+OwgY2y?n;Z)g%Cesla#s^toWkEEhDxxW7()=x?gLbsZr 
zgt=)JG@}oiLMoF_3PV*-1Lt}QT)wmv=HpLaCZwmOMS4J}x9-a~9G&dm!*-ASb(=v% zA~uWe7l-|UI4m|=>B*@hHMe%EQxogHWsYdYr1#uT+1EWjcgc;;KO_P}sMEV&)W}M& zYQtl<$tqnwyip(%eO~49<%Es{u#&Ek>ao30IYNls=JM+5FHis7h=j6$W#+fnwkuTk zcC~ZOw4g$tL?&U8>4M=9XK01VwnEy46P-U_!Jdg%SCgB@uE@DYI~m&9*%_zM;Rb$r zBE3}EIh+BV8e@uPN8`TYu@rSYN}ZcoGybTasR8RC&5lpEX=l=4;zOIBYa;uNVV`v6 zb#c-XZrUBG%Y`KTN=tk%Deuawsakt2@#-haf=t=Nje8(dl~AD0qaUf~{*z1*g9@;D zH3lAp9qB=#jUJir@Z^cVbJ3$0nt-*SxGXufP+xZkx-DcBSF+YUV9x6BgO8#kCNUG= z(-6da7E2vV8hIysl;XMxij^qmqM^lG>D+RtB~O4MDR~MMl2hM1S3rp9ESWIMTaII(qC7?#pq1z+|i-OxhGh&%5HvAB##j-KjPzLOfJ8#W$^CJ-H42r zvt&W}ITT?$znwZ4%^!jc(@P*#X;JO#>3Qr)(0qb!XlU?X>EaWG#u^1=dgM!+rvumb zbNkD#?%5)x(`(bv63SZYVO(FSO0zMTHs$MZ?0e(@X`E8*!5039he(?WiH$!$(H!s7 z=XlBb%+cF$H=55JVt3jlO3wTs^<4A**D*^vwwPVr=B462t22`aXUwDSree$?klS{x zO3*T@2xljG_GSh!q<`LO2{$Ta2ILr_IebZp;bp~!}qclWKqAIHd8l62& zui_gDe^qbw*!?ZzI|C{G&X$-`jGryBxLPiuTgKCeW62*m1f@YcZQ75NoMej!yT?1v zE2!ACm)97PEu9WYy}^|1*5qKIwiMeeE~$K5@XN8v(AN7n!sLpRgAEbfJ z+ zl4I;rE8i=65Xb!>j&`@7So~sI{vntNmt(n-5_5*9ZzfFF3|q;`Nn@j|FBQs|H(Vx; zDcWx)OMVdrtU?C#SAsw&(;4ApI&z)T%#O5V+EY!@g(V}~#Ut&|j|azkB0=GDla+iu zOYHEzniRF!3_g_G9#(X*5zM!j=U-RGZ;Le-?ULXi=@3Dm$J>n#X|m7@2lk^dhA)RQ zU@u#{F5;5+vai@(EAP{|*ueLS`pafEiJg^qKehZtSAQzN*H2~hNN7~Tuk)Ig)_wa9 z$&)Vct%%p;4DS$sgNX2uhh#BIS~Xq{^ zz{+S+o~N0aUOs&2iJc{0f~sz{c6i)giLB36oxc9CTTIFQ@;4IhU2gUi^1E5{M{JfT z#)O&QU++Z95^EH%Bw*nufC~is@Cni%B>EYKeE*byoQta;sg~h>5Zf!mC>kMe_ySap z;U4wcoG-{^`zsm^4nV+cgif8Uv+bkv^51_6#dudzFt`PpHg=XF6E0rNx+h{apC8q` zTOdC2=&swl8E$YL@3oS7LkqZ%W$jVU4i=v6d@JqX9~N#Z0}H=+xLJ?qX-uFPzPfIhI#dyU7bhJ2LJ_95V^A1;>Dh3d zgS0;%f__I8Ix375Jzt#345IWECM$9!S^W?+a2lPLtH5o8PbJoi(@ouH@Yy+img=ru z`d^gW&E(1pIEbp+u8LM$Q=D1s#1Mo(ZNJZ2l3~R?nWAUu?W*zmhUon-R*Fd~gzl6- zl1v|hWEvfY#3Rks)dPRh>2QQ!gwI@6oweSs(IZFNySux!cTzY);dpSLilUs%qpKbg z9Amp2iRCiBOjU~gg9nfpUR)?TPSB(gt8?!W<`uYLV|@=H{d_0J+y*oF-rB}`Tc5mF zM)vdWcup$j#gAS5H*`KW?r%x;l54(+SRVhxH{rm~Xe4d@Vpx7LeZSOC5Y~T>4-2Th zV;S-AH&v%%0}FM?Z8bf5?Nu-Ub5TJkCQ~|DilfiQ(-ZX7YWe;71eH$8o*deHa%fFZ 
zP;kqgWZYoUeTyY}MEo7S-5!>=9m$49`sA~;Cx@!@^716%EWGr|roe~2X_WxDhJq7g zGiWAr@1f65&e;sNj6RB2*l1Y`3ZnJAdGjVfsqj2BavmhNfE!1CM0@B5Qv69L!f6*y zV{A#PuF}qgbH2j$@iS|^B1Q!<>xZ2`kz{($W><~LqV|BOALU(rtKB2!J=Su49=%#5 z3#+PeI2ww;4RN`wdWDUSH#B=+(ZHh(lOUvdQ2WmC521ipSq9RUUGb6>_$l*<^K~29 z39LLEXrfp4u~P=RMCTo-wpF`1JtXt!eBs1Z#T8+QHm$9!JZj_Jbt3F=;!Ze0==n3C zktuFJaGYz@E&b~VU!U}d^%7ic1|3|=CGVGw)M}ZiWxDwU7ois(N5N8J;pcLhMsAu` z#)2-tHz7x&IXCyqbhV}B#$SPRGAD;ZAnI_Tk+WucBLNdR=71=q=)XXczOeRySK#|; z0utsou{KeR<;18{WC@eDJb)}nr#$`0IVIA%LvFWsZ42P6X+y~mPLgDbd@Nn82K75;`G zHOZs*oFAI_Au~i|2O~;;S#3K~3N+q5DWOFAF4Fs&sANh_4zb?NqU)sT%sMznh2|IA z`h_@%a)+&BgGh+#qD9H}=$^QuJ=+9cXQo@te6q;{e*r6VJ8C>|Fb~&@1X0Kv`P^?2=`D5G2`{!%f4-$XPtC{gkZLS~j@N zn%CQPd;fcf55%ytaofX*ZyI|zKU{rwGHhO&PZBDoLED&8>>MI3kO{rtjLnZfCLjkV zXQHaEW!#VLy%sH6ByaeX_}I=d)~?m_=?j$^YhWn6Aa%}|bjx}6wUcYQm3u?)bCxt0~ zKO}Mbw<_o_>cfi#tx8z3IACI8B8`3PW&K7wFrrLiFMW&@1{c+$xLfI@BwTU`t&`R) zZRPsQCD%6Gh9jFQfk;2;y&opM7D zKiioYMKbSqMT*B$G8`>#=i+j|9MBKR0Z9DW^C|?yWxUYfMB6(9bdqWJpTdV1ds)4dmC48 z-EV%XVxYI<$k&Y1*!-FLi!PXDvw@WD*5(Q{&SM9JHbtQ=($d|Zk3p5@h>h^4tg|my zPkhYvgkd{CDhPu~Md(b(FwUPwMw{o%RTU@q%g|^vAjatEy>`bA*`fvjmB9`QZRUAP z{K^jd;x39}mJ~iR!jjy)f)mV^g&PhYs zqqx{Ldi8a#PkBLr?qe?lBUY(84}P-9-CDJrpIn&CHLC3jNEZc$33NwjPE$=$em6uvNsP8d2>bA~T_8nZPq}6(xVG({#x#jY z0Q{O%v4m(s;N+0+8{Ru{@g%Z4nwBM~aGa^dS}--h=JSH*Q?;sMPX|2# zu)9pet=e*~j{90z2C8i_-*Y}Ey>)(Se(gq(Zc%oEB(==?9qAY)5X+R)jF!JdUN*F& zBnzGB*T<(@lf>}5J^V%U1s^c$Za;D4HGi^LGz3b=*+XZ|W6r5ISxNOZ&ZOJE6lR{z z=6lU_=xlH%qqI0ak8?6XC00wtdITpS@P^lZ6@BhXzIZTF7`_+}zARbnz=1!N*$GHh zfl=8ucd3apsao(_{o+cND@lTC>?lJS^yN;uPqe=p*T+wM8`HeuiFCog*A_Luz`>y{ zTLVKq3DDx7i5CI53`Us4q3YZSXj|2)t%7z;ZSvU~Z-JN~H3_(YoWad+7MX_+dc6YE zQ)`yEZ+>pFLpi4`51=88u{%3)-zNUkw>WN<=Hxi!4OTzidN}U<`j@I=tdE!Q;)KJ; zcd{L|w(rPIXuG*Ih6{;3CN*eCHYqs?Ijb7UTzK-Tp^>`D=QhQ9rGJsp9_jX36#euB zWyTtpk#=4MQ>z=Cd*yg$VPPR-^(<67XGbco*c);|PBJVHLf3tml|lLsLU$7>FIt|+ zavuLo+oE(1S}vnL4(RW4^ed|&QS?w-(}yymUDUES6}Qk(o36Du$GxPE)rpSm5m6|h 
z-Yy#I08a;(a&gsV>jFF*6-%yO+w;KHd(;C)!MP6UIRFti8B#rX+1KYA`tihWb2dwT8>$D4(n%Q=o`soZfP8<)&99rwGQ?jtd?OtP?p?Yh@H=;waAq&HXLM`-|cK0ASWI9w#0 zDl;f-SX#*v{ew9SjRm=+7~pr1Eo1#w^0{PAP9hIKiA|AGRRKggQ3yT{Xqsd<#09!$ zP!HfNa6TOKC%-@_y)x`6P0#U29*>4@3)z^XXHFask0tA%^w=QiM1gdRD zn-n9HKiaRE;#;gKS$r1Pz6^hAJ1ZvhP_Zm_bXI*u7kz7VIQ$o<))ayS8>1OXBYLwL z?4fNaWscakWi^HS*uTk0*OE!vJI|kBhcQ-J3YvsSJ>o!lJr-P7()^b6hTZ|f{fEPX zeK_78cW@Ndf2&q;gMv+bx`;GHoq<}3?cV>4C*U7`>E+G8!(Y|dcTjV;zYvet1L)#b*qu;c6wH;dJ=|9cirMG@=-tyH zYFA}%RE0h~uNtV{J?D6peRd`c)RpdHgm)hg;6_YBt1fs_1Zsv4ztT0S zsi`?tSX_K3wX**UWS#}2Dt*v6UDL)EFt~>J>!r~x36{5mlsg&e<=#tb_3WAag;+_5 zy9|?!l0YIfw%fZ(f?k{ZY0;P}X$+gI_u3^{z#-wSb-*@s<7cJas$Gn*9p77%t+~5? zvN|4GNJNd36Y%d>-pD>H%0dkVELbS1U{37am2d^@lzlW+D&+Jl$jRiES^e%`-ufd* z{YFYd8Hm%^NN}%@pj%s87lZl(~rBcD0=`k7t(Q0GQWgeMZSo-YjvIKX!5oDdsSRY_Z9f+9$~K-+51@k`^8npo?fhfLjcIT9 zVyN5i?8+7j06de`YPW_4yI<(HgZ9Rb1*oQW6lD201q2N!a`wg(+xu4*?HJMH9L_k{2tHXQ)ychuO!dQ8zAJbZm}tw1#WztsFbJYW;S}Grr+kcAw~Gb6)65%?q5f%yk6VDR6|H>)9`a zbtf}guNt(NqoRK~D&XZJ+AmReX6|-8QPHBsWNehIO=I#vl!8X5(l9-5KHiQ=E@<)g z+W2}FwnTa{Vr=|y_c))yKp;tXV;8+!ndPX)0@(TS^m-v<8k=2Xv)j9?oY|4a1}Z>D zvNj3`_x~kw{zri}ALPqlH*&>-Snhe4z3$gDcWR^|YX>d+YJEp%XJN$`w2HB1fG;3t ziH-re`2xlqf#@O(MM~6(0S@SvW9!OVTGEm{%iHWz<{nn`4?MrB-%`Nfav#@5FO(`Y zF|*wH{oYrsunTR45PT?g={5=t?%YC}H$UK@hn3@=bF`(!$suF7uSg2x;t~KJ>zh+7 z%Ot5E&VJgWWtFeaj5}$ApXz0-K5G*)9kMQhJ&d}?-*WVq9b*nT%rLEFA;Q@O#8 zMf^c2@{xX&#D$K~X-b#g6T>V?PEv5BTa`J*Kgi6xnbvMsbZ2ny-#aW{X3LepVq~q%Irq;(0Ln zJ6`4H9f>0H z!L**pB{eU}{n0Na>>wX}0g1QnJVV*$pA7nSFzE8~b!D_9wNpl}^f4)D6lU@Mic?UInyv| zPxDm+W4DCBRa%yN>8^bS=IKQolMA`|?*c~)+h<4?0r-^{NN*GSK zW|%idDZWfmTKb`)*OA;83-Nc{3MzYvTsEuPx6GMsAq5p{ZEMr2 zgT7#rlCOWBW6YN^7hg;XF_t#m2ZqSd8}W3Ha4o;^^%EIXV4A*c@;CaopYeOihuF1k#{^&OB@<0L^|zCcTUCIHS?97m()tH8NI zQunDBPz0U~QyW{mPtBAgmTI1kPZ5E5JzE@?s#{g0Y3}|v60j}{8`r~>J75elJ@^O% z^IuCD*4#G;1%Tp7*E+9PdGlx23@;b@q6&sT$=ona(FQ|%eVkTGsZE7`!cJey?bzhv zU6Lg=cR)zh_x958{@W!_m}2bR8mexoYtMKCm?PvQ@_;EDlE+YzF7F}ZCi{a{{g`ZM 
zSNv8HW4itMC7grp(T}|Wth`UmDvgbItQ63ah;L!IPn}C?e?F&e&2kVY1}OAfy8KChc_ovFy5OOmJJEcGb8`o*GxDM7pZ&+f0?hiK=DFZ*Gyh9FUt zB$>IU;oW8!}3K_U4g>a0Qn4^rLSGv#Gg*CtB@Wf zz<@TRP1@H2cXJ^H+#P`sSu7EsCimVB0k|%J{nUbnNQ(L?-c(G+P_ayf(xn|D>CLl_ zfr<^~mgnM>gnWiQj%rO&(10B=#f$n6O;JV{@cMOm_{7VQ?f40;6nn20N+d$C{!9A8Ii2XnmFJ2dL>Lg zZnmPQ*N8#AnX_oL)xpDKQ?fp5qI9gEuwPDf-TvXLO7XdLR7G%4BY#~My62g=pY#=C z5^`#0DKE%*XxSxJ-4nCLbl0-)oQ=&rK+ai>ZS6;eNatQ}PSM88krq2dPkLLJ2dxxR zPC3|OtyUG3dq_wLldH-A&s-Biuy*I2&;0mkll@RzD7igD3&FbFI4kynf>&8P1$fw= zrw4h7zQ69b54_yOFw~Ky_Dr(Tim9=Nus#45-kKt2UH8q-L~TR3kxYd^J4kqI3BZg@u97J z+0;lD*IM~1;8t!G0sgM!hlce72M%~NYvEUlj&yGR1vE?)7&Ei7nsOR(+0+twy|NuB z&0{4H;MD74hSS>9fQ>Qfz>Kvlt`J7fTlW^Pd^UBDAG8L4UZP***mH|6p*!iZ7|yf0 zwpKBb+8{5)y349*=~M2G5%3k?X#MwIpfxt`uD@~H>tW3r8Y<+&AbP0JKbIIR+#J`an`2GEF$_z=MlrWfr?zEkcy2@PvWq4>VOgZ}yFC9pwHs>n}8KBpdyD+8F8#gf&r)W?OR zbLZmT=YNnZ%Dp5)y3lq=lHF(EF$gr?syqMH)ye+JygrFSxMt5F%%sCppQh4{L2oNh z;E;bQ_Wkk!IywW-<(ZE-Go3|ctbX*!*73S5CIcOvl_Xgv)xW1Pj+8YE`%EKaVz%#)m=A}qU)JKK5n*F@x;9%jaGNhxT5 z{4049`9f);sW@q@?}}il=)8|tMb_=8cQm0fG1Iv(KHZLc|41c<4td|a4wx2PY~iu* zT>ie0FE$JL#&IHP@ufFO%^{ad78PI@`XAm;7vejDY z<(cySM&$Gh{Vui$0Noi81G5=pqO)ilY*JdRpOn!+x~T?tzi#ySS%UWWH4xTIr<)c5 zmfdr>j;D0p98vKlZvDb}IrYps!a80cqCqlnWW{>g-d_vlG6a(_@6Su(8|VjY`#~d0 zgG@TCr(SIE*r$}_y)WRa24iFG^nR<54LGr7t4gyaM>n!7E4)`_>K*rR62*6^Dhnm2 zVcLkkb=tzRrDsQ7p=C(nBz0OU*^r24t*=KRc~;JZsK51HF9R2=#6f+{ZYBIQz4Jt6 zx+U-J8%B8pW}fr*!F2jtYxsmb_rG~Y`c|c$+`PiROI`u|WT0k+HaT^uCeaq!@~T#E zF~tum{_h7gnhMgX$ac~~n$fe+NyTqB+?iu8yOvAfPJDilh`5V-R3J%uggf3I6pr>K6jr-m!aX7B@lLx>%nDw>E=RS$@*Mf z(!i3Et1#wau4MC#GamU57fqU(R-ZFN#9z26urtLa4S<@wBcJwc(8q;v*WOE8z2XNf$$+QTEVzgp_= zT=~8NL<=G)$;T`eUzSsox-JV0f#LFnxm&XbLSi9Dr3}l6;f&0U(5hvC#desqy|LNL z)dfH?9-GgI?tQR=`3i~u?JX=lMbbqfeTJv~UvFAzpkZ>o;&IQ|&@iymUkCp9` z>M^Y^-LW4z9YI>s~-*7K}w;%SDK6@#?%K`{=m#fa> zWrM;NFT^)}SIma8^C~M%vscziw>GBxMi;toY_&RjOr*}PGwBNF-EQPL6VcFTGg#$+ zii|!ktCui>+qx4y^0BLKoaf9Te@}$uSgtzmJmFbuFr-Y=@f9tL41LR0XaM@}8Mn8B z(7+f{sKmSvPI)6c2cst(xjOfLw=RAU5i@r+H3e_klo8@%WAuchk5(r)z8@9;`048| 
zMc1BOUERbI5^6G4qs&2fd%mq28*M`FOsW15ta9*0fnNW4av8$;ZV6 zlDzOrW!6e14BevJDNAX$=FboG6%CY(;>@K6Cvg4qGO<{xJdk?}gQKI|N2R>V48Z~K z7*BQjt^nWP^<8TXeGm`rJbT_|47&pW)l2iUvQ9s-7GaWRls(FO_J_)D7{h)#x#|{a zovIy`+qZg0@Q zW;|D&Be>}et~!ZZ0ui2DRsIZ;**#t;2MM@*6(b3N*G>oIW+%D5y-S~%bo8C_8Nlh6 zdWd`bLw4_+*pG*?Antjl8^9T2l#)($s>A15?Q2RZl@E^$gYpz!-anF;b=s-i=UkJ zZSK)1?z&}Cyo+NtC}sV`6}^U%KZkckTIVfYVv@Dlp8Qhl>8Cny z-71!8U1wXZj*}Gb8e0+U`&*m)F0+*38=x@T9rUHg-)a6c!u zMF)cls|;0UrlyU0aM$|L%fPyePs`rCd4q>oGhWpKVobcT6-ckLDh=YKan{}Wv$zvC zOD2bJ!*qKd1eEEH)g_dlH=coGHDKg|$$XD*;WZKQAg9W@Z_L#guCC0}sCv5-cP->t z@8T_OFusoC!I&#@<(F1(7QAc}Az2bwnEZ??45eLJ>!PTj_`L8beU=7s3wY0V(FWIo z_czF2D3^;H`H+xLL_*1{v7_kp@6z>sr@S8_huL3JW-KJGu~236erMPhZPx}hY=)HO zy`w%D^g$$yj@X``_KW_{Uy5B`iSdbjR}et^QD9ea4n=NI+<1JJUIeA{D|18IrIf*O zntG+TJ&ne99mmBA8@4TEzST|A%RA5{8e*b@H#pZ?@hEFOqHn>%oyV{<;GPuE*6_G* zhy1m7JH5A|Dd=V%{zYM6*!-VClp`c?)*}=81z0JbJ(lXOGpFjT*b$CADciGNwx^g- z%Zbsm7G!OmXS?zI(*18N<|QSrJYPzXX|ss-*z2dXzxS40zPc@j@%6ZDzolV0RzSM^ zfNWmSyibmyQ4eNug|)Aw*U;oX7W2l54su1CSDkhT>wUFQ=Y)7r+bKqa#KDWVYmr9n zZGTZFzjfqAuSwkdCrlkLvYWvdH>GD0eicg-N4Zbh@EE!yN|U~e!I2#nM0oD?AvYk{xlg~H0&9hpFUf#6{$F0NxzP_ zXt5n=ez$qj+g|M={^|89-Gq3p5em>&c{%Qm_Q&%hMIKXobo%j z@DLf9da0YMpO#<<#D3W$rKhXvNpBWzSGEo~y6pka5YR zyz`Dg)cH!xSiu78p@;lD!;syC+0rNR?YnV;S5ng8@>uTuITMR1CfNSY^T2$e#tfP7 za{RkvzxW&kpN4yt{-TSoG}Zzxo~?%~!MOv09HFKKNiN|R!I0dN5x>?105z?|B#nQ3;ZUj^e|w78E!Qa+|0VZ}2KtSr|F!o=c=5l{HWBguqfjMep%O+nt7upg z)3ripA9LnkW$WwfF}SRWJ-Oz=kY?yW#wSYsGivb5ma<(q5uA1BR2p4D_UX}Fg|ZBd z*^Dk`)V?uwf6P8-&Irep>v^qS`Wveeq!YIWCpt|cr|)rBbHBv12Aalk#D@1KW%$|d zW#*kkNaaBmD*2zp-K_L%Gd%CiVlRbod^wK#JJ@AP>8{1_U@2P zqzIMTB_>_G{g{KocT$2n?04SVR@b<3P0C72I@cj+_FDZBj#dYYkm0bZyr_OuL{q|w=S@$2^A?weY8qk>pOEsBihk|E#xdNJ?8R5N@py|=QaoJ)IhCCvkFCz9?fpwZThMwYw*dZ`%HtjyHRi` z*B;4yCk7Zt*Powo3s3ocX>m3D2)LqCz}wz&v-$s8V;MOlTmCZx^jA+dC8zYtON005 z7K3lo^B@u6Ru-%c>lnxFGr$C~!Uo+!5YtBU8F^TDLip7_DqjrWvscg#nws%jFw};Ew%*L91@0KYahYGX0?Q{^LM- zX6GiAQJfKh3!9hD(yIX{G%_qm_&S#D&Vh{6bUFo7--rfwF%Ux-wQR#h` 
zIOi_S98v#dHV59#*4ymHtt)JxWPzvh0T6-L5tPh0@A{65KOX^s0s}e`c@hM|k$&JYTrIw3ckiL~jUO8) zS!GjM@3z*hHTAJ84b2{xmC88WBabc~sQkEVyOa5q?z92a)AlnqA;fngNJ!V%L_ZwS zuPqT;)`Y z`%b1owWSg|@b`%M69yPWK{8GulFs?9L^cT9h0j5IApcKHB7p@!}Q6fambji|UDGd5QN1-70Kji-)vSz-Q%~Tm&0o5|8 z^Fz9_p`W(#e`Nywjuu*IyK|ymhgoWBKxBCZyn&UVwGmdKhyi*>(F*%RBO=!L!GpQA zx->wJ9u7JBMw~)K>dMb2jcUd&7xZNX4w=&~x~4eiDU@N1D~mc^B1lQ`3(aiw?cp~P zV-4LSdush$ijEpImM)Zk2*tgbp?FO@D)4ei;|Oj@KJMd4nfp0#JV*9Hbim>q@}qkB z!|Htdz-9oHpTj#pHOjbAmMQZC4Bg3_=z-LmPap)$0tO$)WEA|gi@z;Zlmn(vQMs{_wf0JwEb>-TL~WT?aidvgw~B<}g*suC_Zt zEjeSvt9u9S%&LYhk3S(^%1}@^I%;8x&`LVLx-+HdJBPdUtQ8<y<Fd&v?*NLJWDw{JHjEN<_vW&1dN?nZgLne^%IqEv1wK=9(#1$(*|>6Kgq>osU+*Q`ul0y-1gRR z&Y65HwwgL;$)?i}rzDAjD=%K(UWIakS%!WQTfN9T!Mi~3|F`D|w9VZ=I$-o_}tz2^IyT7;^j2iznVUwlCAg`9t9dl2o4hKgJ0imWr8{|R*}xbI|2J%yjb9p z5tibr^%a)i_?*kFa%Z|)*-VFEA3T(pbu_nC>!BfPqF2+#dVH|wIsdWHB@dE$xHS9* zsh;B}QaGgJwwN2b!SUIDo#PNQ5pgq!vtUE2_0)fgvlLPms!b)ZP>_%BqUbdFU1I(M z*>zgAZ>v`}2QoRr$je&NAYMeUhSIed#GN|5S~YM>50_6`S6C}`waOa(n4;8SUj5CTN>>#tJ_&5u8KV4AB!RP`&%lH z2%B>fDIdr4g-1U}&f%nJWD4C`Q`C|@eNLCNFt&NrHx{h8y6Wq%FFVCl@ z_kvPU^jKZPj?8U^$H4pX<;xD!zfp8tK^uadhRLLvt0G^_K9%#3Gl3PjMY3bCWH64%E5 zO}^0|spbzg!6_W{3DnYl$kqDZ3lR44aSJUH2*$>ME8b{n`Erc-PtqdafUJAOF_aDc zy*|(!84?#GANQ&yanoM*H`D*+32-h3Z`EaZ!o$O`T$_)q{=qS@LWj+?{*44*9+98| z#*mWEy*9D?b9CPh-?CnTnMlQdd3u=*N$$R;D{|IC=7UJNhdu$E4R4PA@AmtrdUF7$ zldoDm)89`orOo5I=>K7g_uu45UfGb72pjqqLa{IiZ1oQpP#mO2w8cUR)~0($`449~ zP{p!yx|r=%)_Vq1Qm`LhUdB%GRyHo~OtgR12Md8})3m{Bw<6;>dBJTt?lEzDa>su9 z#P{s+0cU2bO>ZZ=`^gTQkh)f2KCe0~`uXi5`v26{OGq)|b?0!WGPB2yIWb8M=r(04 zT;lcBMKT8CqfNLc?Xfa?vu>@KLS!>;1DOw|L=bYFM2q1bGTL+b3~Wdr+bL3Z(n5HY zw&hDn<6>5sK!uCW`U)$KU{cC-c~1YD75t>;w6wSr(SI#O%1{bcvfaCNvYldQQXxu+ zt9lD^t@u~6e1^m%Y$!ufqMUS+532_S_OnGScQuoINk63eK<&aOoanO*%Fm;HN_x0O zkP`}RikGE$IyqY_QP0ZjM3|?)@VK{i!6c>45%b)Y?3}Wo0P68^Vll z=C{Qc1(YpQ&L6TEKU<&M%G~A1SjEuS$_sdtUw?=8ps*0EZ-2aP&{4*BGr9*ODlG~5 z@?EKva6043ic&JJHSbrp+78B`@KT=OL{`U>1;79JAJmWn8NoG3#bV36w!7J??kUe- 
zA-}HErZkft(xl6S&XW*pF+KH5=0z`yPA8$s+ppNINUut6yVIIWmaWMQy`1-KS@7@W zya11N2Kn_)y3$~Z(FK?1_U9q);zKk)uJW|{e=WkW7|c?YIo8i{nfUJR2^U_|Wpj}R ztqB2|JRRnYE^0kVM#hNYh136Y0A!&Ni>JPZA$p6FeMtOJS6^Y|QiRlMUWCUE!c(L5 z!Y3muSuxc&i&E``tAO(--G4V2-;L7=Wy2;kcAL_3tn)NO`ib!aom|X$s=UhV zR#{V|*(_4A*;1H2&!z)SFneB2w+QKnNJ)|uZrELr%xJ|(+)(la@s%L}B+Nd>q@z2-XrLuo^Yn(1jfXKDYw!$9w7zeq-xDII|6%0c0t>_Tdpg zI(e85podr_wiO?ydrVd%b~qQyV4Iz(H(wG})=fSA*QB&i#5QyYiK`2_Gk;0C*zsbW zn9oaStImZu|32&0=Sh2j_Sba;JvS%vnoWcLUbDqYlL>7U6f&j0gnx3BiR~)YnqPI= z@%&Acc0;)y|3*hFI6~X|(wotVuDynBDi~O1C=DpLf{p@=v;B*KL(HW-a?+@G~d)Gh#5q)jayB-O(J|Y!~%q}C{n5aclz=&!(NEW#~f2g20ljQJUm-xUD5lO zmMeYYDz%2|g%tclyL%&!*GTp4^hgTbWBD2qL&z8gBdr*%EX-QRaVz=G3@4Bc>o9ND z)3mC5z%2O78{Z79Hn$)Vp_u1?EAMiL^#9m<^LVJ&_ka8(O4=xzN=bFvkZH4&Xj*8I zLdI@rQFh5bNK>>~(kUqtDPaa<-$#x@heR@VDkThKpRwzEy{s<|pYQwg`~CCzJ$|3_ zc$~+XW9D_g?)$p0>v^sBbtehpYAn88EuQzJQ*X`YZM$4JYKLg))o)GhyX{+L#}NFjlb18)2Q7qZ0`@6(KRZKW%H22=XmXh*Kw{pONV--SC>>TC6iV!;nQY6= z2s9mt|6$kj9OC5&tvu6l5bKv=U3s#Tw;~ma!GuodUTE`>BUu z*}tDdR_OrmxTk+%<#QZ~yE7+2bxWqh26`X)oB$&r%Ux>A)Kl|UoY;7Co-Gw|(Ro+@ zb;EV}+7&hpd+1+}x_91gA0-)5C3Bb-mAZdU+e8>F?BG!g`zbexzb(M+AGP`SAMM&; z-6W>GJU9*tWt=!Vi<{WQtCFNPPSsGKHth#jD3{|MU)Fv=!u;R1QY$RzrO+Q)eL3&=^_vSIh%=3Xrz2q<@5Lc|Gs&4VKFC0TKlzL3w}H{QjSERY z*5$()h29UUoxR4kq?A+ADMQfPEM{*`iiN^|#>8&-L~7X*URKfBTAR5O~@GW$dF9Eq8WotC8aBeY<0{K#*jN z#97OHr7TgiLJ=Hf;TvL7ZL>p@capypMh`x}X$R$1rh~e@@cuHUYj3T6JVk?gnQk6vY-?-#?#lb+8{{&?#WdQna^?DP zL58tlOh91ZPSp-+~pb@y&P1c7Em5E(nEf2*G1W zpctxeO=klX`6m>^R1OAi1Qn6Mk0 zwnc_t(`+e287V1G^-`~Gk%4j$#Uk_L!4~JwZ`9G!`sacmHQm15&N*{%WO%T@wbjh7 zr7TJ;Y502R^^1bk`3slXEcv+pfK9lqSndWY@sC|>+AzCX{^KFqrV5Sqy-w{kjbrXv z5|px7s3cnjx9df{iP!_Tr5te{CO8a_^RLTF&>(`L-qW^a8qc>NX=y$*>opN3V9$ME z%~%R`|H?rwR$6cAUkI7fR&5Uoi&oF2sOM1B-&54zQPi_2>NFWR{h8Jn-(H)pas5R= zfapU0^XUU0gyfu`zGao^V3+oNesvt12<=LY>;~CPu_UOX>wmcUc-jl2u-o=+HUESJ zZ9DsdKjQZpb2yJ)&Xe!b_0Kjf&q3rV_r^^>?S}_g*Xm+r9Xh_HN%AoieFw?(0af?R zlycZS^7lOr-euk$(gpuoW5Xc(=UZO5{0whJ$|8T|bfUIU^=q8ET#4 
zUeRmwt6^Ky8k06Z*9gpkgLZ46rBsm})T^c>{w{ub)T&S{iKUUKCv^ml!I}$NC=TaI z5(2B_W3aiy&0#@F;=Q?omI(u%xbqSf7_UuP<8?QV(~xHZClh^)A3&8nsf!k(fsg9;* z3cyHnTIGcMcxe<|I7K)btV`roF70@V@$LHbLN*{&nXoP>Ft8A-m&Q-8&Sys)s7r6p z43HPbvh2TQk-A(YoHe4{{EpbEJd75~cIx_h@;a0?$5Dt+#X0QxxDPsX7??H%K@X9Z zwAkO!03LpJkRU7hfexSTMgPE*Q!~kV7hKm-+dZy+g@zHF>k3Y}(7P!?l*5@OhM8&) zsoM9jK|q7+jQ+lhWfj-nz?_`MH_aCi$A$I*!SZ>fh3$J!X`6Ce%g{f zF{BLQmJB!&&_KU$zvF3dBGqLlhiS`;TJPdo()@+m3d|!8R2o}5uaOoc=Tdcm4ZC~{ zx2u{VxgZw2^dtec&_M^E^|Hi-yXGw4Y)3e!^x%{u-vwPvh0{QfF<!c&WjeS99X;&zZ^Nx&Ul?IxcEf!znk*QWI$awI^V-yQW z5uif?{6i&G=0xiXK}63u$c2Jz%jX(J6=1W)T;yohYH~I<-r;fIara-<$(D^_ zXM;Xj)MZrid<~Sg3R!oF{M7#EhlgDiaK&Rj^Y2>=3zz_FYGwsri7ipww|xgUnT>uw z#Gh0Ro7}R$mAis2T9?yk($U5$*ViwXyT|=zv*(#foh4@<;y z^{cXI+#R9H`m`Wxb4DWKB{;ieEHcGCbv?ee&XQGxOrMV)};M zTnqKNbGN*!=K-GZX>uM#@s@!ZV){Ek z)$i^s_p64};Mbf$NUjBte=@6+sYC1j&RDoy=y7|CZ`Y=9`=;WT*XkmU5Fk#^8?lEV z@e~oHoA#eG*XDcl^2I1pO^Pkr>uBb;>|=k!Wqt)0Wx~rr%~vldBe)K;uKk8Of&#^` zBI9-R?oA&xmb2yMGfl2yxrWGtn}RVwp%{#BZ@n+Moi5s5o9QCMalhf|rWkwBKI$ym z8G+u*SFU(~QIL4VNyWTW#$F6&y>$lT?{u5%vMJ>|VleYEO*ZH|cgjB$ z8NpLJaz>ic0aUVyAbggd@Hf{0{mr(cCfZ=drhi!B~*dx(7VqB9`JSP z`+mofcC8NH+YK>o^@a?!-B^4vs2R&@vLo;b?E=(Sgqk?A{-_hQ%=k?H%P-rWFIZ8a zUG-_S_C{)LC;xlO?RGde<>B1hR@?IQqJC@m!d`MCD^8qlpeK>lTV+b6sj0NVxd+{3 z>OgAMS&0h2y1Y)}mP{Kl_r4Gg{S2x)3NdPHTMI-6kldk2uWWR_ApPpKv4TfaURViG{bIQ27etz zqge59I)8>!gRe%JY`Na>5U_n&ETFmXd%6q0TY99Y`3rZ3VMKe+e)sm0#)>k1gws?G z2@QtSHU_!ApwD;i+|SMvU9UJVlY1o8g%wOl93YTm{5YIg_njg_2;?lkY)7UDGe=t0 zop#2T-3DgTx`Pfec*AxEonrO-anO}9smmG0>t;CYmH|5n?h3ruRZ&u+%EMUH%4%My zCp9K^C4e(q&E*RxT!?=u`_*fJh7YxP=A#@WgCz($89ZEq6!9J9y(>Zlld)m%C&ZHA zAtLxRXeZEf;dgDieZh@7fbAp|Rt7OAL+vCLP-jH7A%n<8BX)@6<6069akY{EPUkL` zSEg!nQH4G#k}~*{Jo1UnF1Bl+j(VW$paUm^@i&b>j=OjE;S@%fY&n7mCX zKoh1&2cFWNv+hfyi4Qj0F`~W>R9U>;c^X(wm%%LB`Ob!~OcBo(I&@vPP3G!1oE3o6 zHNl|vuW$9&$vLMu|7KTPV!-~Z+Z$a_8Rk$A zL}IC5Lk3V)PX>P>VCHCBh=7^iqQ^xN;FC1?Vf8hRPDpmbH21Q0}-kOiC~SWrV17d{Vk||Fu*v9;e1nfuda*8KG+BEsQ)JBoyFFOQjev)IS2H^48R 
zQGa<+6r?gq@ya(Cq9O?wCPEB56PPbI)bZ{h2NBq+5)QZM$_-|~%r;104R(HO`8D&J z;9(jV2T$5J2lxI8uvFls2z;VwlsG~AlY)yo!jl0@xPu!kGddMgYhewkO#Tmwhcl#K z9C6@FqActt`*x`AqEnfyOJT=*&|>h+3wjKj2^{dhVUTLdqv-c_tS8(7rj1{Po8VNiD^$VFYfOi_TX zifivKg7eI_D(9&Ly>>9zbKqp>Dgic`&I)niWUJcJ#{I2IZG*F#mep^0n#Q>6^lhNg zrN~!WQ~GZP2X5XD6uaqh;l!vMk3HpyL>xYddy$aXQ87p7LF>$f1b%@6ZQq`d`vl#- zQEU@wUcdj^YiCzb`py;>(MuY08o66SmFwU(zoe9w6z3C2WN?MwX0qgcRBMtU5$(Sr8DJnqPgtdWUXCPp$`=7WuB(c zVS3SyU_X)O&|*Sj2!V$Y+O2wndfmd-RrW(KK%zQShV-uryh)5bbdw0aw;8?)>@O|8 zc{H$u->Ru#*@s*P5?T*&^)Toxx&EJz#K=r}BWzXeTi3TWb*a{2(${{T=pXr27|_6m zg~=zz5n6S$weMfdCf*wHJ@R`D-_S^fqSO>wSDyuZ>(rHch00_ZTmgKu%dN#DX2m-W zM-~3cuC8lpA4P*qGeH%l1_lN))BYm3^>_=(rbQzkmaRX)kN={LXMyblwpbic`$u9q zq2QGXfw-ZXKTGj@MuL|zK_Ui2O7{|JN(A&8J}yipijjZOZ#l3!2*VCND+dWq%*0ku zoTq?Ii=}FSHE;Qz0lJ3&n+FrLMIAXqV`(d1v^7-H zSU)|OV5^^)M&MzBYa6_TA%7LS^$EEJfmSU`TU&{n&*j~^L-f?jJeY|6ah_&kNCWe+ zs}CSLRmqq2DdiUxalY!G!cv z#0VmRv%3mz9|NLB0vpLYUKK2h2{iU|i^i2TfvUL;d=(-tU#;_N9INuUbVP;Pgr8q$bYL5+9}GeufK zf8Z%yIgWJPS3ux6?uO7)PW1au%sSW398AN(DtsA5y_BL}1`~U}@;nU?@i;zr6L2n8 zqy;&IBLn`Y6*#2zqv}_HuQc7JSyc;f$$YVMQ#hu5b7XSTpV`FfA@{$V$GGx2=!aZK zmS^CUpNL8|_FmB95xJ3wzTZngDo)reW8EnG{DzptMocKT{U;UT{rN z8nk0p2?)$P*Krl#vntewpjM{X9(2&LOBGM!ZVt^NB$Df(y^x-6>b1~FGDd(EQk0XE zbIqwCUjWZqFx2ASHpZAKP{ZL>aB2;bF_}f#fI^z3dNykrn2t>0Ohtq-uW)J$uwb9s zHqu)sY5|UhtCz~ywQ8nfMHrzbM6h%V8h`yN0|AtVzrbB`f5M5Zt7}R=#2_Ts0S;t}zsGz}C-PEjGr51sM8x1g4R=e9_SXqOy$hFvWoqM**hfQVI5{k&?$gkwdh!^{mLkk&F7G*~ zq4*T39=WG=Y9-oG{)jkqXQzyICfn^;;aWn7_`B* z^hzKvAu1fx*jk;8rK*>Lu~6m#B$XpK9o0U2{D^h=;U$%C-ul2r z)cH}HcaBwmlsM>niPiIN#ChoKf{iryIo#lOTqze;>`b??Xm3$z=VW88Uws0|k z{vG1%M5mye*Xh5{!{2B`z3%;IThOJ>{ocTLU@H2xLx|&?#Vdcr(X;}wPubw%fx%ve))eOWFNj5G zcZ9f-z4%e{7=5CEg5dD(M^k8;ZA}sf3Z~&o+@>I49)~G#v;`ON^JywhtxShN5DzkK zdjmQRJj$%Jd@u#FiSWjqqR<$CXkSL@GHBp3#7j2UNO3xqF4y=0X zgnPjRp4QOeX6toS^>2!U_RK0g__UOlw2B}$0EWFF7x=rqX|zWnssuHEibjQE&^c+U zwhA`ej>T};5nLW&0+t?VmPAT4BGXSztAmlaQW+kA^A>5kcvG*j8*dOnXmzCAdix|si>dR_huj+j#XiKiX3$8q@Ex#H2nPq=+T<1$rtUlB71iD=R;7jvR 
zCJ;kKoFMiz*n;mabJq6lW|)HqZNkbh$;sd^q1J-T286OoSq;ZmMb< zlBi6+vuyo+7cEZqrpV-&7@u02Q}RJOh%Y+Z*(KFvLfWhnA_`sM(V#29@C9=gFp?Ew zE)j#|h#vd0FSw@r5F{eYkdBwzw_I9eSfAZ{F!KHmvEHCwB|iZ^!5vR&To zU;ioP^>(iy)#SCuJ#2n6GfOXR-Ukx)%2C25LW1lxmw3UO*8L9Ik5-7??Yt*7q#Aahe2RSoV(>#Y;5bNxLW z-m`j82j3Io@}iO4v~&=&sC>*i-i(m7e(@!TImz0GkhQ&P&3+VF-$9L(U}mZ)VgdsuyGD|ps)grR?WUyM~E(M&*bA1@XcDGI$o zaO*A0)p`*m7nbOOBYZS{Wl(pyrN&IsgTIH8q(tuq_XOe#xr*lJc(HQFaa8TymgXPL z=R4B#9gz@o(!->M*pw50*#oUnhVU;Dknzm{BY6rw-kqNUWi*oT zhpAumE{>mWMM|7&7SF|He~{)Oh9b9mGU^wnUjt$u6Q`U6#0=J-2qpBlTB}u#>fZzm zo(*2`GkK$Sc%Naw=HuRvt07>C0^^@=_bE)p>s^4`{IhtGab zH>u|S1?(#+=;(J!z?r62{GcGKb#q;U#DsLs13M z#JG0gI$FSfRJjDH8L0xPOO!%RGWBdy21}w*K9m+7XU`~&XnnW*kXVqHcNS6^!S2l4 z;1{H!q0tQ$*j;;6jIl{OAVBVn#CvPs8|o+qbspWF0XL}NMM5B zsLsmZu6G$|4vP{-JnaGl0%uoA{{taek*^z|AVX72>z;{61EK=er7f9qAh;sQUq9#XZG*vGngjrJ#rvKFRL4pfB$sl$IF{z9{#LrFv8`M*SE}&mq zBL);wI1{WFN?UzNt=a?0F1MsMTZps)#~~#*8#BWrkv!qGHdek8Du9M3nlU+K#9oE_%BlMH9 zpsRk!#5SYvIYh1o!~env0N>cq)OJ{!pE1;IgX51;#ky)A!KEsL6egk~AC!y!ii)yF zELwGGEcsX9Ay>J<7S~}a29hm?^bDi0Yjq!nw+oJzw;Iuc)hoB1Nq7;I3Hxy!y6uUb-T(lm@f0H`Gt zM{xG;jio`KFp4(8t?PnIeWFTEhNbIZHGsOvFL^++21iq4wL-zc4VL5i_E$T=6!34k zpf-zmjIKM26H!}PoBYvz4Uev@IHTN7xhY+M6xtfyvfYU2#nmz-h(wIrS=gY^Gx*+# z3~ECre;wqCKFW>s{V3a84l-YacyEA3t$+bh!(hyq(jJ&NFKUWPuFHpFjN*akBAtk5 zP#G3<-9IO3ZI5DOD6c-bSc8kYa7a?ylj_1W1>klmGg19r(^kVDqnH=MLGaw1#Jc!- zD;M*ET(iL8j2<}-N7k-7f2&o#ZP0Q+MN-zLN!jsC-y0FfPDfkY#Jjdgrc_NqtU3dd zr#XY|Z}nebT;8kaWW)}}?t`NTh_bf=nOK?pdMlhI9>a*44C8qpR91MTo7Dv8AWtph zsO4lG7?y>*yw|B$tVUePzo3MJ;cVP2UnhgF8&zgmn88;A1hWV0Ws=(ZTf(z3gI@t? 
zxF1JR;3LG(@~>0A)BfapN#pLqgUHl@bjNe2;YC?X6}v$D@trWkXHxW~+}lUP{lECC zO>o$%bo& zNC%={bfBJDYQgMlgde~e4I$z6aB67GvmX83Q{MaLRw2aDoEw#UJx z4Q26VqlHot9Qvol$sI0@GGG5W3u(Mh68L(>m7sDu(i#&f=QEAYQ9o*ML75TYNzWSJ zp9i#)7SCgIY&f500`ks~^e(RswT9f2NH?Lp5eaY}lh5cm<=a{Iu`JPCV$I*>HMkkuJ{V|0P!H-kxpl1@IO6pUqcoB;ws9Hx)=*ZoC4i=%0cYZ zkkWfg=#Rc$%b)4dA6gF0DG5B^-Om#7qCavnxqn1)b=JSyMR<$Ra2=ad*7)(snFo?5 zInKZznXDtL5ETF?5#Vn`fiI&i`xI1!MY`~L_^5o$U{RoqAAsLT=#>--egM)J45#12 zy*kwZ((9et2Fs#! zjH>)w!W{zDsJv~6vHJdrO~06TYFZK#C*b(BKM2+9sBlzQfN4R8c^sRO<6wIf&kLjW z)e{Oz>sE{XwQ;ma7B$iJK&zT{s9&OF73BHbuHI4{D6*{bHK`&$c-M0 zLIMG!G73Lki6HjF0u$BAzl?q`FacZp&E*KCxsT>20c~P(^rrd+{Xw#VGiQR0^u2ad z1UijKWeHl)YG*95D{7?D^XQ`C}P5o zZzD}x%&f*;)3LS8mF|H9TFa+z9QthwkR~L;nzg9C-}Mjy@6^bxGY%)bQ2^a;@}%k* z3d^Rk7wJ3%SC{J4uqMb36I*=+)?!7Fy*R<66H@`!nm;=6=_?z1^nY1vI%)owwf?`v zTEPL}Lz(ARf^whW_HnlE1lLpWYvSI?vyf;}M1EuN9xzwInY08AI1g7eZFMVG3V7)M z`0wch!Wjw#&?OYjL#8v#lu%{Gz=*eZwh>W3@QlYvZOpK!@{A|Nj@z`8-~K>~WHL>! z{G9yDj1O6NLCU@QIM$%E^qaR7D{)f;BsWX!xP?m1jZWkL8E)8*9e!E27CHP90F8&J zF`d)rI?*;qV(&DdMM09m9ay`a!#v%|4!xj?Vv*v-^eubGEY&H%Hc&$9>8JjGiiBgj zW;OtSj{taUiT{k#!jp%Y<}7QU1V4j58cc54_~h+^j_Vob9+UVsr&xNZn^6gIt>@an_lsxl@ZGtv|6-U91l^; z9rx!AMZ+nR_;R*$*^HBI@`D>;%9S26VAG15z zX$Ia-0>AKKuw6N=mZCBiI1Qcz(Qu5*^_J#lN6QGc0i{^Ybxo&q1HhM`hB+$$%oQJ( zBc-M&g*5dW+!Ihuan1wmsT^i-Iv3H&X;48y0sPNj*HT}8|Ni~u<<^w-hiN&YJ*=Ld zB?3I|BbQtAR?ip|X9sRfoOPMqY;8M6XO*A{$@!dbZ%{iJyoKX8PjjJ`O`NWH`B%>v z0_7I5K`vRs^w7@zypLv1Is(ouE9UIhb>{V&IH0VBtPLClAQKsPDyaR$x+{eBrsYdL3 zr6$7Knb~sLty&w(o)YfF%1Uguc^as5K5QnHz|(0InmO{+k5<6hbw%#7n5n;TC+G^F zNr?&zyArOvy+={r&AA0jjB;DLY>X708CKHz2&j>OzX z%`7;SWA{7rSEl(5*M{G{dv}y2zNIV%gwCjs^~!A5kF>@7tVQ$RaaeJxokyANvSx(WZ^^8jLus z`L+wP(*{JzCaxTUEVhLzRa2eNNFNuMs^YPJ1+3-p5B=5?*o2@WKaog92^w*k64HYh z%{2zlJ*-H6R-qo9%b&n!e!P+AQQU0gflD;8x#+7E$XQ9JdS&d-bax&+8bZ|d5h?KbVs9`}dIZHjFpYm8wX@}l>EvZeFtMs_yg+zyzm^R`LPBK2(34y&a-^0r^~uqa zX-k6@tkwYT$(fbxb8=5Q?m3w#D=T}{{m9F=Pja>jlu~wCY+vP3_lnZsGj+kx_@KE0 z9g7!WPzlfJloOr}A($K|@pQ#q;Tj^hpnqkx>{BQBLPMEWGq_8_jgUIR9 
zbNEf})$xC#Xue0BH*AtN_Z5AQDN~`2TDu%$zhUY=i}ANrf@@Y#_pEyNj44lUK=CHn z_Em=iZcniTDT(`i1X-g6|rAC(Dv!Zj3c#8el39+_n3u@9tNkju4F`_Ne> zv`|3!Ko`*tsJ$nNajQ|ta;2QTmbIj-{2jJ zG6#I_;^cy`zJe(el=n~h47JpoH@j?fTO;u6sg~HPg)1sE4o?^5KS-w$6Jnsx92M7@ z{@N3&K>3q;7g0>;qd>YAbSt+^JIY_L1Z*q(a7mMi(zGc8zYTzhmAERXqw+G-H`OK_ z6vQ2&=Z2`Em6LI)!L=1P=r^w^!Bx;ZlKZ3uh8cA!-ZMx41*7kA<@#efx}v6j5>YN& zOm$J)ULvLpS3@Pn=xzpIrd2hfRO9c<2>zUi4@^Xna+C`LA$V7wdgV&Pj2BNX0f*cL zr+OXt1p1myycV~K!Cf7OR!wGlTND=S9H=*8YbPJc7@9ft3ac9hMWj&td&tG~@si*;^s7A3z8{C|7Rf|V6F>LbfT7U`# zCC1gFD605!=DWxZXas4vF+k{1UBO153SJ5FcA&E~62-2lhv7F(+&~L&>8&0p?hEGP z@SWU0SF<&{ZxTaI6ve^5_M+VWSu~U0Z+Q9)kDc4Ubf)V$wY?4q6q=(EA?wgYXy@1C z?hoUB_~#%E9!A3$hKWQ|!;2f2OmM^`4WhvAHg%6fvom^J(1eJ9c2$`BqjqBt_{t6{ z-SQL$eh~v7^(NV`@F*bN3mq$(gm!` zt2}aygwQtm*j6Lu#5)@#=Pb+f*#bK)7MMgs8DGju!O6+VmX|JFIumNAB9UR@+e>Cq zla~G4jz0l?iLi_C7f}zRrwL4@R%*R)f>OwgIz&j$a{A==e-$(gD2A4JB8pn{tEPOW z_MkDUGH`60jBbPO*~{;R?%p(=QCAnV~n`9?(vHWisaiicJ+2LS0cwnHW2lc>| z@ig{MP&t%SuhFe~*YGznnM}boYo2{+jq06<-99iqIAib|IHY}u^kmmi)H){wlK~h& znXiOFJ(-3awrY7WQyJj40{-oBY?VYstj8E`g1`yD*HnT0Ulae2EN>-uNeI#z-tE25BI*(;;TqKE zQ&;_sZ76OpNS*&nqdCun-pG;$GDc5h+kBWt>iQWR0j3g6?u&61+^{m^tMRS&LHcg1 zt-q6lny}BFNqt|a{p3Ee@UMN7H(riEru+biA!{;#dTKg_IWGXTH0paNCSGP9YKL8VubwjeO+AJ#04H z3zH%G#rm?ipWq5OZB3pg4G0mmfe2;9fH~WDeL%7wx0h9`qXgRMgfPUbFDx02wLD(T zjtIvbRbDw`)xwBd@!n|Z%%x0{>=NA6hHa6`8bOVGS5&#J`^%=`<41{~fA0*(Xy9dL z3>)kV3ymt3oU_Tl&klI+>GjEli+3zmpQr^u86CwpaH$=&Z*LFTpRhE$e@&nd;xG2~ zYo!iRqqt81He46okd^!W%5eVDf^l=y!~4&0Oc7pm<`Qg? 
zCB#dvM2$3UV|{j$u3^2|<0{V6szMz7OA#K=-ZkW!#%+gX{bsu2Cjwx%s?NM8PUlP% zpG1T5Y4LxRfD|O#WZ=|7w?_r6UbE&Tv&!GN?8}$#y35{I#NdWUonSSQ;WzR2J|7vc zTs5AY3e_0;?Y=!+BiB^n8UACWp>3*en}F|tNi)Tq+e72Pr5Sfg#v%EvhA+v}`m;}N zJ$w*Wfo(eGxb(~o+$pZVc-*!K{4z0gFzNhu_|${qxXJ1Ji}7-u2{?F3W68FJA;v|1 zq3e~$cvD12b|)jf36KH*lPEN3Xne`9jXpvo;!zVXw_<;ReJeL97RkoVh;i^P;vUHC z65{U5EO}M^j>eHbB0Fko2O{EkFYI#Rv~5CRIqoaT1SfA zMc~gCT>Nd(SKCkhS~KrcL6}#bE9B}|&0s7~*P@Qp4amHUAtgN>i8ZBHkuUz^=2OVz9tkbXSogyd-`1@TA*wUF+3ed?J>h}ps!mjm-lilU z%hce=KaBQTcKzHP4w&jL{v!r2&4HxX7uO zJA$`#Jvc0^k;aT}<)azxbIE_Kl65)5rqprULvI18y&dI|!ats#-pk?TAtGf>ZI+g$ zpj9ut&1><#AH926Zsn`=9BGG>Vt4z;-pSn}bH(l4!iC2%g2QcFTUzHK%ERNgFiOmpsdEr4@I{jfwHs>M9CUZ#pU+P=5FNi10m3rT=<6-F4*3 z?|JfhXR|j8EHpe5LiHf(BK1E4}`AG{qF15mo zNb1)mE$ndHUh_zCQ?Y=3+cjp~HOyMu62ynu!QB=S z*SHpxw(v#7Vv@O9J-1XreMtEM)4jGyI2m%I7p-))dw0gSipt(RL^4Lf*BH}O`t427 zqc7LvBgc)saD1@W-?Ai{_Hsp}R%x~j#B|(l1|L}Q_ZqG9gIpmzL`bDXY2Ld^5Pe)Y zOS~a?FK6yeCl%ZFRW7MFvj|somr}ssgP7kX;fJ{|6|M6x{Fi3-ZIn}AC>=OfX zmxCCpYOD~U#-O-9X2`mClLd`uSrFnZ?&M}JYE5+pOldj-Vk3S~O=nK;y#&Iue%{tG z_j|a<6AG`@`Q5uE1NQAW4ZYHMhuhI51XEzWPvq*C49D+p2HO@}zk2&ApuFj-M=z$r zG%36jepHMET_x)SP;t5G%Bjf$e$XbTpYDP6u@R7^ch?8GKbZnS)O)t_;;RDyuP)xo zh)a$k?3M6>gBc`A@VpZFzLw7R2*OG@d&G(D?XS{1NLsXV>&G3wr(8O<%gavTsFK*I z32s9_ZS*2bGnBUVHk*^}Z+}n$2!_)EpQ|d{VnzkY6tf901~$=>Hgpo(HqHi*+=_c5 z&b>?6aqh3xRZr2wPaA|7A~RI&TLh>w|J;Y__KxBzC7)`~OV zmIT3{4cr?)-mTrJaFpJCLD=|Tz>loHnk%H3aEl~+o|Y68f>bm7$5EWa7@mDHueSx4 zyZgO=GPMok;t99V;DR5DIE|NM9GZVR31oy6ja+;}{*kceUk1noxG4dpj1Md|HFnQ( zTeR}mV?d$HAGZXTFOk1EIW+2n&jJ#W(mIhn+t>Q{T4XB0X`nl8H}9NfTI0{!LUe$h z^XA+_9>kvlhjL9JPqUunF5Zyf7qfTOs{K>>ih(7PQEb|}&)xLzCS^DM8Y2#ew+{vv*f54e=F}Pb7wS=l~7Y)wZ(k?xC*Mtm=^kTI* zuzoff8e6b0pjwTOg@BR5iS{_sJZM?Kx>+$RHDSA2VY?8MS{0E2|5q{7JTw$X;DW0W z8DJh`UO63HQ|rB%AOG3o2yu3B;&n*OQtJNYF8ky6GQ6#CbhH3@>a{T=wH4~hBUoDksqVWB?fJLbbX)=2;KG>fhC zsWY5EpWI~l)4O8p1aF}k`XBD+2c%bDE_|U^@D0iL_T-q3Zkkan(8A+yB~&UN{TLw4 zaj@uQuGALFL~VO5ut+Q!g;af1~nu9hvTXDg!fZhifDPl9r>gnQ6 
z6a`NH9sL0}z{y^>np>(c0ONEX2~s&YIBd6DxD*l8U_{-fHQS(qS(A$!`=EwgC3y6J zR&dSmddssS&9+@9b>MdJe^W5lcTft^0FCr#LjNY`_+Lnxz>u*|_nmr(t1%%S zA?`S^PjR;3f^a-X*s^7}wc@Xor~Q5g<>g6k`cZq0AjL-VM1s*h-^^#HD!k_eq)o7n z7x(Tvi#tJee1e!b(25{!rmdpF? z9TzLfsN#g#7Y^IOU*eHXOguPQ-zkT1)D4_1uoOcvkuNa4WH}t z@Z&{glUDhhRhAQu!+|F_=m|S+RNa&CR+wY#Wysi8%hV6wz)6DH5$D)uH)n_-0SA@~ z*M~RZI>Atb8-Aalru6U9qcJcF+!9PCTnY4}uMnOz3uY7bQChr4>Kx_ooZmrR5w50$ zUay2rgs4nvFRo4kEOy zQdXK_ubz0AHSB&Qjr!?Z>XhiY51y)MnZaB4Iw#vK4oa5LGlSGs09g&*Evz4qQ^)f)hp(1`#qV z7SD>V2*0Gs0~_j`iF0DB>i$5S=&{A-8Vt|M18i4mcyN!kU7z95M$hAQ;--#jgafOJ2$ z4aIpHw*z_Zf}4+-e{yExXM%fR=OEllWQ8|BdW^HQwEX(v!-wbBek_J+OiOe%rvJu{ zudahSeN9<72Zxa%t2+SRe(UUOjBI6>lU{R$kiBKrS-q#cc~(-6IV0Ei?RA*c*g2QG zJI9qen^JWg8s)y#67d~a_4{2@5poEk_2fRpcp$kV3Ckt3?dB+fCY>TZ;Pqzt`KuNz zY+0+Z4fuPKehEjuwfO0-_|*6LfyfyFch>b@rsbV2*HYER{#mzXRZ2H$VYY?-P<;x= zTRB!~hj((O#JdAYqjl`YD^iT>GI;TH(e95!ZzJnGJ4Yfl#q#}MHw#C(Llm5^16$<1 zUu5x|(X9h;_=fuBOk{ewPMe%Uy1EiIy{N8jYHBvOa6LC2dqNx&uY58NbH7K4){c61 zXj`#QR*h5p(hbkn7zz@Gox6*VzFU}F6Zf*ycC5&+kbirxwOS~7H17x|T>@jwldT6@ z<&58UHuaK8LhuUn>P(dTd~#SH^1DwDO5iUpf3uOafD6uk@G7IzJyqhn`h#y}4{ZyI zO!MCR=IruxvrI?2-CACwW8cc{YxvmvJKtN~k+yjtn|Dt)v)%ES z{!o|Rr9oBa?~s}Mgp2QHVJ%a)tm|M=AIQA-ezfgQ{IWwC55Kd<_CyKubeG|&)ZMRE z#O_`7`{-fXn+?!OgZIN-Fx(WDRY}g}v69LF`x48l%^J8s$p^^Unmkn>+2ZB>diX1b zz^4TLDR2swwCVr`RL;MW_Ezt3b~YqCoe5;EzRyK5ErC8PymP( zChnKzNf2_XbEH<^$D}cP-@3{NByK&;9k$N->eC)}a-RS3S7wWsEL-ZnIg=F?$6fw{XrehB zWmzcsS1l5P$1ym!o_@Bc%|D+D08W29|6`FewI{Wqd7LpgSBl-unu4<}4Pl_-SlGh# zPBwap9byYu;wcRBuI}u`rgC|KQeT}D7`aYT6~bcaJtY#o7=7Z7Ay#wPyK_Yc zR}wX}+?|+iOIgw*GG7-p>Ny%#?pD__4!v5MC%4M3@`4AV*Y@r6n{b9n3Y^!8KfJ!j z$6wA7swY!jqRcA9d791b`@Rbtb#>}txgYJV*!)w2u%G>W?N(7rWDXDFXFWQVUrt&KsTqr`0Ozt};9*#*AUFET7o5TTA$QdVb z4Jq!eZnW&35|!zIg*yVPU-b9dq^(F1VL;lBlM}T7mcWWX4mdx z*lGoQja(8l>3qN8lap@D(1#Cs@efxXSp29L!3aQ}y}uMUf{Tl=+;5LHlK?#xWW`F_dfdK~? 
zayZZUzI&fgPS>L54lZLq$q=CmD)6&ZuG@EzkO_wg=yRlVm>e8ZrdvtW9lBu=_XdHpV06Q>0{AvF` zg8%UcC0fQjHHwfw`KlOCn=w1`#)QK%K)MTGVpbiys^`v@1dJJ^|MbGZEzAV$ zJz1Uv?0w8VzOhx{&K@&qU?ox{CrRgYF}TRxM0*;CE++3Kdf)6I}#Wd>qer zhEo;6OTq+$##45U|wy)gbd(1SIDGo6^_=jleB?e)IDQYB@wpr zeV-l(QTmz0t?SFrrvM%wely&DH(|(mYpVMv;^AIUVLla)wKc~j6ZOx>tF2CRnfm;4;rS?m#y#Xf1Y@W``&q*Ve*koVvBv_PlF+g^ z(jK+Ot2bNsLp{2Hrx%3!1j-xJI5@4e-LLPK`Zc!gme)zSQ5|?zw(b@l?W!p;iMR^{ zY&nEA3H@R7UiVaGJ8C%UBY8@53@M)5155I#7InAoKmLmU4Xs}T1d8LXy7cTbRB9_x zSY`$|jrQ%bN5FBE6ZB$x&WyO|_$^E2Q}K=^2jej$vKnZ}{kw4y`~kq>tVcbo*9`ie zGaAOocxC?fZ5XAi!o2s*=|0u=!gdyRI+dG>cC{0v*;Hm`&ENU=fP3ufE@js>7#8Xp za5+`zXu8f^7K|Fcc1;5ef6VaPrs;Qz**zcrE;!JGJdQxm)!eYquBS z`r-Q`hZt8Go+b+=5-mXK4w;vVG@5p7{iSf3-fMVximaMnLG6Q={EYg{)`5lE=Q{bn zwuAihS*pYX3sq5{Oh6}R@`gOUlv8m2m(G;HGEUc(!TB-(2N|>dg}?M2bPcf^!zeu- zm38bYMtsUXdwGipA?bGaSf%v$y*Y&pk~`Y%xpaB$5$IIf zi1W;qV<<}{hS9|e0w4TJos_Fol2n6Z*`?#O+eMxubI14L-%ewz@>&yOnLynC`dC8% z1SYi%#$>6|Z!jRd_PLra(OBYJq``W8*uG!^xq*2pqlTQlTeAY9igMMYh@EYZy9?ZQ|$Q?fzQ{Hr|4g-($JLKJW7 z-FiLr4aL~>0&cI4rpO(h;_M`rY0vF=E8i?{cZW5uv{o9wEqLd7`*%NFK&rmxz}h!I zC2ifWJSrhjvf4fKShTCF?kP+U?Zn|%iLNd9&G1)}Ba;PBgC(joNy9rYyC)Xa%+i-n zck+9{xZ+RS{PJ<~BYUWqZOV?2(E^Wc5oe)UR8@AfCb~czE`%-sH2J3C(L#MAY9tv0 z-Hb_OCDgW(l-==oY*}$BBM>zObN0~~rjOc;f&JWFkNIDJkc>z3-LvH`hWzV@{Hrni z^}xOWtJG&)>7ORW=IrN!oRyUC@c0_9RSA6Im^*U6Tpeg zo_T|eYL}PYv-?%lNxCirAlC1HhwFjObcX;k5TWogo6%SS9kxIet9%S*K&Chjov=<$ z_g4!r-Uu-zF&sEc6-A*#o#MRx5L?gE%Jr6a=uiD@;B1maO?|Fq=M^t|=Tt*yuf__#~n}*WhzpklfwSv`W=^1J7n8ez$mT0K@9{{T)wRnlnD2Y<4=N z=Rn%fx{16>?@ZiGHUKS0`d-1>UC-AIeP$>@zDQs>hF%@&)hw>UI9gwzK$+sI8Y9m6 zIPi20V#I}OO2eI5^F;78_^*#<^}-sweKy=C{@xG$GyL24I*Sk%$M%|a2gIWPmD9EV z)eI~5#z*W|0YB`wc>tH==!<w54jxLfVtOKIEiXVFIC2#|I6z77ufI&IK(>nPARSFpHv8*)B^sK z3GCct92rGXc(yOMzTbWYq<_wc`p|bRurLT(53~U|^_BLH-;iFKI4JJEc*pyf&Co^L z9vv94G=H z=xAfVlH4Q1D$6Lt*QLkRt>=!i*p=P+VjoFZR8>pi!IBO@L7aZcNswLg&*vQ2*4+7v z`Q&umn;^lm>)fd0rKhJOh57o0YPS?f%VVgMk902R@ zGq6>?gEhOe(vC! 
z0QS6Z2v1XQyETP$J3+<*s7Bkbe;1O=a8OZK?^i-_%nij2&#&Q%6;4gJz|Mn`UHEgMFCiL;T(XZwE9;5-_g|H|%>ogwV3y+s;_DGDf>UB5!_ zSPv!dCK(lPE4kaT(>JXQlo5nr$$oBfkA}YBo%Xg`Hgj8YU92Eo!}j=&(CpG}+L8nI zRhHX3=;1}8)?*aDRf7$!$HQ9th#}#z{=5p>UYqm#f7S)?XYcq1q9d=jkB~i&%UZu= z14DlP{pn0uW7+KcF!~q_1PRtm+Rc^a68Atc5L> zftnH-m5?Z}@5W+Vosy@gH(t;3(adN9ruuL9)IW#-qn9WgxV9*AglfVPzK*@caP$W6laUpXW@hVD(=5B(-ave`j3`47+v`ZBV`8|R6E(VD zwjB`X<;>K|bAaoeHy)p$rUF7SU9<6`VmpIi<@$22pOT&6AkQWaJDN!7%T=-Q5&S;9b7_|0cs{!=a`NDr9h(St1 z&#$iw)0r#%%T8ypNhp2n)IjqAKWQP53wT)!k%k#E7T1^NM*;w$@HQRQV#86$R-0!B5Ipe_Vvk>GOk4HVXLzUOwh|kaQ~=B z?iT7eCQj}TG&e{FOJ(?}6wLGftHiC&_EmplL`74jV-3#9Wq)^w@mYotM5}3qEHBwG z{lc#8_=j;e0ORz^+YWcIhY!4!00_`CkZdS?c%mBaG%C=wP+3p_TFGAVG(Crw>uAh* zKqlnEP$bN*txNpItr>aP)-&eVDI`P0oPG*zn^J=8DErc}HvEbkA)6HZt9F2sY7PM? zSUp72sp4*)Jeu-^U(YhL^kAKLey=VY2T+Qu=G`(Q2znBYZfq8^osE2V3y#{tsIcZ% zoaIm)P|v+xa6fXfyms;}ZRx`fe*$LoADKM%2kQ?`*(D&s)sQsXM)O_rgC?UstSE$$ zo2qstj*;Oc@uT=ZCS5<~-_|~*2bg)UFL5#kpsd*Su&x-hEUN#BSpKEd$47w44EAl^ z1?v^~kW_eIPL~)jCNS-SCCRlwCkvK;h5!_cs`-2!r9<6ZjpZ|wsh)m<2b|TlmQ4Wr z-l~8OK2$m?%Yj4fH!7r9arPwSvBWVF5fM?QqcI(}Y%lbSxA4?kGYpAZog>72Lp*NDP>b3&wtXJ z7j^D`#O5|4s-fIAQ9?D!~+ zfzA=TQl6q(37oOy)m>%e@1V?ZrdRlXw{|+FXVLcH%j>E7H7=T1__b) zLa-qyCZ(H{p631!c`R~Y=-{(-w}YO&#$1|_=W(CY5-fn)-xUGQCt{+`lIW^**3@pYHjKb`yA|9M^Y?GGoKPwrHA59|`UIVnU!^(*LfPh1^OblwjHowx(npUncg1Fn6CyWj;+_634e3~>#ID94KNU-E2A$BGM+ zeZ7JcCKvk>IsomQqZ#7)I3qQgfFxv zUt&AS$3X(te)NM#Yk09{;ZJ1O?en+b6r;VQzk@TvR<=pN9>Bmx01-4quk`*)$^VlE z0wSRRDwqi0E%a~%Aj98bhysuM_WBC}2L4`xyH~3mZ6BjzMcD0X4qJ(|o}!)Qaphr| zG>EE>AEbv-;5ULjH*{P39IjAC%>2o>`=(7*(@y;Z@jcT3<37FIpS!7bQ6qk5@!3yV zm`UGt$=)*ZVqTz*q-jSbarysc9sEn{!AEViyEF4y1C2o;g|i6Qw4G?H{geV2S(@Y-z@@*C zq16keg@d_wimau@N1HIes=^~x-s1rO-|>)d1|Ig8PnIP_DL{h1f425Vmxe<-He!sM z*qRhhWu^4gMxQV`p!K#kR8`i zOl)HIPkjA~pHz+c-sgs3zo|F6xuR>^=W(e`UZ{-0&J`0piZ{=m1u5iF{} zISBzp)=Rr?u-*&bPIU(l)#n|>d0Cdefw1d!5=b+AjAO|4vfkQwj;%3#gd`+3Z_r!y zd}p92I|dAMrA9gluG3RTfW?)1S`aJb;@qwl8HH6=btA9HL;O4znd~|bTWROwkpbqJ 
z$R6s@c#e*GsCb=r;IqSAnD@y93T?11iMC!%NZfxTj#3yqLC1ZNyt1koK1DZ)F_FDu z?L;LLoZ`P9={Rv<$e09mqu}{ksRT~|(v%t-Jzz&#(O_fTz#~LC(@Cn2t16X`r5r#x zikTkA-vO+|BV&}oc{{*pIsy;ch{c6@&9#{{90b3buq?97@?pQ;gazO#i=IyJ+{)1Y z*r}4}L|&G9&_{^*-_D!%B<)vdx$n@AiSi61PpgI++u$FuyU7i&7E3p*{%Qk@&;LQ# zW_E!);{Mt=jb!_byiHrL{`ah(5``NB!)DRsD+#Cnm{k1cZ%LrssciaAGaC6btb=0z z?8GDSD26d{G#{d6K-2~kr#_bybbxiF6USVkNXdO=aiOzo)gB&P2~ppiAZFmooJo1( z`1`o!=lHN?p0P)$mdqz)mV5lVgm4A$!}=<++{Rx*=wUua@`8UrUtLiaSy`)L5qK); zPFgUK04DLYHZ$WL{v#zwjAT<&`TqKH^o>3?c-zR6w%xjqsAsS$z;DOEJaHmhZ>4jP zx#aj9Wv}%yW<51P0L__<%J*dAXdF-?3X-(x+IQWnu*=Hp-+68?tq?b^x=mHD{~ju` z029Zxf76k;cq4e67safUAOl?H*$uC3=cX{qEj>etj}8)u zkDtSW1_Tc{WHe4o#ejYK!Pls>I}K4eIr4x1F<}C!j^`h2en#G#In2m0+{AV}Owv=xx(57LpWg9e>?PbwiF2z?5Zdwr@zFgk6p=``OTWyBcIxXV*tGIvqzy7kd z9iGOx2!KpVwQ<@q_YG~kgO6@DVz5~%s-A@41Trun#@!a|!+s9Bhi>bs$=U?!Aq1wZah|m4wkHjE>WGu zcaO!0NyLMNWq`Fc{0cjyy61_UI-k(?Px>Cf$mDitLur^3AAT--!KF^dR#CPe#mRTv z(9L9>!o6lPx1?`J0knzX5#+P1?DLchVrY=ZQa zMEr`&*#d-bh<)jV9{jh|sSR4L^1GZe9zHFa)cs(r8sd}1yQ(U?ND0Ld-%!vV1 z8eE@BNed4Ks@h%V55-`l6*9Mrn@Ld(?WjFnW%rVN4lmA{G%%x|m;7nX^=dSw#U#M3 z?fhumczMY!P5K_-quJn(&rkym2?%|j*Ny?$%GWor zeB+)SDmjiVxc~#Y9b#%IcPmyb;T9GwDMf7 zza3lgl9Y3HTe_sld&7`w_mNX5#g6-4JWmRR457}wz~G-WGWaxcc=&X0%z zj(-qhf%=N(&7Iph={G7?4^i*@=a)BzWN}d#@gmtvN%`V_C2wePmg@f+PMrN9BI$3K_VL%OxgE^5g|FFVjg!0Rz#XL@@DqtsyBufmr8=41s6LmwBBBx zb+K09I~8>;`%Fh0^AJbZ`A2@`RFm0u0%^u7Ay=!Z??I(%I6DjR%jCS(%vgp!MpcH- zL5L-%wGVU|U!n&(eAJ3s-6gD$a~u8Ym0BF}O!JA`y?L?;%(7TBwDV|@A#Hh81N{tZ zJu9qqPAqD#8trC4tUs2Vp-Rjp=QT|FO&Y)j))7y0zgs6YHTZ2jVNwfvYn~f#v(0^>4*Qvu;yuE?4|Bvv^p) z=Pl{!D>_NwK11JmEUm7-(5F#ng^xw-U-8jNsWnr#n4%vD`)Bxzy1v>Sz0y79==!QT zQ>M0w|D3f`PT~1lM_?aJ00z$QAIoB*=6{jFh-*HSqG~+~(_(#Ct-;AiyrlI)H(`*N zS?2lgZGq`bF1sm)v|5vpJvYzj$)Y!=WxI>M83K;0%=gqlGzvQi)pTX5`ycOMeE2}{ zwf9n^-{Ru^6PYQ@zWiFZPG;Vaq0b9{5d2lnnkOyYA8{SVy2v^ZyBtC-~(-# zXjPtn@9ti6>?%klQ!3RxC4+zluT#LsQ6#&gMgdF+8Q(y1Z06q4&zE>C9x=IM!+I{=1bL(PX@xa=7VY%GwD%4w-bupP4X5N;L~|50=zAp$qOj zlY35x3HskgtqF~k;&xUm%>$(gFJ6{C&-68qTEl((2pflQsY1(wBKtArS3QUxQ| 
z#d)~cZt+#dtng7?S5Q}%Q_16avmcmTsHANJK&}7Cm!&UXu+cTqp zi3%UY&SZt2+up&v#CP1sJON8cPx)9?c6s-1UQ?Q3F%e^O^l z@5^E}aSdIw1q~@nOq@#GsxMnzm~uJ?3~}Fz>eMlhjtSM7sRrj>i+q$|hgd0%`-;Kq zVlo>Xk9?^`RrD)-cpb{ylW8UIJdpghfKiCPL$-4xMQFM z>HUW)|k!QL6dsQ=bkO6GhW_FV7 zA8-_L7|ntO9MHu@hd>hOIteT%C8xyXkUWJ{#SF$Xo{&C@8VY zUTaSxtNsb2;5s(CQn#g3dFPj@QW?tP$T%!)8waJ1oAdYL)LZptRH6*niph_t9W?rdiMvA`7o-ANwDvN$Wrd@iMNew zUKpVh(9svb=v~n%XKV85Y6a-dk*P#Pve-=yd~}%>F^>1|5jux8^sK<%$SrTn*q*`K zJf=P8IdjM3WhMGaSZR=RWL%~rWU}=Y#L#P@fgb6(@peN55}7{V0>8`@8SJ{gd)FVG zTkErB=^g(qct7D}X|>nM36lmTr=)8D#B!mt#npX8xy%{1I~77LvWEM-I=r*u`?|4z zQt)HayyhGR`LyY3p@C=XH%@t>Dli3^;E@Nr+^2i?Jup93o6YT;g)*&Ekk%snvRW`( zHUX+)J9-tbb3c6|VwF_$a8Wj}Of$7Tz*8cUOv3O<5Z#};OvMNn|JdgmqB54MN#TlW zp!`6cl`B5I`0o|@995B zdH-Fn&U3xI`CuK}etx1fx2me|o+dH8wpdF!TwnX-*LoxLdSIQ{Hi8U9CGu(ozJ-Zk zX=uG5Gl6)vEjBQj|NiA&WLfKHh(jw3lOI|S=f5~|ANwY`81VaRpJ}#Uqk;yFF|&}7 zr71BRe&o{=V$C97uRZOen00-tvd=`R`z4lsl6N$!KfZ{f4DeRSWF~yDTjHKnR75Ap zDX`M+Cx72!ByVS~MW7NA`>sk9?*8`_y4pfxqGqWWcCQERK6l^)_3$fi7}1g*|iKiVPtZ4)!!^ z_m=Hi?Nq#e`AA+C!5zPrmKXH#MN=r}?KaTs$s;`0$oFRl9x>U`i%`vVy6>r%@>zZ6 zEXsOl^SIU9C1nZpeQPALu3R@myM4ZNw0mooZLdF1ty4>mP1o$qC%c4e-aHn(&?aDK zRb$#-avdPy^GYO!s-S-?JKlCQL;aaDRUbr|p;@`=n`GbaI~KFUm#N3&4^f>B8gVDs z`>dX<7!7EKqyQA$epD7N^Z^*(|M4|X3Dg+W;Y~(oPP-TU(B~F;(?9#}mVS9HXf!dB z#KZexl7fnxipOROmwJUdlYHnnl!YZl_t$oj;0w|xtO^u$o;7{zU*~myn+?6{NaI!Q z#om2a#GfZ)9MfYt_Rc3DUjxFgkIc8z zs?c#YGv_{o4y;u6X9rgoC?gFFhTSnDL@-2*DwEv8jlWn+ntd%K|Js7BRwado1wZCe z7rZ*l4zSuAOziEJqW_5I!I?DH#>22lZz!ucoy6B1tuG zL&yb+)g(U{kF_+1*ry1pIy+2~&*Jaa90i8Rl)NkaT3_-`b&!zhg+?S{u(A~IVeBGvkEBYr(-R+T&ctE%e;VaBj?cyIF`BLZkTp4cM&RC=MYGn#r)XDv; zHV|#W;nj-iR)dj5rP6SByNa7(U3iBVjOtiq3b#l}#};EVAHRJL-cKIa+WeIJaockQ z4%X$1Ibe+cgr__5t_tb2gRQ$CZ1&J|f2ZDA^O;Ed=#nb@S3!MxmUHSH{_IU0w99$p z6&roOG&>6JIavVsPYAwe;_#o@2=FfEt2MKs|<7xJOF zO|5zp9_Ww9*Vf)E4*TXD_j|z~xte?4oUv6bHN|e(&(nxQz#A}?n&*w+GeMphWy@Y9@>Tbo(!~dmgB`w zPA!(s!+y5|W|{AFA++!<>sC*!cnq?0(2nGNy@p!e=*jH9Pplx5mo*a;%`WC?vsKyb zmaXS}>ePZN8lr@Ao1DIIi?pv0&wNVu)|LW*}~*SExc; 
zs!{l9J!|0cOBB*s?qnrukHj)bekY&NJ%3T-a=fm-10N$I;RKmA6yA}y7mx8)UWAJ? zwu07g4QAgYWfxuRH)w{h5S8;)KKEL%c6oJ+OpuU!Y)qDFX}G#IVlA^h+O5xxLw0`t zS|pk3F^FK3q5jt~dlLV^r#nl$fQ?o8kLFv!Vi; zl_hT32eSW+&wzI`Vt^i})btf3L5p$9sz*0**R3MRFDd%k7-pQCOIScFx-kpY78shE z`e{t`ee#T6&I;+V`i8nTk?C89zaQCjA&&XQiCXGb>teNMv34?hsb89UNnUd$?KGwp zrA32)gKxOU?6*L58ZCBXYuqf++*&c8(#HLj;wRCtmCU4rlnGm}cKPdKwbvomwqxFLubEa6Men@=p<&!y|#B$5N{;7C@NCnrsY+L&eO!*~HTCJFoW7V2s(H z@n-7LrKJds8?xq|*6w)O`hM#xl17)M*^29J-spn99-)q_mPx3vj5%Hc*^htFKDRi8 zf21_hnJy>Rt&EbEma@7*-dH3Cyk8L=OlM2i@teT>GXVm|yhw_I_$= zN>jx}&A@q%JYQLPP zWB-;#81tBB1)(tpw%n+GiogIfu9~bPW!T!9Aq_Za$$NF>A-Gj7J`7_8;@|tPU?vyAOShj&ICI z-V5yRmTBp^Vnn&Zc_K0`y8wEly;jIrUr9PjAnIBDJH}k*f#bX+&odpHh=O_>%%%D_ zYS-dRQ}Md9#m2pEZrk&qbI#LjA+-i0Hx)dh(Rev#j;{UaN4UwCl2RR5MB!)~y0B;5 zL7hIgS@T3r%W==SM=BOW4NQ}xDXf#|FR24=3NQj^KCZ&frLm7qz1(=C4V%ZIUXTq= zi+pK0V2+AB1GEuJcC5GefRU<=8mZ99T&Mr6rvUHzFaW#&k}+8FX~PrW2umPhydLE+g!SvdStcw!rfDXhvoPwC?K z;7uHx>8tpW1@N%Xil`#j4}ZxSx2EzsTmyX4j6tlqyO10lqBs8mb}`r& zrB-qeEy07vpZbzlk*uz+wue##T++8&AkQSdSZ}c%dvN~y)RuW0Li;>sONa{NG5?VD z)e^P8%JRx|wjlD11lr^8Hz>JFo45ZakZTzP^n?mp@qaN;RTp z;sL6QW<&#ha z-}K2Rx`JY3KEh(4(VnoyG%SX+Sm4xZGWc59z4o;XBYvv8<8=5B517O2j?pwTc&_v> zA)m&PqJH!v`1a427j>3JzCtc4hDZS)5Oa!1Q*KOD^v(4Sbf(N=%+apNVLD%xzAohB zF?y#PoMDENTc(tXs?BoaM|UHgYL#JA)%pF$*f<5n9X=uMz5Hv^iwoRlImwPIDQv78 zJqyr#L+k7dDYYhOHB&Jy-=)}yW%o#@rnU9_FC!j~rkOjCMyh4|++3>Hxmmxx8epXU z<#qH6^TzX%SC(4BA*tHT`xZIfsL2Lyh^VYN$JA-&&oP$=L>R2&Q#=8~4lilB(d;6w zC}N!vKFzI#0lpf%xV{f9;#Uekd;KP#T|%arWmTa?@?^)8GSEdm3*<6K|Jp0CFBkiRcfu z;dR{}bzaCGE+_D~TJ(hap8bAKw43~t;K6^FwUL27=&C400%l&|0nOe5;{>u+LWo4!&%27W+ zHI7`Y3!?);$2VJf8kDK`RElEUynJzZk4_V*f^?K%b1?cq!|S`|GG%4`(fQ2J-@cw3 z#EH$4P`2j_#;+MTUm)c&c~Bsqsmpx(gF0Z^%}kWjk!o`!Ns)ODK(IuG-XCS#u4_J= zcgu+hvD&xU^*<&9q=8CaR)723)47D{K3OdZdh);fX3Kuhm8NWjB~}+>^@eWbuYnQD zx24&ye=;P>6g*iiW5rd)e(3^Q;wJG78qBmV#0SUy56|;Ce3Ucr8e{FhAtp+l 
zH(ta)uAvekuPvTZ?{)uvv%MNn(m!32Cgr;;6;W)ODa>1-Of6O=NgrVlUG)J~HF)pf0GT`qulo_NKp6**N&9`WFQHc8_X&$U7>IGa5K+U#V(KtnQ1dQ}|BZGc?fp;1{npptvvlsLM3#T=JDlCjT!vqvEUd#Ue^(5wd9!K^C zyZ~3G?q`u2xV5`|`J2ASYhoNboJYJ0cb>ZdCoGa0I*QLzuD(>ZScj_g$Ake=X~|*O zG2AYVJLGM^afpKVBo8&{&B0uI%Sk7Ti|sW(%?E82MsnY^&cImQ8~^D;OJDwn$Tjf!>p$eN5NG z`<*M>K0fh!sodV7msM=ovzCl`8Z<7hSO-Jtx z8uK3(?3PLFe0t%`JP#L9!}kLlib`V-J_TqsO${(`Icmyzw=n;UZ%p%E4Q1&O3OuBz zTzeNm%;&1zZg>CZQGVS)x{`-mB>QH1oDl z?yCvQd)L{jqEuDY)=*Q(?Ah)Vj{A+NF;knVxaYNYUDEE&mD-YS!`FRt$K5l zP`9}XW9i)tXY$$OGvdfBI=Pr>ca?NX4U|^Y!LC=RRPYWB3uE6B;smIU#%<>~ps_Sr zK(mH&-Q&EFy=Odfc;1_5 z_N8K;)dSuOz@HS1*-f?qN^)E;q$I%nA^9V+ru9_?pOjmG{xiW z#Sg5$+Y zj?qo+-ZTQXI+Y?DWz|ATov2a-{p?HUC1mH$j-O7uNF66d#-fCPbt z^Hj1$;z~i7I_iCyk2eU--s^`+$ zFEMvp>;1^arN(a~N~NFO5WZ@%dg+%YQ1r-pYVoU`Wh>)>dG=HruxIe++%Ssyx3N?3 zf7;V1rm?e>3Hp}EVvYO}omPT!nW4o7O;TuYxn}68g6pJ{*1i)PD{{3Rk+Oz@=D}33&N+{Zl9jJiBfeK zPM}Pga3FW0-VhK_at76I1{_rb6Ax7rq}Zf$_k&#R*vzBHVWIB^e+|eoy;0Qfro3ok zh*H1s=u#*@MHE#<5~-yfN_fR-Usn+Q?N6J%%g6bn_pse%lM^E$N!FdKGFo+P>dWUu zvFmCcRIgk3m_zYjx%M^&qU8R@S!L{lIeR`TDZqQop&!#&LaJjEbL%hD%*FuSVvx@We{(X_t`Oc<& z&!fOt4eN`UBD}ue@5w$bBl4;>&%giuduv9g34B1XJcZ@nFD|p?JEpc5&MTtpEmmmQ z4fC|`_=lbC;sB`8-Oio{DYstv$=-7WH(R{G@l`bbEa%u#dYWcQo)8kQRrN<3 zwx4(ZvMOZOy9K>7p!C^U`sB%d;t6;^HXe!?ul)p4TCTrlqz{0v&H$rB%t!CZG!QuW z&nc}nLe#HCn!6Rh0Vef=O9Q!t;S1kXAtxF?wYoy{`@2v!m)_T}5f~LE)soh|AJOU$ zj*NFnV|SKQFZmrS7YyAb-~C{E+e*Xe(7dsux|a_T?v{!)qNf{n8K@<(>$*N8Zy*!X zEv3kU9iw4~-|}()t@gPWKxe*UYOv|Na~=e2O}>`vY_$cLfaBzBPnb0jV0(AL>TIXi z;M3%CTZ$IdrSrhy%~9?dgTH@w&_dR#ueA8VlI`QZ?yh?u`JD;0dxXBgvs-;rQS!46 z3-#qz&+6=>zawF3(ka3psFsWYx8@f)88fT%&~!;;GcWDN=Kh@eXpi(xAxYCKVw=Va zji|RRt&^6dJj`@2V@RqDnPP-kKnHnsT2)v@efK8)T(?vJp)th1fs8^Z!0<`u?vtke zvHr)&H+qfFfFK)Nsca)p+t->P2VXgv1>|xJVo7`??TLt_&XDn#&f%Lab3mZU;zM37 z@RUw=bxq&2crOc@po{2*yv`sTuVZWxuzFwS@`zgQnas5J!|=$|R+r}?Sj!k2TY^vV zrLrwYnBTHoUm?|sipzfV3*x^kW^9$%#wC@4ejB7^VUI<7*ZKO1OO;W!b~wj|0;vZp ztvcPFxH;B=n6p&fZkqE(aV-&SEQ*UO@?4F@Ghoe6ip62Is|qTV6uNjrdx?QjHPb{^ 
z8Jjd4f<)EK`#-U&kK8|i9`%|PC=z*AmS`_j`szQv<`aN<%HJ3LJsbmwSg{AGVv|fp z?pTdHYec>kr@kx)+~+N=+XL`RrDx|6*!s_e%^0}^fMvjUX+A+W_32$jzF7d zb$XcY{H1QwLzmS3=b+^YgSI%ic!toV?+C{}aX&(mXL{(X@_P60j}YTUoJp@frUV|I zCb6>ov!|PPxdSelGVug^okc*?@x}&ib~?JucvGn&HGez6BZtP)Vefz$I!q~5x;o;6 zC|>+fLu7o#M({0lL1ilXLHw;X+vtn*{Yny%J4W(ga(?wg5BeUR!vSrEv+A*3oor|M zq}K*hji4}#IfXgU0dG+^`&eUZ>RMA3+KrWqQI)ABjzxdU{^UUlT?(ZgB$KWUX%x)rnv?Sz@f*U1zZ30kKlHIlbg##HK-vrkc7p$%CRS?lx=c>(sr1gOg;>0A0%L zsI*as8c&ial921dgb z@d5E8w3>_XzZAv?$#vR0xrXaHwmzvPX)0rF_IhOH&Q9Zod&)b)Np`DBNcm)}q2_jP zzURtG3iG4k59tKNsDi+x^FrUVf^cWgQiL@IRX4-d-VY{=wGHeJ>vUyVypkeZ*(E8Qqjl%LUv)le%;CH z#pZCFtaecWjxhRz@mR~z;zDb~3bGMO-k-l#Wld=EGk*Oa>fSo6%B71NmQq4WBqSsh zkd_h!NkNqE4gqPUyFo%i5D*Zgk(Tb35|EZnmw?jU%{MpTc~0PSp7*cseXr}Bb8+^b zecv-{)~x)kHS{sJGd2T*QVLS%kU5?A^5+RS z`m$IZovcpdV2)(*lrYhd+bzdL944@QA5Limh(+$h!!;cV%^`!L%YWsVL`uLhiR$}^ zAMSfPgeQ!Cq|`x%J#4LE=Evr7Y`!k{ ztwmTRYe&iq``aQFEWHMuX0f33tMaR6^e-dU{^O=VXEp?;7)b-G|?h_w3`n-tllWup^fahWUA8d-ZgvM`D{4$M5qFM)!0J=y@+WT;Z(l5dD^*pz63 zN1cv@7Sq9`s@JT95+-=Nlw)QmV9_*m;6c#BSd7DACs(*MxeCY4rfb2l4<)9!*po{& z$1)AibkT6XZB>IUB->Vp+VZ4=2%RHhZsNzz&@o17BB? 
zs&ehxD?(vREy0zz4;VLZ1O3}ZE5Ch-f#fu7~xXP!e#U?SKCi z7%Hr_2}Gy=1MJ(IjA;LA2V>>JLx{CYxHRm*CDR{7vRF8F&KRnxxr;)2Gp?&3H8+M5 z9nWM?0EZG2nQ46^2>`ZLS)$wbzuph9PLuUXi86b+yEQLwq%9zKtz;K6pw+jqwBac@ zUY~jSfhP%8!K?(KA=vJ|zAghE>iTNKMLomqliT5bo270QK{urv$Ew(kZ+y!pK{py5 znsj(j3#`<%PtPP{KsGk7B0$0J%9S2(gez>rS0R{wk%88e(UcIurBTs(fk*Ni!!n1` zt*xRxhNlSI)}DC2l?lg^GzN4<6yxi0z*)pj*%K|S{S_jeJ<)d9SvrA8Pt{Q=R|SQF#iVt6)gtIhI9 zkS#DQjvT4ru+C6b%-GgmUXB7W^0JSYL5-KHYnPx5ofA5CYyIW$Vfh8p= zQl2YbQxg+ZE99cnw&x)lBfsJzEk6W@u~qgalxFgHN3%uO$8h4294GF5QjcS*>syBvuP1xBEn?xwfPq0550o^5aLb~ zm6+*I4@KHU-td8)TZZ~Gej6`f=()jq3%ho76R4g|34Uz0gI0KD;#}2-J zDfQHPo{zIcp$7fktc9g4n(Kk>6TRi^=;D{856X`~8tJy6HEjnOjOZLnY#m5HvK4Md zGL5)%lRNQpb+v%vV(F_JYZGp}jl3?$(IVChDfTG&I263bTZpcA-PC`^);R4+a}3&& z9HT1^2*7cj98Zk%Cw`yd7ssgyo7`SdO2rUoUYo#?Eh~z?R92Pc>A;PEV7SxCYb!uL@*HI9q~xNQh@Qkgy9^Ub(j z+FxtCp;bSwko{4PMh1Df<(}O^CvTz4iAz+q-ka=4xwWsZ$QESC^Ab)U9qjzT$9ueZ z8;5sym6oe7a-Lc>r)){-p;EFYZ~4Y|cOyRtYEhcABJ4VY(Cd_|LI>&*v{={N%dRda zfI2&-uIeg1nx2ELTX>Wc!nnUe(7BZjRxa z16js$qltsa#z$2j?yLq+Z2@4%$>h(PJPFdiukXM_A2i<%rAD*}n zbkb4h_rAhb{N5W^)3^#7aqOG|oZzu%_}n&c=@m=S^x@SZPu!3e8RW>w$oA~?;!{22<{4TTYB>0a;uE8Kc91V}}jGI|MhIgl+> zdc(nZe488=s<{o|4BgGEW?DY1*w~wfQEYWKiI<)3hm5-I=H`amwaeYis`nG_j7u$^ z;wit*^CW49FU^3F#Z|Ld=gJ%9N~MDIp<1mQ5nf9@DNsF;uE`xnrn@aZ`wSZ zCR)dSEn0sTJ2vQZ)@>&g31>@(jkFnFFSVd-$G?AjOsS+J8urauuBWO!29@>G7@ynw*YEH9#w!FLFLFR+BXaeEDDGs+aA_%H= z+q3-fcG@)4n@OWafp}Y>`amM$Gp~trHptMY1!3amCEzT9sN}ag2361H2o>UmcTAG3 z`#Y%t*OrAl%p||)YGHzs6@bz!EKf~^u}8jjY#wOzr?F_lXdmt!ax5y^LG8YRV0ZY^ z-MyjHyy*K(tyvhRt3huk>Bx6XQ>9ylh%ygjA9R$B2i$nXt66D~uZCyDM8qy~KdiOH zw5Oj?;;RbEZe%@aZ})}^#jPheWu4~Yhhp@MyI3BsK{sI_(Lj!1NwjL2(AXSYH(sYx zZj-Db9%X=W%V8yGT)ACci7n!uP7`hRR!X9V7iyu|z+Gw7Ly}YG3wL{xHZwUXv#`Il zFr?X?KRlRl7I|aWe3E-kr$_40)b@|LGv*yTm~-X~^mu#s1p7Ptf1Va@4pFog?3y}OwK(qj_A#Hg0> z9=!-IQic9#xoaGRtcM`d4OJ3sDFJTeCKxt1qZWP&^@@aDOIA3lnEieP5XJARMCb&U z`9~C|yUGVc$r9a~pCnu|Y}%ZYW7Fgj^_E_$c*q&`m`UwU8#gH37(D4s43EH92cGl$ zCV>1S?8>JsyPbE`oup_wCN&`uBWHZL?$2bf@s6hMK`xe8%-dxir!?dm3+HWRk2YbY 
zG!cZcTYbyJ!XS&pvEb~{BeBpQG&DHCABHqVKQMLDhB9W0he`B!aU>bH!$R^)NB7O` z2MlBR0;YJrk2|BEwuDl$>hCcuPcgqF1Q3nv$fhBTPk537Z@;&N(ZQ^C&Ds^{xAhe# zJ~A>=4Y!Q&u5sndPNp~dqo2IyTfCn*gM&76 z80VB2scPliu+&D;_B!}hyTf#lQZAo)2|I3O3MdO~|4N6BfewkV&wd~{R6wEj@iGV$ zSs}hSab*y-dd;l$FRJyDd&3ie#%?H`N8!>(-U^stvZ$Msc0tBa5M<4-6>u9j9s5Yq z6xh9gbU-t`tUj1cS~OMfO8=yX(6ud5s1THP%`p%4Ws8Ffyu?KN4UwH-!(C9Hvw4uE zT6i!MhuhB=IdI=Fq~+C)b!AnUno2i7o31Z)@o#<3YuoirXrCz1Pp_(}nU=e9mN?~n z7`QB((glE~ax=9QnAOt(fw&-|Eb{1XBLxkf=%CIfI%?ud5OKFUVMW$1fOCWFA*^@j z)Nv*YC+;p|%|lC5pdTp@qk8<SmT>R|n}!xvqAIy@g11nFD? z2arOh?8fR}D{Ae}70a!-u~JcNrR$$1dVBRDF)+|s^}{2*>r1pIC~7p@)GGSuYsU<1 zZPyKkVAI6M4u?wECElpfcY0BAP^6=O#s`Q+n zyT92zDp~yK9J`v)u5a%klS#<_ya%d^`xq4)o3W%z>|9o`c;)FNW<%%HWld*Fb<-D* zY&^3-h)<8#hGKeOhquO9k;& zl!VR#yp>1<)k8UP*$&9qT*;ZspyoV0+1<0;-};KnX}`x3v%AE>=`i8Zl`NU! zfM-Wfd{6Ao>$mrnbSz5!g#5pO!VBpHkTO2jX$rmG659GMg*XtZ*;FsL4|OTC@(olk z<@4!`>0w`)?#<{{3wxBBkAbrIwy#DAtuafiom|jJfQm40ih_%24Q?6+>NCb(47Q1f z#UDNQMBApn-Gdg?kT`k0Vn~?ao?)FIMG?qh>VVHo*0V{KzMZ&^v047 z9XH8r;)7A}-a9WXF zz5xUUy1DA$Mk^B(2B+-M#?P-R(9bN-`y#Rbd9DYhJ1MAsbs#cDF_cOUSeN41@!20e zzEGIM`3vtk6wZ`_GLE0ei>&qUt6UBguosbpgr2J7m=ETbdJEl4{LZ|qPNVuIATi~e ztG=xk-eP+{&A=45QxCuhdLZkUQS0yz)$^nlr-eo^r6PUcKcJR_W*hPY?sa%_Qq&-5c7 zxr)~Nt6!GdmZy2h$usr_o(Ar{_J|Qzwo0LZ2Wl+kao#lhJ9UWsSoj%dCR!&eh-Ni_cQYpMxQDuQ_w(eNiULmfbxNA+v0IN-5QQAQ}5H zJCynZaSWz$;QTi)rNXV5dY`YZwl^Qf5AUq>0K~2k$MRU1#pNz8cD(9bQ9+4CLVsEHRQ}gS|Ep84OA%KEthJVDz{)$-)!U`ou6txzgct89aB*btBbVL_?J+t>i5ObS z%WogtVY2LU3XUV~WzZZ;Ec4}T^^=d1&l2g`-|89p0Q=&ww$Cm^%Jn#NXR+G~&u`py z)Dz|F&})xI@uDl3_ZF1DI5zeF{7U0d6_TC(j>g&HXQURb()V^tty(Hx$=z}NwPQyB z2?)sntqw?&&CY{%mYfbGW2DB3Va5^X@L{#`)LWKRi|~)8tKOFKB2KF}mdY7=Tf(jp1)pR;3$;(Ow%|Eb$Fk zhzjm4FxzhXmh8Q#Wn_&c_&7|!N&OcVfHqO1=)3JnJRHKult#w4r%^ZbdPh+Z#P-~S z@Fq()J6d*kdI4;u-Qmh>2EB=J1~%!{Jl`ZVJgIH5NVbIt>-ryCF&uWen@t3hm<|9& zUCm|Y7A`C-dSdb;=#pRnG0m11LZq!}wr^Y+s12gm?j(Il^R29aR?#Nohb_4G3JTs* zzy(VI6y@y$(~(j*_WWCnj7C81=RSZe$w1=M;{5+^NnVC5NpJb$oCn}EEjkeIZz&R% 
zJokO~x?v!gx9y7$1%ku29L5^;Q;Hqlm1>6#Z&cg)=k57dJF7slEJG&I(`XC>#Jr(W z`k+=HmIA%z)i|<+cHR>LDwgN?H7MK;QoyBY&+ZH-0fj^SI=7=)qie-e&ZE9mW1yz@ zb$?g$vy!*pu1RRHz_Z<22x+0TFbHnJ9!`0-JOazm6Q>PlBBhj!l6z3uSvS2T9Egnc zF7JUI=EE|(JPM3H)W`(~rKUhOfS2W#?HW}Gy6dq+8AK)ink1z#`dCTK>8O);ZluWA zPj+s8w8FZd-SRu%eu|#>oXaAtUuRx!Zd(Q2K9)XZ(r#^?lG|m?WA{O0-NQj-B2b{D zkg>^cY4ZacrCdqu%Z^>fb(-fs`A@m2PpMa3ZWg?aK$1>L zG=6oNEkR$Bo5s2X7iH28lRM8K`0h%@&Pw`9z>TOCiu^#UhXiQH+RJ0Ofkea4&duz( z?G(Z>V6kUxujEAP1V-Q&rD>?&$8i1piuVUO&f(g73F`&g4qp{g0WF`sjsEQJ9L1b( zDUW=toXxxzzR!I9Hc_9$k_NjJF`B2o4CzWY8< z>$(Cg(KVD+;hNnkO(THlp2}g?gVLB>@D?YcflKqXPyRKWNcNXxcxsi}V&BG(JMbHKZ}j&fPT>|4vl)qEev zEVXQ*xdY$}3}cY?wN3P=<7}CB*Cm>ZS@&^%CMr4FHtbmx`Jvp}x^Qbv>P3~)lbRQ` z9-C8cmK{$oL8;^kF6!P*|lWV#~ZHY3_)9Z+OcPa`aj$3r;=*a75Ui~$$;oMuF zssuOJrW)W0i=K7gKofm)jUZok6egc(021p0h@f$cjCK9>GG*}<;P>=pGY~&2t_8KK z27-2_t2e(_%^4^W0JQGVb0c63EE&f6BRPYhX`{RBb$ZYqKtE83Q6Z7}`Ef_x4E)kC z+6RlO%9;BPx&Al&JwV1ac?p*wEsECY&@Mx+vK0GK`m|D+F+M1N=_h`b!^+To*MF0W zK{aKrtqI{wbzdX8yteW+k#N~)%^!WX*MDT8pTjeB_7N3m29e}@^JZP1l2|{$s}C@` zARKog)s%dc2=m*;VmT>xIb4iv#Mq#3i(@y7&pr}(oYs_GXMA*&%3Y#<&;z= z(S1(HVRl1D2ACT)0Wq6?~^*0KhTkI~R zd1GT6VJO1lv-6<45tq)2_ZJ#qTAhp5M4%X-U8&%)<$arvKcYlI zxu&7=$yQu}3wL{f-?ughUb(Jlju`f!v=t+mUa%#cIf>l*G?P_2j2h3TO;$jMHsiwq14|JDqm4L-OPV5-4P~Y`0*eS z=0^L*f!$C5RNZwmWJh2p6YiM@ywdNGqJA@u3F>_*0C0hx-@O`QmbYYx&&PJiVWMx7 zYvaKO7!ZG4QDb{|_22xQXD5&ge!^~1j9QfaK=nTH08VGS4@U6J;^4MiNG_g@gN13A z8&pItEf?e|M>R0=^+g&+MBwfip!!mO;rVo*Vq+#`^Z=bFW9oxlpi#h(0<*X7h0^=rQLPgTUIW(-o(LkC#RoX0<$J6Sri8i;W+T3KvB!BYmF>RN1vHaz zwau9r#DTUYplD*I3Y2ftnqIHL@$oN2`tlFY6H0r=NOh{=Nz0CS!*yV_Lz_ zo4?n}l?kGR7Kv+AN8`O1qnEb@Whw8f28eA(cM*2Iy;0S-2C)6diNu8vNJ({hgM9(| zM5g32`W^fH=Pd2=!?~YJ2UjOZ7qwCdgO>W<4ERbhL{G#PmN*BhrxJ;lnytB1zkuEG z&qhe%TrlS?@X8%dvp4){zdz|^fe8XkO`j%|R@)sTKSba+j#L@52!m!)1_GV2dO!Pn z=DKofWSUTLK9*lK7?iOeD-hk!L&N{L!zgj{q1|2fOci;fXniJ48S|DPRQlDm1jz3W zQ%nK<>KD*&B$n~UYF0;o`zS&w*3_|X^6E0>tmzlGSnja#`svHAN^V~1DRr~s`Z;5j z3En(B89QMGr4~^Yc(Yf4;3`FYer!s>$jDfftyyD2s%dGi21_FbxNU3*w~bBDlJc+q 
z_A^Ha%NT2~O^Pj0by5Yd#3z%cBkWU3Jm8#Ye6$Y(HOgP(Zu#^Y>81L-MJF=YiseRe z*k!TtcsVj+AXZ{ht}e10Z{8}M~@dpNI;`JP-kBD(-?$; z?V0j8#RW5$;evw;LGPaSYdVekeebW3V&s6}gOw59PQcPoSd@Hb%J56Cm`BQN4a>CM zO+jc)BlqoD!f2FMlWbH#I!AAqS;za^O#5&N8^uF4b4GSfr=gNm0C{}kziHm)$#^R+<>Bbn=Ts-&CJZ!j@YgnP=z5DKUm{esw^=+h z^W`e8qP$I@?iJ<1XH6S-NwZ1wE2jM-j{*_B(w&6j9$s-j9k}-+dp;YO8W#A3~d9D25P#tr* zu;U|n-aF44lr&w|keoRTLpC+GK)Z`sG?W>(q|r_7nXja}H$V#}PHPv_Q0qQj@_Q8x z>CU;e$kizpRv_hGIb6)%;49xQef;bRGE}ydAba0-Dat}K61CbD0w>JcJvr?tR}5iX z8Sdy@#R36BP?TTQkpJyH^}ocY-Q2_>Ab{)$LBMM=*ui+~_aDikusR7fu1U@JPS0|T ze6M}w@ADJ#lM1f1w?>hsuj<+(qan2!x12;=dT1loi(CKA9Jaqf?huqBPNTWKsE4`M zr0FW1*`0(VMo*!-Tc+E3XNfwl`Cf@(Ma@m#6pJbc!@!pQ`R?4iA`zTL;>N5FxxY{W zJ=;d#JV5}%XtV3&XOG2>6j33FKv?hY8&5tOrEm6SqSuKir~s)3rU=F(>W&JK0?d9O z7)Y3m>^dpTY-`*5>d*-wKHkxgS}9l9GUbUzVAsMYV1bIP>+u{)p7jD_JrPWqHbp6S zWM6iH>cAFW3W13G0xKhB+~Wl(+mfmk7l*CKSms{ByTKF)`+n~9qj1Zuq%}6 zi7&I<`goAWkn_O*a8EfDKyiProE#r8RUPeUI_xZym>)6kEYDSM7cS#eB|eeG`Ef+Q zqaR0uA(>}PsLHYO5|fm-uPXdu;G)KI^Ug=^CCf#rKD|Yy3HG+cg^3tWl;Oer+TCPP z2>2QGDF4|AXa@2^IlQ+?yNS(3qxCi29`|$L_CPqhtyo{PV>i&;FSkEXoi~o_ zwR?$v74~zt-3v4xqa0H$)ZQvJmlJB#?G0&J^}(UtT1eej)A%tXY~30Wanoa*+CC?u zOyXl`2HSIL`!Q1O99Ged#%;}FOX@eug|jtQeU)*&o;w{>+w4X!-$D+?F~nvk#-6Ak z@J;LkVnBff3TfB>H3sC4{G%mQ{~`7pS-Jpw+^fi|HmI6v4L0VUcd$c>X`Zhgpc3Z~e6$>;WB1JO_sfJ|hOna}$DS$n|rHse7bfN!AC zM83xa?dolww0Jpw=I7V*HGYpm zs<#XpnG}7Jp5m$vYQinzGygQZNC-^KK@UhgQ;2s z|9%nCv>g!3&fqbNZ`4s@(JZP4@k?pX;FQVL!2;OW_&8y`N_6Px=I*OYEE5nvEn2RO z*QEC8G@-^|D8I8OyFakZqm~Ph%JaOIDf7v7j-3--QllN6Ed3e+B@;zcJJ@)1`MX1M zh?|Q~Nf!E!Rw6B?men(u&Ek#b7O(PDY4Uy$f1@rSYQ3Nrwl+`}MxwTLx!?r=$nRsn z`&{jue$=nIAWk`%6Ayf*@(t6xpCNIRpuM-?+4H6{u{5gxfz0i>^y z4+;{>rl_X&2M5cQs-3=?nj0hePZ~SeJ8znRN`G1V9HCREA_@B~_B?j5EAT2#R&Jg* zHAIKsC3Q2m|L90-d0O@_hZTAFoER z2%$oO#>v;ApkelD2~ZR{f^O@(r&_hkN>4+SR$N2$w7x)XP8Jd#>7jYj|J2JkA!cF% zaD{Sq8KUse7~Gm0tY4n<@F>-ob`YmNh(87>9gB^1mbp452q3pOqfek%BuM{)5c{UY zB9A_kR-jZ+%(DUPFLHNh&_Q>ek*$SfVkZ0X7n{9`a-jTUzg+D>{kF(C)SEP{d5Q!R zQ&Fw*8tpD8q+upKn|HnkZ_HTxGb+n{?fBG|C06UE 
zH$qX@!hjxP|2g<2;7Rh2@A}eGpoTkFhO)qNy-9ml2X2iw`A~^zxu; zr9TUBC0%EY9ajxhuS}a$dy%1_r#jWjgp055GCsA?J~#<*AqHk)h=u>@ zU5K6udi|sV7&O5>O&&8vGJOdly9h30g`eYU%K1_q^vc(nhajxmJ-lm?!fvBMpdshe z2-DHT1>5DWQQ@HL!XH6HIV`l-+P9$QR#HCdp)#)PD9I9$AuK7PsmMaRBQPvRo|V@YESsZLtl7`eTQ+ z3GJPhP~K||xn$k!`Vt?mw^2+2|IlUvaZaO#ClAZrt9UKxMUF-WMJ8}sdh(@{*zEHz zc_69M_myuqk#25}yM|STVCrogMNoneG#2Yha$l+Bu7-6S+f(eIMs!Pci%-txBOn&$ zQf1*aK>m8L%5f*~*dF}ryc4*~hsoL+#hhdzIo%oj3aSKI3c1?S+0W%)0h%LZwt4kO z#mtA~>%{n@l^u@Pcwfb%ZfSf^Fn?dmqfn}Ac1>p^@2*zM>I8&;YA08sXSLt65c*++ zS-WHRqd8b0m>W$(*~w{tUDxr>2hhGiM(x2HWZ_b39E-RqXTMQ8*BWb491gDB?6|x9 zb__T2^rnA2<7aZsB|9$gh?y85$5^XD)Tnk5|M~Et98zOte~e0{(w>12!xX5?kxSR) zedO7(D6%k~C%5HwVZ(Y9LMEcdS)pdpO7gxlRVWd_OYFSs32_nf%Fhp z>UBrVPW;n;O@U&3kJBPRFR&vHlaaKa%^QMCkPH3iqJ{sC>;jo;P|{!}8v7jwG*&!e zf+Hn(o*G0d5fh?<&-45%qaFPh9qLMumz4Lkqi>#i42dQ@Zr&sz|j_M&s4M>sxM@=U7=m31Gzz2&!~DXTPmLD zmkSUC0zksTp9v(M*P+R!W>LZl!VnUYjn^l)j8O-3-3PYjR`fg59>s8)-OIb4fv}~o zKk&3Qyc@(-a6TIK6F<144ZHI-b1L^ZLDMyP0u2vqMHU5@9;Wfosg&Yb1Pb1>?4ha; zDW8}j?{_(7YJNh^*jS< zBiT&6pRKW@*~Z1z=9NMDvL94(EeAYfxnHV>v)rt>e&FpL#ccyMof%yoID#pInxKW9 zl~maXCOVI@96x+&&`&2?*Wvw@Q>z^W$$&yaXeNLMu4FglN_Nw;Q=$HANJA8IB}49g z@uPf8EqU|!?U8Ws0Pz;VrHVAr!!*X`WF3v$=}<{4Wb1_3cUN?WzD|c9AfXx&L&#sEO`Z(7u#|_AOn+^!u>=ZF%c}=UO_YIXU7+U&zRw z(YPa|O42cgQ_L`lHu)}fVrTLI%n_1a6zba3};2ATWg#Afyi( z+`n0ymk>We3356UngJg%g_H%{=jZj+oI}O9bHg0#XAuB!k4j_yTCe{R-4`zJD^*PACHULAW>j6m;Mt8*{4dMTPG{^hF0X#u4w{hRs1bt*cm399QPtdxFzkr*Y;RC(HD?WJa zQnRVc41?7%ee-6TeyX&toADyP*W{|rv_R64Ect2J*Ll{mrX3$4T+?kc~8MVVMX8ZNq()0efBIDJJjwkt<5 zyK9b4arRsL2hbVAODRAt;R&{GjRb(>sl$9v8^=OWpE1b)S)udX2POb91ZoT{R7u1A zH;(+Q1vq!+uINuWFM+ffxmMp6z~#3E;Dh3LQ&oZG1q!fczl>J+87za!TT~PtgN_bG z`ZoWgoYk2)+3Zdp)_`~6QaPaWHmEp$HlXwb5gvvAJJj_aSOa%<2u^H|I!JzMIBu^e zQ!GCLeWEN}L6wdduvi7{^YxMw_trss37Tt(BPU;elmkuY-=SF}v8b^F(%#zUU&lLR zgF%&wi1S>}#{{vt9y${}a8-D6C(Cm(o2tpO+wY{jyqagS)ssX^OlY=v9M(Mn0Ox&d zjYBf1i>1YNCM;RDn!CPCfBVONJ1z)zO0Weg2GC(HD*7=ppVq zE(8Ih9~@)W*1ltM+(yjAMtJq>yc0-*Lpv2o0jgs7CZW2QNp;6I5Pti^rn>NtKfqTC 
z;3qd3G0s3&qR6oIbs6g|`A|v87Ph{EJ2_UsOF})J5c8A?4MOQIOT*`e}B=eh5nqCD?Dc}um-|;V*bBt*wb(6 zfWYrWl?s-DV%g*r=OU`2M{S_ z2!b*_Dhk>T?%vBhJXRB5(0iXsERTNR?9Eyj!Me+JHW#cXo{%c{^be!~#8iSBKq)-( zZ}`^(?i&e4`r2>dm@ETqclPX;a%&Vtg2y184Q6hJ41X@KsfSA>Ew|d{y&?BUyfoe)ysF$R;k1L0 z0VtTm#790C^MA;seilDps|H?Lv(?|ps_xU^k$TJ_h^arEQTB%dH>4cBN|M{Ut5d^FS4)NgXzJBNMp zCEg~ZgWaMydH??Xn(wu1ROry&|M|{`KbYadKjNV{h=V~wmNeq}bbG}L&=w{s>$TFI zcfK|wK#%GWeWqk^M`&r(*y1$M`^62egUi0{3_9~20scy7bbXi6oW#IkHyG903VKtN&OY-Z=&ucn zC>-!hlP?cdrt~*vM>JpJ%~LsKlD}zGu$9c2CwF)Srcs{G8EG_1+lN2Wkjsuh)ogUD zPll`W&NV3~By2&w931gua(j)A$eHMw=!57^OLdLHX`Y><17{c8yns&I4y}n+a9fq# z>Z_Y_?nYtf7x4XHczZ9c{R7URkun!WOgd-{19fr)a(si{3n(1`6swA zdDkD)b=Qqh;kaYPlcQ!mOzXKXxI~*XJ{qIORf{}&EvnyhZ`fp^e5y6}U}QBl$4*Fs zdg%PB=(H$s)lDx|;!Su&ZM;>9apj~1ov#o)DLr>QSyO+hZ#V{H!CCicLcUK)EI z_{ekehEEXs)@#6&%{1cf9yj%8p`|{Ydx{YI!~Faz<_vJtq+m0%@5h2>h|9g((DRI~2#+*4?+W(N@+IQTteHP;?-tx0jy`SQUnMU5Lb zEgP`+i1qMK@4tNfy;}2-;`RyW=G#Y!>BvHPs6UO%JD;wHAIxpe>|Q z7loX@T#y>vNIcqnUCfQo3!=zF;$-aK-GaS_iag)QRq>t0r#R3 z)U4KJdpZ61CR(3P9V6$GKzsfO3+&yFooaju=iW?*<*g2`z7~iL35?F9YvUwBs(&Sa1*)n{8nsStkRhoUd+_WN!*F-ij`%R z_8Zi3#5{H_l(=aMC}%v&Ck?Os%Hh_nTdgzG)5X{2gPg%3!m0p_v4|tb3-=r9NIeCY zdWi%kZwrZ6EnB0>ez<47*h92s#Os0%Zlff_cNfhrr+sm_s9&*Mvi-N&k;3i zZ&njU8RwLbT^}4Qnhla3@_%B;_;9w?Cp29vGnWobSy@=}olHzjM87UE+yZA|3h@wD zXnw2z?muU8_Q~C=U~jbUaOt=LlGvV;r1T)xvCfxHaeHJsgZFr9m8;Y4OT4gs?D>W~ z{oX4pI}xgRsNLC<6B&j2lyBVU?5ngAzP@-3CWb2q=cD z*7|@M<6Ynto!3JrI>BATv2Ut&CjILz!x=I)H#jLyPZ~S^ZWnvM=ZS5-cdRL284ln&gpXjCnAF}`^ z(52e=?MCp$*MKABdW&uyS81Slscnn&Xm&bX`~#;6eOg3L$ta|ZZQaE`aQO6VnhSN_ z%W+vyt$*4?qG*eHbgCQNRZvD?CO$LO%hueE`>V+j-uf|CGFkh50I>vdUY9bUPW%4> zbqI=qtjKMQfmql9ZN#~gdtxqA!K-4qy23b#RI`LJdi^7HFOh)lDPPCy!MC^;^Zq>C zaI`5};$A@!=3WnGUr^&Z<9a@7h%cbiwfDdAh1LN4Qz_H!ln?hLi&aR1wjWn%NjBI+ z8>IO?16&|}ICzD3peXpmJ(H`L_N%X^Oy#be%|i=5b8NeKb&$z9U8g_DZnwT3eZ|?d zUa;`&GQKD}AR~me6 zfZY+$P>~F(7_*y_8r%7~^Lw2+Wo~C&$j6PuQ==oi`>LceS;sQRq1(TXsDmpf_RWId z+1}shBQus_8hqyBqJhz<4h|I`KAh_-EA}%UzGyB;;3iI7H>1{ANJ=0>Bd(LLZX7J5 
z<+ye_){Va$Eq6Sq$K;}~Tsz#T7wzE8d_A=YjuSNSD*u_~VX89@E^%GqX4vc8o#Eby zoL~UA>S6HA#*@oUAfK`aUpy-OsE5Opjt===fdnak6|I;AIwG>1Jdg|-q0%5Cd$y5( zXJD)%z_zd{|NLDS{;H4Z+iha+s%WC1QD5-~_DoOkTx?dVCZ0;#3G5d6Jp6d*A$Lyb ztMq0wqqaUDOv@-A&QD-sLj0oDa!r$lgrN8G>AT2S0R5XUr>Mar(lULI%b?|@Kmro# zY-PXH{4cUG1GZ|sUsP(02lIDy$pB{1-)9OWPA*VA-VDnR;SuK<$eP!=G&`NDeAY%sy&kg=m!@Brc(4m6^n}-xXl%MJrAB*-I z3tbt}0ck75IfQro@?@D_**Cf3L+m#>58Vs%aZ<;!2hHFys;zy5NzIy@pO zJ}|eU2jDIo2N3`V5Gd;YmW87P4(x+TQQhuwvx0p?(S zi>XSBzv`QCJTPi6ps^q&@Q->+uY>Ep5y9-SuRis?Xc)8(J|{3>t(|_`4{1v~E`I+) zv-GYSAk>5d*BfAi0NVGue&M|Ri^st;{=%aofD!26swRX{=h+Y84dVXt<*&KkfCPhw zvIJ0_JmCG=r&9Yuvc_KmuZEX;IVejeE@isphpW}zfcW&nRRc!@$QXF=-4CF|B63n8 zqu_6|`}Ii$B#u0_z+%uBqFZmmYh`eIA@Kc~dIdGZXKE(MDw8jJ+a% z+*cPHKlVJc=%9qmcHb^ZC-jLsJ)oj;;i)JDc6!zEV}%P=bpG3OQ|q%E_zeQQL-d6^gpLDPdEj#q zV4(>x&I^a@uiN}5iWww=3F4$`m~?tL{M_fdSY+j)Y@*JKm#Ao2X(#H4ZftDi-^oK) z1uTD6=<_(SPu=2gbwj)%c)$1uyS*dS`7mAt+zS^*Mg>;=J|$TJL{R_(`fG_8{gy)$ z!P5Lqo_FI%Q>Ah6+`|hQ_gDlurwi#1d}HYI|Nj`uBVOYEijVgwNTqL_@#1`H~JtQ_s_AHAtfo-wE@1eiwBCB=(l1B>kc1a z+e>n)+klZR`TzMS=o2!3z!-Q8#z3ZY0p`uA3C6wPrDwl&Zv!icYRSq^xiCI#BX(}A{l5b3e7Fo*3v zQ_jvJ8!sa zZ?TC1T4*8kgH8QUXyW22pGONE%7DW#pA{F00)kS>JH4;76rQ7#@i>qh#n$|YXhtfK4)MX z%(IHdTzD{{wc|p8*RH#SG9pp&8Uo$1k%q5d-@8!J@Wn>}wmSd>c6S02dVQ;q=;B=g z?|N1OhWg)q0S>;FKi9>EsyPtwjoiGwIAl`XVBCDWpwRC)kPjU!!{}~9vkH(7FP?q9 z@P2u|7QiW#n4i}WkztS`wz<`d^}7j5A%Qp2iX|ws1GC0>to*_UevStZe1+-&+0U5J zQMz?8BS2*Qr=Rmria*Dm#fMeQ#;lB3Y!$!Mw;9+tjkJUpoT>&K$cnqG%l&&R8Ca=A zoc2?g*I~woxR)+ep6+7MNiC%ZW*VO+SYovx-aqH@>yv!20lFL;4bPxu?%AI@KL1#u zIHYTa8n%G0=?3G3(1tIz8o<^WK!LPdO&Uf$9uCRtTo>!EZlJD(I9QOtuA74i`e8n~ zSi=Je$rlY?Xx_mvozZjoJZ6n&nF&RSFE}sm#DIPle1%A0%eR5AU`g_irToEN|J!E$ z-{5cH!RY$O=l+=ZpZ|C*4osk8-}Hz}?ipYs<(&oQ&%;>%TzDejUVn=h()d->Y5hz7 zxQ>^vxN-iA%KFQ>x(*pwmogX}fnY5o`|}+C^PB$% zOn<w68%LZ+{ILh6N3_(1kNj{G;a^afAiu(G&vjZDfC zJUR+f$?;;+07icR82x7{$DSO26_ups%7x>a!Sf zhW|IezL}znDXqjsk4o(25X`W(b$Vt-Rk|ay<8-e~b?_XJsLjzpWED5-jSCs3w**C;(=)hjB2bL;Pap}e%Px$j6ST#^Mpm(_VWf@Rh4=qm^ 
zBj}a$lSfUMf@&N|z{|-XHRHs+^!Pu_vYtb5@(T;_H{KXv0Gvy46?ku^1Ct#W$}7G@ zuy<>nQXnKMCk$xASs;aSA!)%hrEYSDJEs-SKi&I63z<2Psu|fsThzFo9F0o(8pEvo>kg6O7(;LCKhzm~V$M-+QN|8XUw6C+x6Uvdiv>!Y?bfQ~O z?gJsI_+_vt>$u$GUyJ?wyxnEMbVamGHxqz>yIC`RA;;(50n`-AvxgXx1k6|Mx0&z% zF}4eyk?*H038)3l&*V2m2Rj!3M??si3`iG;J>5TnXDBy*0(KmqEg*WyD=^K!uk-8^ zDL247aB0PhP*NWX2c)PnL|r_Tj|^-xBKYoq6??kK^lG-F0v0AEObWMM0cPXN^XT); z!(Um?-3aWyyd`4>1s|CNAng5_6rZmO1yV=oDY^_X)duKrH7` z1pNFrz%VoZF~BbVAL;p5Vq6CTQ?W4jhzp3AGW@?k>5NK$wyhxf>iyUG|CzYIEQ9~9 zkq;mh0$G0lJ1vpt!xENHlaT+A01Y7EhrhBYC*j7o1s0ycfrI;7mBbEAkIY)ETQB&f znCcQ}E{3yEg8c}4y@*L+K)5Mgm4tr@ks~1!+vvl62+!43 z-)|PN-FZ~NBR#rhJ)d0umG+lXV$%Ss;LN&t0ABd>wISW_cHQmsO~A*te@X`fnv38O zum!NsZX6W58t}uFJT#`aR|}-Z^5W};tBu3Gp3;`NT)FM5J>d5%7Ul`I4)WH>cmb0g z(l7m9hYSFU&ak|L@(kYPtdIyGBXs|ymGf!g3QD>zHyxps7Zh+wfPdwEbQztMar&_F zVNchZ(HYzTCjKnH7qx)P_GVX~)%uF-lJRM%pF#5;FYkE92QL>Od+w%zNKd>+?8Uwo z#8FmYrz{!1LYC&E(MZ562DyCUX39fF%5^4&J1J7BOt`8lhRnQhL%AP6 z@W)wDT@OdTBZR#EM5kj$S^`5ZD6*86amvZ9PMu%*t8VD zE$lNgI~gEo!lMOp(|6_E*|dsWjK3jH?Jxa*ti5$u)LZvHtOyDyh$0{$rGy}%q;yGl zcd4|5q_ilZv>=_*-H5~hf`YVkgGdh@Ll6A+Ab8G^!?~XCd%gd_FwCb`-s@g#?|IOj zO7t7%$QrvBf2b;UuVvm#QByEx{)(_^K7{IA1r8ACat;K|P7Sqx4Jk)1DGGcwJgTYo z7Wkmmm+U=d$SeNF_{BXTNu8$bJ+pLkH#pKW_(7$PXztj0PpBOJ(NRY^3yPM!V<>XI z7TPL>;7m}-fWIR8j{m#A(rUWFr}OhIr)R%B>YHe2sIRrjQWAD(qZQPZ!J2!cRn6sc z?Q`+~(c=9ImxJ*3G5Gga^Y1*}*+NRyFv;XIod)3D2Gw*9x!rYPbwF#47b<`*_E z2s-GaPvfr*u|0}Vk#VYhvc`JX#Y!4nBd%jeodt&b}FgA&Fx&IN-$ZX|BOm+bdrcQ=U1 z>TZE$TE46L4L9;lm+2b3hbO)h+K?VAUiS@Bif20G<_o@WG!yGW^vfF;H*U8}6xsK< zo|lY@%E|Dq0Z>(7?XvCKehM!vr~}3Q0=TbXM3Ze2M9T2^l-R%Y@mn#+FPy+fHN6-? 
zgl>{$*lD?A?`ioZw?&9t=cQDwL)HDr$hHf}G}N*-vsT|zy@>{I6^|CThhM{#($EO$ zMQc$u94tV7mU&0nUp({HLjo^Z96>3Ejm5+EcEJu}Q00h`qAgO8SPW%#DZ+w5A6Yvt zYV$TcU|obd!}pBTyjV4RXxhhSVnos}}{SBJPNo(jcx z#dHDB8^ghhTOYsnK4`z42RVucVD|3D`DlRIk@xXxgwh8Q(@=l&fWIVegqX&h$m63v z2bi+_tIOYoJT?sMi|e|*Od>)0A&!UoR{@F5cK-tg&|^M=meD_P zufI-y^3Dk@vj+F(0-%+l=cI-TQ0v#`C26ST{1rwAhmyr8Uwq3y*;$%T zpFzvy=EQ|)(OL(Qk(i7Ey=ozg_Hcr}Tply~*w|MuA_=}ZJnzNQwQ^1))m>Yj+3~E7 z87e0F+j()W-&EcAZz=jiFzC&6IHwk8ofW7816jIyu)a;CM2bkuz$Eb z{20mIDQ*e(isXUj!+e87`y52VLTHEYGg2Mu8ytL0LS@nx-sT(5*UV%<1tkRcdt{V{ zRHr0>z}96UgJ3}PA}~Rg7u)J*x`hv5W$(51wjrt!6Fbhh7f+yGKF+?{YeHk0Rdft2 zvQl_cl7d>;jsxxnT} zYWGA$QI%?7P`l6;8!P_lnX7A+l8xCW5Fy_)AlL#aNkk9-#5w-)g6{_SfSg~3oe;gI zz1G^-Hxd;t1-Fe~9JO+s-4Z+Nc&$`WMHa-B9*>f7k5?)#;P$AJntjhF8Oa>i=AI*{K(4VK+-s4#4$q7gl)QxS!xgz0oFZ&o+`dq&$tjmz0Wt5kW zHdu=4{uLRv&sck#QcngH(+DQjn;PnVR^UjF0sb*1-1+7eq8~CHEZ``ntGC+}%5tt7 z={!=wH zMT3WahMgq>0aF+4tNEOQ524eF7=H9x;U-$8LZ&M5#J$-(W(nL}l7A!>tn@6tckYnM zMaM?N)-R8=b~bQ6fOB~WDlnc$3I$X{)`HQE-^lQHFgt$n1%zz8^^Ko7TCxnI1D@|K zx(FwIACD&Dt-@tF{uZ!~%-hDD-xaefub(`hD!WgnG5)FqpvqJ`+sOD5(0~A?A_|Gm z#26ny)WC=xb|*vThlu9!HZLxZGGi?_Ro}EOLLQ{KSsK}Xs6jLdA zn|?g4eA<)K5rC4e{-zf8E5{E1g>V$!)%q$=8SVTBE#?+mTBV0bWQd8PQwQ4Md?V$7CK4vKPSSY|oz0TqBh=1R~*(QL*@UqXN+dF#?g&!M3Z*4?%5+|7qsv z3rC6w()QTmZ~WsJz9|$$mS5Q1QM720aODYcuGGp~dtzqeenRGaLrkt2$`1YMSAU)8 zXhu2Vio2)TP}`L|ccQ_o1laUNB zYmL)MeeVcAn$~&560NL6he#sq%2MEAgMkFaRs#OD9g2uaPy`*ZyrW#g$q&htP8HFT zsVp$;Fs`mCS*9FK#vA9B1s?H_%VOp54aSd2>Xca72Zm&^ZZ6v96#|RXnNF-Oss~Bl44e@a`#7I7tgMpeR+zc+Q*}!ORaqPU2?jx)~zQ z<~W&)70};P@`YKae{gGjRX+*DUY!F2JCc7sGjM$OaUg27fc#O*nV-c6;MU>(+2Cq? 
z>cErHnNE{6E1Kx=V7*2N3a_G<)$H|L+$CoMo#U8h zq6KCRpH6T?r?D_Aik@^BSA8#8etqoQbd2YwPStBr>Y!;u&RR!UjEkQZ8W?J2VF8U^ zCGaen(!yH*FpR&2Nxp6%Ah|$>!JjCw+k_+777MDDQ4Cg#hY6)~Laa`wE>nUhy3Q0= ztOuF&cWjJ?IfI#Epe~P0ZZs^TOklwZRCyMsG;(a039{g31RTsp5wnPeS??5N+lMx8 zM_2qBkN+$0y4Ry7qs=ew4743fHLLZw4^ZE4Z1V!#a9xodfax#j)W@i$tRZZMVGXhD%6QVcAio`0I9yEy8`<@v>fuaYzILt&rfZ7nxmpn(iOWSL2m?rUiQqGBJhDOYr|%E5{l7e;Ah$)c|dur5Dlw8;-|Yi zvZE(#6L~c_UppqN&`4Gt^zS;wGW8I&` z_hO_G*SB49_eTNE*pLG(**GvifKLU8F&xqL*#VG$%FvkVlvYljxJDcC(GRrlp7VKQ zTn=+7hHW5-6d+d{S7bLX*e~($L$s?|T*lHWrSfhUDO@}`?j*F48oIk$^h7@^;fRZ< ze9o}r5P#L3A4v6L$2u+eEHQ#65_i|4F;#S64T*T2wY~A3y!K_)lKQlDPv( zN;EBO_?Cecugo=XB(8P)zIY)H`mI(VVuljFeRcZezus}@KwuF)pXZyPO6VAldoi)} zn2MqQra@`x9`0z>cyN^|?P-|uMnTt3gi|tiFd}37co|}3TtP(50g@PM@sI`~r!k#c z;Zuo<5pC(CF#$}(_}aa*?%^+xr3mgeBOMT%Y#Aww& zPr*lqWbF2#E3OU@$O3E1+$2b^!P8(Q&ZvP;a>)XIO6WpR(#b4<|LNyAdFOVEnY#OQMz(qx-NaHr;h=H551+(5<`QZ7V4K$(4S&JJ+bJc z0YymDAfGmE3Ca<`_o?(Wru&}Q+KDxsoa>D`@IdIQkX9wg{1cp3Dyr4SVA`gCuxR!; z$6jMeRs2(=V96!=(KC)dPT?*E0}$<6KuCP9KruUNgKy8e*fAS;7nF;6v^zpEPW`cZ zaH@lH3ow@t?W&9FNu&FDF7ULae*r}m-q4qyc`C9D2l!ABbf7?kN`|;Je?sctnm741 zwJO}}1D@-G1b%mPm5KgMw_T%0{?w;11)naKd*fwRuYtuyMGY`?PebYJ2rf_tIN|@j zTY&~TwOXcyJ>|?`)3CI5u=*y#yrhYyy=)IHyLx)9%A(tJ%o6SNYEAcxXipsw=JJ|e zm_G*{ii2#b0Tvol1ETg@f$6``JA(VV3lx`Uq_<~}m_?sAm;9*yK_i`U?y2*DQACxM zzNS5SuPk)OQt&)}kDAXl9*I@puu&5|XP zB?&=4E&rPVHIpcWpnSAzuLJNNbZl5+Ud703fGZCMC_h0)J%O5M;-TXOB1(&3a4R!p zPO}e$<@Q%;dFuG}@jL`73Yn)1zniM0tr{>8F%3TQqw>qPv;5z!@oVf5LwX+wWnaw} zsQ-`~Ft~I8DH4fw6u(}E3bD|C1@m(M<6vkGrqX5575WO6VjnxybCzWF(-#;<77ovy zFY7zT@560Ha!d&9%xEPCxl^Y$5=2cYQ0DWgqW>>EfRZKWD+^O0s0G znkvVCe51vEdO4izr4x(K%|V3rrl~CuDGsiH#e@I*IJPoyF3k&ITLe?>(@MDlIBScD z!B*~jrhoCS(>`;Ew&VXepTZbgQwwI0r4Q!f&84d}JLZ^Di|5@4bDZ#3MK*b9GLnZoJabKxVE9G;aP%9 z_zV&a2C-{n&Sg;Cy!30ZzYHz0_(VP-jUWa8&}TPdS}s5#M_J~TV^5T~323qZUYGS} zfIL1l9Sjy|3tj4ZSqOAVPza2CUH;U17u|j&fM9L8tjc@P!J^(e`UVFUL@hNgTk`0H zZ{tJ^mY%jw@J@OWxI}E4#^NX#HsOwkX94&puqG4%=j~F-xiTR`;-7`zbE@b@outWk 
ztt-(psp}1JLWfR?kIc3fUW9?M*eO{8mnY0JFZmb{Yn^$2=fP7iPR7cAfhd|HfEF2( zWKRut9z#Mgy34@yr&PfnEPP@>=oFthPp6d=spZuaG!Q#Y!?(t&%T&Biw(#gy;WyL= z3Js;G{$z~;!=0Hv6<+zUkAmNzfs;Wh8nWXpFb^||6-31UlJ5!dCITiC|I?BmL((aMkT@uWr}o_0(T+J zfCTL?bqav!4h88nYgqKgsb=blU3~$O7H4m^z)U#khNqmLKBSt^&3TxB|Ma7N16?PsW5!via|85&*uZ?T&Q%2rqZ#;66Yq19v-BYlIDMZvo8c zT3`EpjZ-h!K_HsCVp8#Ip_aH4@JnXLOQ$@R=H74N!O);eb~YWjtWo3P6|I0A{P2j= z*c0T#;+NI@xc@a<{TvvpKcT}E0WLzF<+#PahRAJd;+>MTxtJ8G|MW{z3&aAYBT?YL zQgY}6JAa^oS>}@pMVt3L5#U2fNq|i`-&#DC2~Nxd>v-%5K|yaKcg8cMhmSUi|EW8$ z+WbU!AMd62y^p?M$%6v(GF&A8&0a~=)Vj~@sQ*+zs1ETChEyG)jj*PgBT&%QS> zrJG?jvz@lwf!DhUd$fq`2aZLX^Qhh#BD7=M|J1-kTlsuZZT!;{l8#&FVt;$V``=cjoL!2XKL zS*krp7VYkZO3Q8P@@ZnvesOX0ng}uR!NF^Wt0E+@6)#ky--8ValJ0;YpHsf)7w8|V zmRe+r=+P9pn|Pmd??l|rSp&(yfMo#-04bMdjjIBgATl&To2y9v$aI)RbZzF!FGO*h zZ+u&sn^PM9?0swa7V6)GaQpr*;&5dB89zi`SYcie1!lsn-3sQl9)XjRJ?1>#uj04q z{C;8N1(E~PwB{5&3M`W=_EIma)BH@`B*}r>r5g+Wj;k%QDG2Gw^14fdEg_mqt5#!H z;n@Q5WkD7dSRM5&) zZ;p6ou;A|ZGYkG0b#l8VebY^%!9d4xiF&;KaMGdm<#&pcQ}IzF6>$#gS@qOc8I~Fv z?B*7{3L1FvAzMaI8Eo$4z%5{>~DP#0wAZ&$6Mqf+Ms+n4bkBu|rZVNPQ9>A$(P)R^B z`*1tqUHQj6hNEXbnH#@*3CQIZmKlPB#CjDkQ2uLAlizat+Y9#?5YwzKmT3Lju3?JH z5)coYz3m7HaP=WC+BtImb4T4VQHGVS3OC|jDJ^W)gGG$1FJ@QjuA*7Eb*es= z{)}dv<0mY)TGHS}HQXP?(W-E3j^|>ioE%ml$V@T=)^G^VKoV@h`;~AQ< zyZ|~D48cy{@1%#@d$%b!7~PeA~R7)mYR|E%hi!x4%D_4-mX(>r!Nuq#(u-EJEt~Z zM>JNQWtb7dz`W`mJZHO};!ga%1780nyG-!93I_ zR6KV17jSnr?CszWV|FW>D|lSmtmAIwzL+ob9Uggt97Sdcn=wbn)7menYOPwUH1uLS z=umU%Q_}kk-(v8XuinqO4__s$Ysft?fd&WHj5>{O?u`ctVv584JQ4c(uIb*9GB3}< zdt8Z|(%lXMtUN=`doU%9H49+Re@*sV0fg9$-)$F(7=sFCSS8n)Ud=!(pFr_S0R0;1 zZOg&SqeRmdJL7zdw_LZ$#(B9W`d9oy;<2Q6S{uAWq@ABVT=;$oqd3jG;`06hi8JZ5 z>eV44yzKoziMsMbJ3DIUQn%%{6~8eDmA<0zSZAB<**zU)aWhN`EZ-W!v9-CC&r+Pb zVSDAiZT;}xG8(rkr-s<^iTFW$o=?VmiHFwHiQ&BWH20&Xl*|>(5DzT-tBOecpbt>_ z(RPTW_CUwOVrFMHQ<=fa!@N1Rg#7~)SW`TICVIdyPVKb{fl>xN2S6?g`GV8tYy2n+ zQSppEnA#j>iDR`>d^n-j;5iesn!M6Y#oI=>Wp!f-r&~fGL}y@4WcK+?EfMhN)sE8<|({tM|!VV^Vkvz-Pvy5eq;TMjQC 
zZpDPug6}7=AAg_5Q}rH8?E?m;!#b!xlHbe71HuxO1zVy{;z+QKGjcd~812 zk|8_h5*RvuZVy&amfUzTq+^D;?0Sgj-K()yUpmG+lTjh))UN3$`QrO)OoG7yTlRDi zR8QiO_o(?AWzLxD=9p959*p7O`H>MrST9|F!~f%soBfkd+pQ{TG%$W z5+StDej7b956l)VvK=RrwQ8#Q5Yi0>*XM2zC;2aVH*i5ejH3k} z!YqHH1!8*hP6ja!S|aiY4PvoM**oT3Qm|mpU$gFi;FqRBVq&r1RGQP2&7%pKRqzH4 zx7YIEWDijHS48Z+4Y?2VYPIY~I5(`oB&3`3frXcKyMxKYo#Z4hSJ&)In}9EZ?08m4 zHE)A4x4!j$LY`_?uAgtb-^)cYp($$rsO@uO-mjafj>~M5ahb)@TlmtDsvbR)NPd5F zfNl(x1U{*P?nY=TZ@*^ueUk2j(k}V=at2pp`eDZ*ncCMR&Xx~1R1P_|f4s9?hE${S zR;|aaye2td9zK9@+8@q;9CBN0KHLeYs0I1%Ua4~M;T4wwAN}##@K^%kgM2~jp*~>{ z2;Ru)uD$ZtvvcB9nj8K9&604ozZh2I9?TOS$o4k3Dmv#W z(^4-PX9L4+gT1{A%aMH-sbaVL^H;eUFhsca^D93VUj^22yzx!RJ=K$)D<}}m4)U_< z5j6W368!& zdSBatMqkk(gOfc{+7sBJ+y0Ymed?#wjA5vPg@I37rR|vP2E+u&5+w&{3d#4vDDdwj z2D#uj_lqTFGlea9yz|)MZF4@{|DxIK(r(Mp=Te>9FtN40Y05Jb)9w&wYQ7p#T4W=w zA;(WdjEuF->#jClF*(t$5=AK&52+T~e4w=~I_@dKM)B+mjoQcEpgVO}Oq)?=f=h?0 zaNX9UE8?py4hcz`hRP?B7Y5zLx0sqnAZlgWUp9K>GiQC3r5iO-NngICLr%04kuYi~ zOLdwO=I&Cw!9l9anvRE!EqL+kc@^<%d^Ly2+d&)EOok2Mf}9WJ%p7Qx*0u-N@hrX_ymo8qjiu6G zOucj6dOB+SR=|zPZ=2bY_&tx%C#P6hlM=K=A-5SH%Myvapvzk9oN-bs*vy}HMi!$@ zGPq50ZF{CoD?`>IQPABG=5TZ830pIAw^^olpmxwVvj+W^Zc%UZO(v7u#RplPAJxQ4 zUF~$LuHVzJ6K9weMoD4+m3MQ=@v9@O|!l`%CJ>IOlnKiNH=#Q=!8m5kZLT>JmInzoYO%;p36z#9A|DZhWTX zu+3Hpv94XANAM!fxT2Z7hC|1qjhV5hho6gzlSl#Otcectgu)EjG?}&XOd*#T= zes>TY&WXMOw~6#P%$F&(wdIsojWzkEv0KY4 z75Olcuk=HkO_jOE2M6&opxJIRBY zN%@e1Dc|N{hz73&rqzG_+6e#j@NESN)z(r&2H-(H~s#LA|+rS1t z;-D@vt0dY9@piabzO6w%1oOp7wb0PPVW2 zNfq_vXv@}L$zc$?us`4Mq6Hatf9Q=c zyuxJbOgzJjLbPxpM{oFSi)^W1V+2_VZ6qx0E{Xjx&1{%d0V={)r;w0gmHRLEleu3x zx>|jIhJ7Aclm3T}TkVXMH11hZ?I_$mLWQp4zEB57nw6p-@*M*5H*OVc{W zTs2p_m2}6%$fbdb^})|gUVHu8mYj07LT(;51%d7ax?1oqqlN^{NZt9+r0+9_q;Q??%6=eaF3GX;YFcFbA)Hr{lyx)U1}&RF>E8GSQ!x;=w94y} zsOZHG$(r>WMio+fck&wuJIkcCk`R(cmeC&9^5 zCAK$^%Bw*_n_&Ldv@biE*>oi{-E9+m3;hqZj3JZMspc8X7Sg=%Hj1x0jr4iAZIsaI zeh+~+MR*Irfjk-+6$hiX5c0Yr8I939*a%-jHM(pBACS#!o6_Hyl#;o!fOGvecQmv6 zjq?&f5Nb`0Pl0tc0NLzsuMBV?1PqS(w;!K`feD#vM)t(K?AXI2#-;qs&p%C>)C5Cv 
z*nHbN79}m)4%^QLM^{m=G+VeeMbjD1nwyU#(g#x+3Wr}91}+~`H9)E33>ocKA5yjn z2gc8&G@m1ci1J&P;CF|%zPv$~a_o8nzGlU_m27U%{++>mmoVGAOdM3GL)Sp3uM63pdmT*qH|DZB3+K7svRk*_(_kjNqK@aFIJk-0H(a;zfJ{EcSvzm|juhDsO!t6g zA@W^iNtjw1&w7l*E7O1w-gX^(quC)dosg?(QR*=*{R{>Mw<7HbW=Lb zbovt0@56PPVt)8L2;Vhq*9vDWsW%Sm$x!F3cN_g~L{JrgFbLfO5KDdeBipEfr%4w) zKm}rWAyDJ*Hu;9<-|gj>4gDZNNF@abmvc9Zb7#rr$yhUg); zumAbjOZwITsrclang4>lf|g1708*CmYWjuQq98{sLXAqu9sV9@o7i? zbeAu-WNnORE2`YCEec-Tu?eQM758=;EZBO_Y|N`+J={|8LB4EQ0<*pO9P$Ev0*`f5 zPqgmMZ~p2XS_YoW!EeW&bq%j2j4GvQt;h^v9+lV z%%EHBu3}Y@)e5X;ob(S2bH852pucw0O+zzHIfV|xhHJ1dekRXXeQ%*`J{Z8W#_&D% z#%RuvWYKrbRgx$}iZ!iTGD|gAR-Nb%#Ok~~JaZaSqbBN| zb6Q!t`RJ&ed+Au3-eD;eDgjs8q{lZtxh)Nc(z{fJ+rAUQj!Dv6tX=im(J4IW%gl0v z)8#1>qzpKL+>l}jaKy5|`%&Z%K$HW+e%Eq=d}jz$41~^|Kg7ETwGvi z48Ev}l#C>WM?V_W<#s@UD5YLk$A^#X$u#GAx3jZUmt!n>t63NaK;Z3K83Q(%w*sqz zb)~)*Iy{AjOA&j`#F+dn;IG(>4SU|jM)g8sd|j=xF zx_v(JU?eFJT_}?fUFXFCz5|&=`F8ZTrtn5OnG|tz&R1QAhrwEV(v9Ouct5KHgDEZt znc%F6L83fqFFOumetlZUo5x8T>*SWFkyFLn=Q>7l!1V2I`qtc~o)jq~w2&-bHC#L@ zM$2vV$O_BAF6(snk}TXHM3J|8djOjyFzNLnd@If`CSQ)Sb$2Z1wda~)C|lD^sIV=% zgzn}>fV5$`w#(J`IqM|`E@SpIVx@L@k#T_n2xoHt0@Nx|Nyeml2Sj565d(w^0c?o* z+g1K8?c-j)iIUQ71)!6#VYxTRW+eRF3{GkwArH8J#{_6ei*JOnFLTH~YDktW_!~(D zs41#*$HK55s7yX>!C`+6^|{6ED(1>-_u#;b{T@lu&_TWQ+xK!ItaJ3nQ)U)UZrpcZ z8m42kTM6%`)(4x@l6pnEbB*q?Crd;#cIG;WH3n1Xq^uK2eLMz01_OQtWNX4@mBL*w z7Wf?oA{zkQQyf#mC(>>{aDDB}?czIq4Rc9FOWpe+ zNw)&_STzY1sL0>Vt=p0BD#`JeZPJNXt`5z23&bHINAKCNiS0tEQQ~l6syL7aohVctj9Uw)+I@J?VpPMse7p zi-j_lafPs^JQY2(aYkJzyOKfCWo=Aunp$kNwZy%%wd5i0I_0K7YF*ebq@Wbn%I_L`Waq~y)8Bl8~w|9QcACQpR;hc6x?6_^kg4xt70*#e&tIR66u_r=#jYh6&mk3;px3DL_%#pHg+ecJVo zJ*8wp-%c-Ud>qZac?)WJ|XH5sFg!JqS3)Vo14w=9Q;lE zUpZn|8{I9*ym3FOKC6nD(244RwH_cBdt)6Ujj%A{*e0<-W@M+NMo(b|A3v?NDIa)g0z|qqO|oc|62Xg6GZ?0{W<^L$3otoSju+^a~;Yj~$rS zCpI*0p!YulJa%AHtKWi^U20uLl#6B`KbZRxGJVQ@&k!<{Z-}GXr1dyLZ=_9$n|$5c z_{zK=dap3?J3DUNo1tIbAP#kjjLy1ra{Ty^{5bR}<{TYG^r{#kVX5J}3Nuje`oyg( z_e-jJguD+PGO$ERcg5HZuUwQbb>#OBS2BFDyTHP3_p@-{bs=&1lY_?EOzUk=?Y7%x 
zNK8DHZ(GLSVqARIZzZ{lp+@f(S8WdKMr%tOotN2F4&hL`Oz9{ye>+fohp-{ z9?!mKI7E(=`hz=^wZyEsycv<-zcvlw;j@_)fvDb*->SbpDDh!m?NGsx*=`}oQNCK4 ztgeiRZW}%;gwJd3ORM($d&h?;n9wWjAWS+5iPk0tiM2^Fy68S^Y&z9Kon7a(kZCY{ z!1bIa`|UZT1NsD7K1e{cn`QM(aWVd$YSxgVRJwenbVECQZMF0+ z&w*cjfl}4BBV>A8><)ogE87}j+wwC+oyPcNR9dc-MjeLW#M9dcGI7?_SW<>&A|F_I zJ2P69Oqau*6vODvnm$1oZ~<|OysxI)GAWAB?a+nTMty%LXQgFSu&$OInlP?ow*tb_ z6>7QmTi3E~zkDeLa`nJuf)OBo&WuhAcXV&Om**r)H;i8|)=2xMAWpegj-&RJHXi4Y zHgEKoEx%_rXB*0jR$-Hds6i4Gvw~!-W?E)t^8isAOu1OHm1d5o0m&~Smmv7@y%G&) z%nvRi-m@+K_{nla)O?>5ga|WY1SG>U8*@}e_Xv>jBh|;CiJ&S~QAPtQXqXIYr%_)ehES^K9l4i_=}$wiVORH`2ijGL&hKo1XkL^us|- z{*?Hx7jJxvR&$LRqW`QhP1!m~2}V`ACLLNwIMtAu3RLkzzR0wV_7G{g3nTOhU#AOgXj z&VsLqU!Vlg$(%~K`s^S^`U7_4C7|5XcJupaCc6y3jENq+a{r+Ai3O)7(1qRY4N&*l z;Y45;Xk5fR9LO4EP}&fp-W1>5zAsWxJ3zj7x&7hD*sGMi6q$aRs0j`48{wbfCJ(s} zlP>gND88;z)i=qt9}ExW8KH^Fbg4{N4Sf*B&xoV!tKKn^;egFKTi@!IEW7hAsw)Dc zQ1g?GLir|K+isE5tUTpA`^t4@v20u*Mf3WC-b8XX^Xt0{h9p z3A|WL_Kf_|tu77TYw1j{6On{yRFh%px=#k=*-2d4>7{?Rs$_FNGjME;8Xch#-`dp2 zX7Sd#GzM|f5#Vfn02|F!>sMQwcBTKEBOOVE<5rQ{%Vg-xM6E+vIu!BPV&vVj+pZ90 z%P=J(Vm6kO!F;~eXe&vM`I$5(zKwb0Nu~5getsvxiNc{`(e$mR=lGGh13=+t{m55c z1s(iOp|`&C@+B{#DUsG?koRBuzy+!>*E6_=0e){ff%)Ts<(Ao-=aDNFbZhc)R!b?% z_T=0P!>#n+YqB$n{_vfnxbNiHNE9JwHT{iL>_YZl$1R+ysQcDg)0@H`J@10bg|0(f zY@;<(yz1GEGduxDLHAB#6c^`<)ii>kTi|`6Y(4AlzbsK+oDmbrA@@X4mRcrbJj&X} zbSrAZB`tV~KqNpXSb3z%ng*-8KxuA!C6Fl)JuCBjW|7QmbY9GR_42^M(AHxYfgN-j@FyW z6W(eXKkKmY@bfn`Lj7UpMZ!Mf1>@t))=DQGTcqXX*URI(l95CH=>=vfcE<9uzA5I{ zWjU94%1ycIi;|#cpqSr-8Mu2OJ4U4*xcf{Y{yEo?q?fQkw%{>U?gYG81K}%>^HuNY zj>KP7@4EWm)%ZtnfZ^VqV%SB58n*wYi<-L|p%J-CMpW&GS0|>vh4CC1R6bLSf0rHE z#o@9ZnFO~v=8>oK<7ezGq4k9gRLtr#9lH+%p>NUc*!P&{_j8_&ILKRXCq|kjzj(LM zCLN#t-2JPUNq_~mx&RMgk*E8T?1UsLv@ge0xl2p!zc|;rV6b6M-5s6ucz_E55kNV| zp&&uDJK^knzxSPGHu}5^TZX5gdn)8on<$)h%xH7NNR<{KR}Z_GCz)fXC3jht_J>(@&Cpn0ljJ=#TYET08A~aMdOkT+Po9{qe$%X-! 
zh?9e?PR8d)?8GiY*4rJ6aB_#W%CZoQ5J3V~vPv)-_7MVTzK*42K;2lRZ^h4Jq;Jii_)1xPD zrjFV2#comm z+(pSNilw(Qu~K@w_7>4QA$9YV28qZ=6$1?j9?D{jGLHdiPN5vM3futn*+KbCc_y4i z`~w%%+(kFv&)ov;su`?sl^dXW>hmN44H+xHtmbXRJ^6xzM_m=)qW9RTx2G9Q=2@xC zXqedhtkd=5Lcf!^bwK13!4#{q@arXBMXz7~qJd>BUjUraoP5ncZaYee3~~vZ*HI%T z(j}d+)(W?;3TIyc%j*22i=wjaaw<^Z4DMf0({-Ex-MWK^oqi|` zu&!pU&Sj8r^Yg#ZHmf4XVco~>zKn{RZnHMoh_P}+_8S4MOe~&FUo%@MN#;^*ho4qIKccHz@QgT;C3ntk+IEsvpa>(w-fnSA>sI-kG=uK;Y&nf34DQDo zb}^;fX=8PUw|N1j0;AN3H5t~rAImuFs}!~F!dc+poItbh?c!yc88o`xPVi;S&TiQ? zhS`KuVhgT-P3*ToXDlCG0dLPg&p>~ZlR_S}j-U2_SaXHg(ARM&eRun26HMCc_AL#^zsIm_Yq>Q>`f-jieQ7k}v3Dpk)n8h>_u14QCjrlcE9@k&0pO_bE~ z`#KSw(XvXj`7+v zWJ{9lm91gV;@Im{sAQCot?Vs(Q<9L7l{iFY&+PrX-d25Dzvp?LKYY&9_kGU$zOOxA z*M0N4+|ifyap|*b=`-8~B+W>Z!!VuTG1{~fQG~lDfndQBo{jpZwn*kN>=)P%D2jdX~ zom79MJrvfLe03&P+nX+0&vXHWQq=THo3bw3-SVc{J6!0!8IzGhJZERWcj($%1R>M& zVI}is$IXhxD#GR^K`YaH2@@}P1R-O91Eh6MLXn@lrE9Yk9A}JCj_Ixdrg7iCF_Xsu%c7tRI)~1LLjKKzWE49&r2gT;-CU6rBYkQ% z7em@Sl}pw*&NTE{L|zETzk#g8z^#N^EDubZ+iu*4=9P zHLB<4dlX;NL`%5)lpavh_+Iv8K`6Fuby4Gi5|tjOrVG*daoq2#wkfSqP~)xnswN*| zl|;SrT*aD4^UMp}wN*!1ZRVBWG0w=nDGYT~6fLP+enoLq37}_jzC@A}j1U1(VeZXu zmLp|S$4Bp!hULqbh7`7E)V1sjKc*9LnRs}$bR}K0?50NA>P((yzMbNGt$cddLuJR$ z&4xW**7d%P^?{jmtaTcPcL`d3` zL*fo1rB`Ac&I&unT|m@3Q*(i=K-570C}Pl=BG8$lNV=7OG??7l@CR^X4|7tJkjM>w zt@fubPP3=VEP*dN($K@7a7qkRO6CX*S9FK(_^7fC%%8tl<7hi|!(d3C2UQW;`sH%Q zw-4X0BvSAxv+3OD5`Si1_&{^fX4%YAML=uOiJhhOaBG;kW!caNzL>jBwc&hL(l(<) ztV6qFYRUpNIQW;c2MeY}?=mOEzmn7!l?10(eNO~+Oz*)B{4@cl5x!tfsig}%T!x<= zm1C|;SzUdBRiW~>)7_DCZty*cjAh-;LXkJVj;aCB&>4)l-*`yuqIeYtY^Oz>B<0@6pdWfdkQZr8l4FG!G#;sTNazt%q?4gW1Zga|Ab z*qM}@Hz@vPj6OdD%4+YDB^W4i-9!#XnEK|>Lh@I|f`+8;m^K|C?M}=u;$iM{FE=RP z`u5guzLKA_;$=mDjLOttNOC_E!p^ULth2TSxR+$d*p%Lv+%Iue5Al+Z(xbedrtv6# zoNLk}U9AwT`v8q!1PZIKpS0GT(9x-0c19<6jDI20HT~g}fZ%wwev8Nh7W$phT+Dal zL~XFJJ_$WX4ZOuo+{dSEI|RknxAXPXA?l_5iq#>gx|`YqIKm0ahi$o;iCOCf({}{o&#l85uc=n znorz7YOpz?g0eZf4wxd_aVhUOMFfOecs-ZRQ6f`d?ct!kmeyH|Ms1{Y)yFYbh-zwC 
zy5WL|x;fBz#O7LQdwOt`Hq`PK$h!GmY}_F$Aeay_qGwcG4@UoHd@g~r9!E1lzG1J> zJ(+_)#B2Tr4}SLZO|*Y}+9h`fGV80{Z1x&Z-tWq_8=5WZRjwMCb+t3mK~OM?j>}Ng zT7cq~Vop-R#RkGmyH<_IN=%me4w(`Jp59t4{s!VkM5C=^T!Tw844(0a*DKHyUsu&P z%1uv0(&dZPGEKUT6m#&&xd6iEL7c!KxyOZ13iB@Zy~?=0n8fWfG~Hf)(06MX>Cl3C z>M5JI0OcHBJm^07?ye=7dX%eg&4*a&gG~nUbBA{Y;XRo+!75$w!c8z;@9K@CEok93(4U_&#u7io4&R6 zV~fL-s7IZ=XB<+hswjP~EP727Ey-umk_RrcX8N{GOC3c-ybP@SRFMDdm@^oev8iY0 zz{vQ5L@$Mrhp!`}{!IC;N~=@0G}Mdut{{Kd3!w)-zEwTd$m6vjymXe?e$6H9EA#2g zwjYVJx*k7w&pIema&`E$(bRjVib4s(zK0flk59d8Ys!0jZ<3nRQT}DPtWd(3TsqJ7 zwMS96j&fBU!gR?#W|B2NYY@AO?3>**c}3w}f@a?HmreAHX}DZIPekeg^aYdS(mqkj z@6Q%+Rp?Wm5$@b*?Im5C z(Nj5v_6qI>*RPNBH#*K-Vy|r=Hn|x0(5SOCUA&g)V!ChOP}}p0YTzmv*t|Mk+!oce9%v%o7XR2KO*V-5zr8*QDY45D_$W&hV)ic{EJ0RKP#+Qy zu^;M1>3O%KB1+zSL3_0P9ByE$;4ur=jEfsgIGTtp#hf2Aqm=wqHqOWR`MDhJkK!h} zU7hvSL-*c$@Ko?-F_FtC(sQN8Z{mqO#ID*DCtQ( zOH<6K#Cq?SlAkTz^ zXJ6xD4ja;O8GSn88h>8mysoEPEZCZp-)?X~B6=N+yo=MkDa}8Edrbrbz?JsSSO)mR z%Vd#wB%z;`NrVzl5yjV;(B$RvM0p66OytVkdc(jQG`j4wc351`&`zDOvJEjLhIguQ=mz zjIX~oE$br)N~v1eBipoZ?|EebK06QqE<9nwA}?PZOpX1T%pQ{FDrYy_i8`TZ64sXw zweyR^e!WEX!_qq+nx{`Ir7yLWG1XJyyn1L}?bJ|2FzkIuNuoLR0Tx!;%8QGrES-Kq zLI#eZyY%6~wiruNLPEfrYw?wo$=O~g>J~Mjr(8GO=12{c+nC}5pk%6s5qEgy=;u$} zkJOv~KZvAU1iGC3<$HlX{!shy#j5k?-wxmslA7{4)bDUK^H>J~5aT}#Cp{6(X!b$N zxhsZ3Ay-L;7-d+O=@qob3~_#A_+?Dr>+%H2-K=sQrv^}@DsMxP% zdDPP9WQMcs!@Jf`qj}HcS|C-2w@<#WM$zi^_?8*^ENrDtyy+0!6{{OP5gShG&8@q1 z3Gn+wzKU3Rk6YdhMaV)u2tV?}2g|XSI%$s(v|MVkMoH8|>aKE2HVFcm@B+r2LQ850 z#Q*Gwe`(9t)HsH{!_9)tF7Noa(mut-^WNX7%O-aJzKYi|wy2ec4WFwlDh}Sv`Sd^)x1D~F0BTcD)G^%!g`9)}tCbR%9RxXn59X`w|cQrP$ z)Qmk*fzisa^G@$(^j~=0b<;hV*KNjK9s*ZTQk|HhIXxKT(R*uXh2}$be9ms9fBgEj zY^w9}!lhvH(ppRQp>SW#PtqK5uAeeglA|EN(i?v7kMGhRT%OS6edVkiX3s)T1340% zN$D%U*L$%0(lcx|2A#3XNE5Dl;hIC;cs(`G+9T0#5<_VM93#?33UlO`5DnKg7Tg+_ zo{7UsWvC>ICs9FJ_ZC-`)`Cu4r)rw&1iS&O?>chEm6n&D*=p#ClBKIRoDeJ? 
zkgkVz2hwh})(H8ocO4n(urLnQMz|75dEg8$$ICVSacF0ocF?c_lsEDe9x2Td;4Gf*79GI$>&D5=S z!FXpID`2)N^ZZ7%Zwh(eL;# z(NwaA%?I**1YOvBkCd3^FU zzm7EKq>fBy0&is>ZBaFp^rhdCn$skUCJXF3`H0+v_*_XEL7{T`d?rLu8khE0k2S9r zIcHpSQFp$W<;new;cF9d67}rIm=)7@Kzm*dmq&k6Ortz=@k0z2DT&|t$BfcPup}Q} ztXZSYC~+9IkdKr?`gWWFJ#8I4>m7tqx)9)rp?u(uEj zpnnPl5SxtH<~-gko<#e8qP6toot~>t_*nI?wCW?Mz>2`@AycDQ7Se81KWWXJIUIQr zGanHD)e^H%*O?#7c#6zAl$TqHImofvQfLd8nlUEwasr(b1E`U>VKU~2USx$^b6!{F1#|(cnFRz`lMr zO>ENKX;o^@EV7UlayZ_$7IO#eTir>aIunS)_zrVBWj``I7aeQVUl3B#KB)2@s!|^c z>O}zrakWMr6V^Hx{@~W)h|c7=j>ojDRF`JGqz`2i)2Ei>I}*(cG_o?Xu15OnoB8)# zZ#&p(FNIz-4Hx8lok)_BJhx^2i!OUo#D0sOOU^GO?~{PqHi_MNIw-T>wQ>4hCJqKL zBww-octQ{i^6YJFYz*jZCo@4Z6B{JFe}ztaXr91vE`Xpv!LRkTMJ~hH7ywfK&_H zm5g>EBTe7iZ!9lDruZWn!V@$a-*p~VEvw_noGU-5=%~QcnHV~jV)sV6`^eEMi-xco z)6di69cz~+ldfB9k*UYPIKX2rt55S$tx!!uDs^9nbX;o}2zkB&%0tFNzQh0q?t(_x z&(5wG^f@}NehpNJ`?_WxBKgj0c-%EsxSgFdK0z_Z?P1~=$MPPb+AzG8hT+YYuk{iF!bYlfki8li>qMMX=$0Ui%OyoQ zM)C+E;M5DF`h&SDY-fYsg*S9kcf>Ox$)W++qJ@#hPsZ1w0Ep7$^xA=~jSM4t5qq}& z!-$nfAW5dZdFcl(g!VBia}lndnI-S3kZ?BCF*4PX%;xJ2tF;)>z8ced6P(uQDGl3U2M%L zOw|u5e3W9A^_a9ZKICWzR|?H^CG&#*s!UxW@0U?lDJW$T zfJ*8?x$7~9I7fV-+{=^X{7nSC=9=GgzdD^RQM6*>G@SgdgDPWsvnP>huQ7S1sQ?QT zAF5A%IrTAfib0=!vXSILF!wn5r($g9x}2*OZ{=S-=aa8p?4-E3SDVd*o91*_8wM9a zDI(<^sM!@d8hN%V=8-0KFndR!yb3QygdOy*nO~nga&yOqem&z^Y7qg#N5Pj=p#sAG ze{5IGe^M9PQQ__2qrrg<{c}~_w*y=psP{HvZf3PbwS;P^1q<&iPxI{G1L(8MORaX@ zzPZmuFC4AtdUB&0`#k%@ICBv%$YJ&tkwB;K%ooZNkFKbHkkQb`FISfGuJl8PMvB9?+LFjlDdAy_(RY&IO=%AL&(X%+#GfN(Yo zW=5MG7HXIUv*I0u80ab{8rS5g@7sJMjT__WGeq82;x7v2ryvMFkC`y^lyJ9gF$ils z(=+9v0R5+Ej?Fdxn8dFew``fp}d(#XGJeA3TxrnmRn?p~&oQaN-vLlg6f3z9z7cg$m07T9@3~>(dKsy0n`^y1BKv0|+2*0w3KVW5r;snmt$o z13!{5_cQ1~udz?Qt@m51uzp9zdm{wXqr9`Znx66s4|B@xCSOXwHX zjJvP{t8Cx)sDw*JeD;3w)~U*Vm@?teohPXCs{uq~O~WeJG28bKiSxSq{UdL@e7q8J z=#h|(|DpB>6m{Fj6OTBIKQPv|vH0<1-IX;|K>9`oBP1f>qYdu=<832GaDNJ)N* z(*XhHWaD$$Qb#vx>X()R%y@97QK90)=G6OM0P-pg*mk5DBBSIfSLTWm+~Qk&38Ga_ zM5@A^I1|i2QqVf(c0>#6Txx4rrN==kHWZ-eqeUW 
zn*=5U3Wb+1*mt!&ny%;n_Rex)X}0WpYf+DEr}1;_NC+LF0h!O8F|#7O`)i4B!m`zk4`zjKKv}xY0V1-E7bHH#0i@R+Y2MKJpDk(by`C^GuTNbW>>A>d6E zBmNpHRVd=>*p^47W78TZJ)<%sb8=RbTY0Yv%79g zxNk1|JZbc=Tl+@7-f$qRGWVJNx_YH!2{8AYK>eq4<9CS=jj9J6;8coG^1m8J#K0x{0^b=i z$m(7Q^4woHJ62PO%+i#Zclr>U6nrTT$J9O(dJip#CQi@fsP7qATwm(z$?Bow^DVmG zrBzEEqmZY6)j&48KhH!PaG%WFPoI8H4foQ|ebg(Sxs8VUdTPmtuCQx`q=o(luh zByfO82omU%K7iVxaJevef^@{^{$D(SCr&f#6qCtBzDRt1?E)Jq3OKtV zG%wQ6?U>~GQF-??r*A6hugec0X)#bMAr-kX0gT3BZts&za)$0qM>3^RpXsu zhiq9_O-)Qf`<#E|;jTl8drHGUYHG8G*V6KSgkc-H&*WeZihB<%7ve*m)Op$TFra-9 z3j8k{GCM;#!Om~a&iTl`MS3$pU2B|H@xuYR0+ovkYNCtA2PoE{&H-U>rrdeY>{8zKcXP`b)mGcc0r1+Xw5L?xyJ9(v# zkfP9t!R9b^`anx}h+Lk**uuAm%cw6FCt#S+el|FM+1qUeDXMETK;Dq-Q^6hzv!g*( z@8h6u4i^y3Xu8!r`3GY^cf&KwOUnW!j!+#F#Hwm#O6Y$|*|PqX`($zumbNxIydg?t znlT~2)OqCoA!j(`V{o;Hm^6n)-74AFz2@Z~3n=ftzbXG%b7iRBgbiwHpmqwFtmlx(gp4*VSwS9Oextp^uru&f$ zCQecc2S?l$#W}oQW<{hD%B)&tNAvV$-yy)$zY40I5|p+j7FJvUay07z*;V^+@c;$* zg^Vn>!n&l+&@ZUGiH<(O$BRbwySov9q=b*w6CIZ&#n}}Hm9*}B*1qpLZ+mZ;AwNI3f2p9*^=f7Gbx5s= z)L4NR|Em!T;M0ZL*NotTf`z|7Cx!Id`am|a)A{`JATpjBRYe|?vCPTP^tVBg#e-i6 zOo@>+261@sXB-*hC^7g#91hcAo{IXuAn!y-0Yy%K?~kW``!vA<&$ysEwew&kKpW=O z+h&Hx_rg!i@wlxwH;edPaKhZv~Bq2A{oub~g|-g<2$UG42aab4pS4TR1l!H`hE&&Wlg^2TpK zJ39oW_36o@RRLz|_hRcORBTCQKGIwnHVtmS0_7FTEzH70gXhQ(OjrOlR-IQz!2E2p z@7GV72=#CiE*l}ocNj7IQeL3pKR&cum(;-l#l|jhE@;Zmmvh}x>$+}tg?Tcav>owl zB3o8N^>+WX%a6#$y@~60tAk$e^n6PJn_vMt+gF}$K+aJ5o5i1h3~`hWpY+nGdsKw; z`^yNZX8pLE?0Wry{p++a!G$>jZ9w!Vwd)|QHvkEi2f80_Ie(R3z6XZvzfai8xKR^Cs`J11- zkb?1^8fzk0x7EGc=t?LU*VOj{?c8+mPHHWLT921qu}$Ah+h+)gxnTxti?VKcNV#*C z>^o)^NxmJFLd?>d)E<|y%h|rQinnK$gJmzh!mz_dw;4! zCXP*Vbv96i zt0Xr_I!wK+HW7;mtnr%IzzG;>h@Dc`(O)k|{?Ok4d=dhu-0$f*CawX+6LHV04Fo@W z_cr?voHBKPr;8Ru z$KI$D-@qzHVmn<~>@UbN*^92Bv;436XVlK|TI5Y9+T= zVNwd?yqU^>vvM?mqqes7=%W@L0Os$1A~(zT9=L~BK}A4viUo+7vL6Lcxs}3?c9Hz! 
z^}^-IA#D_QIKnr~(Rafa+H&mBCG?$rgUB4TDJY(9Rwu?7H*qa_a~B4GHYv0S74LW& z-(lU?ob{5v@d(UpBTac$pZh=C&Hqpy;j0V?$%wUjX&Vbd^%Fmlo-`kdNX)IFbZ`KI zQEoV3ol`WLl^;*|i`nq0I{;8(b4wFs=+pj3kyB?pir}O=u_FYBK-&6uP{bm;_u1jv z`DVRs`h#Gk|&@-7sJ%s8p<&#;+>#S!W$(a(hqW=yNY*Y>u)UyzAeJ9y2JaM{?<2|C#k_u;g&r`qC1z=B)AdMmSr{E(m^ToR7Hh zXNURIde>J@i%@8%r(qN8*LT5Ly(>#|ZA{HB8^?&WTN{+NbBhRc^Q$5tB0M{V&vVg~ zvFdO4V<-@$IUqbIZiJ?I|E$crkb)W&6%2>d@53cR$_hKY6yQz55=flC`yXoms~&8_n~$M-Ai|voV9e`i5&U^8fAsO;&Bot@kd)QZC@XG z$6WZfESP7@4ReQEgmGiXziSi10*yOV-g&wDAeqi7M+i|D&P445hltE)mEQCjS$D77 zQnba+4aOE<1iDhvctqxi%GGP{hBJq>`^DE0m^3&9fzfETzpP9N^b2Q}I{;>SLPmjVhbIHt!yN-lVcCvPqG$FjFS=@P>)%${Y^8F>tO zq)fn8G6_fE;XTD4qFD#g$7fgA4D9=fv1R1`eo}IC*euF<$a5q2_ z;&Ni7?BDY&OYeu^{`Sc-6(px^k)tRV{_9vm1B!{#UN7!{6p7kGIIAee%h1CilP@Y% z{`+>QDfPEijX#36+Z5h8F91kxCdGPE|JdqhHP~vz%v^s90#cZcCOW)-X6Gq7ZHhbJdXlu*?}@V+=KwKTcz&VXyZbt@|*9G_wf9#Nu$&hM zL%16#w$Yq}2qX+%L4dyFSudH|*Tm&`aI+~)Uh?5T@zek^ZnF1gJaEs}I12y>^O^8b zM^I}PIfw0UFGmud{(UwSAYTD=))gUlc~82n~#UhFHe7N=l=Wq@wUQOH5eO+DCr{E2EyTt9Lcc$ zX7Rz|EysQDlBoS(Yo}oz|Q^#)V=j6?b;TOgae%_ogcXKnt0T2m{e;>&+#~@I^8aO|&BhHrU zhyMY<#u^#K080LMgvP%}zfb@;IMb2H*RNmSRVz4bg|s+JnL+7QWdoi6fy8vHLSp2H zUE`Fkj&t)bX~rD|;Q#LqZuo=Xe3r(37T$LW8c8@Bz4c4QgE1rBgy1?O>Cb^uj3dUK zreB!yuRI6NAgr-u|;tTaFxF zTf8R%8Lm5F-`~cN&|gfmeqh@#FK{IRIpBP0b}KqUD6Dj87N*CXTT{R@l_KeRUVRYe zUjl#x4)#038NMJzgmLk|?;(7P`%HnG$L!rSU`hE_fskq5vK@l_d}W}9MuFPTfyysy z#aR}CBj{w$3*NtfAAf2$@$7&gR*5Y;-;r0&-WiXLpj)Xu-yBG)$=@)|zyD7d%K5m> zvvA{>xFzK3dFne9ivZqt!oC%BLkB^vnK0G3{A4EoW3OQ~;m(RaTH2e^oR@H8zv20zmN@(5g!}+=k z_YeM*Hf@{8&twQ4dk>VjkU;@&^`0RN#+o8v(v4F+|6sm1Osg&};-{gm z)ItPu=Icl|5AZ4cYV*rG&*3bNXH5C$qwdNY8I?&lp8kVoy6J6;Y2Wb1X^<~WX&Ar% zRFFtPur}cgXa4c>9oUn=4!eE)3?v5*12+D#Qw(;i%T% ztjgALlbC$e##jQlQ6u1ypmVV)(eJ{wEuQ@JQIvrWF_E&@u3aN7y--UgofRjG;P$#t zQ{m3M+T!NNwd>&lpnl@jL5$+_4 z@g6gOL$v41ZKGW^IPa)r%vFP5;Cs5dMORG3=-QW$y{tApM)Q&8E%hKN)~zGCTAfCR 
zgPPKHO2*n&lDd|xGi__*#&-10JJ~)2i28MI&)lsB|xkN&G?z^buUf7F*NlWF{q^>dhwuQ0xlFwjT%?rsQgv;bxTo56#a?gnwu9cRa?5Y;;>J~1*V%jT_WrVV;Ll!Isd=9pU-V3?7?hbhR+Xbt_bUf$)k&-5V5|_E~&Pyt? zXzbe#xZKj!ICwXp)DWbnv0wbRmu!`4O9>Cl@yEeZfN||8#>uYzQQMgIB9c*<*rbEZm&t)&+b;4Z~ z?R#K#hHxW!I*mm1U+x`%-Hw6Mn0ln)%+AgA*Y>&CI8a4Igjp*G+7IKDuIS}2#=GLl zx_qy$VF3#79_;^R$2-ug@T!^gc;2*&a_m)U3HRY*LlDGEBOxJwmDCh?Ak$nn-3uf& zyQF)SM29*yM>5kh34g-c`0%K#EOl#mh)lfh?I1@u-_GOvI>8O5!v+~#6%W>XhErd+%DX$gPoie`) z5uMzF@|OOliLXqNCt2r~DaSpdX??^5dDYav z8FChe$;A5-Ub#&QEcJ7TyMQRnC9#yP|tsf7*>BT_eh&th5du3eJ0s2scY7p0Qhj;Yqaeo^>&`TTAn4To=8 z#4d*-TqdGPRwwEZtN6N6^Fow5IN!1VwIVr5O|1^EIiG9T)Q*%x(^T_2`B=zpaz z$y=!??u5DeZu}nG?&@U2mi=JqRd00cmv$5OfIS@K7MJ|>tymtq9o1ECQ#l*Rl8`5$ag>bi)JdS})tQbzTsXeL12hddy3rl&`z;Wgr>pqd zPkYZWLqyWt$Q@eGc-rNJ-Gq5?ciS%vgXxK6#5jMR?ifdW?W?01FW?HzH+Oddy~|S} zK=S^1t^eh?AzXEtt|wQ4TrR|L`kSl#9mN@+E*j4%-|l)>Y35_UebeydsnwVDfn8HM z-S{hgOWsweooGEE-z+Zt@TYyCBUO#Ze_i7CJJBFk0-?_c(aJE@SSe1)5Nt(*_J|(} zU+Z15oUWMUJ5%8Kn=a5PqPLVx9rHuF7S7QxkDUun6IP4GSHhiHUF(rE$Z9uL_~g(m z1p#3in6kphJnY-={^di125_S*mwVV9e@G7#pH6c@5aVuw?sH=uTM0t>L$)hW0fN%SV>2=;3uQezy@x zE){ToaV>smCSfiRqSx)`*FWIV^DhiC^$cYVDtHw|by<|wKM*#77p2NmL7aw$tDW~c z?YtuR-!J_zI1Qy>K$qe!q)@{B+C0od1o1=UEo_zBcLr}+mrwf-tbH~Js6d2TbiRN5 z?nKdeHgD{UU2yF-&%|Ec1&Cq5Cbm(i76Tq3!4r9RO? z<2*O#uM4CTLK6!uD%(vqvIkMv*oH(zX}fr{6dw^Oc}1PzgpHF4(o-8pPVl>KZ*Fc8 ztX5l)OkfU?NJ-{mfYNAQ*`6K^oi6RdA*|H-sc74(p5-rd{vN39t9iF6D;$BX8Yx@h zAL*o^)(L0FO}{pBu+L2&0FFK|kL<-Kd3zBe;b{HifA#PKo%y=6r1J|B#sqO~(0O!DbiRUTidmDZ5*SI|ElLPiyOMz@ zUZ#4s=QrKnBw>^guzv;aVPR?S?vA`m5+RhbFfef{Ot#mz_`&{r^d$#c3F(`iqaBL- zHlqhG&7|kQiLw|)0^uO9w8h(^>q{n-%^jsL;LPs>a07DJsYTO(zbS1w5x5PTduB<9 zOE_%d;dr&AaCh#ekr+Si5&N67TRef-2o<&wFvte3?e&!!%Kfl`@H}BS7-yQmp|M!qHfzsYnkh_YU*9~#cI=W)AGhCi(3 zEVvg!Q^{|)!;^U2N_6l4LLp%b90gH_dKuY@CV&5~ly=eYaHD-~Iur8|;i$_%D~_c& z$Zc~;Pf4o3le|ptc})K=uoy&4|HYxn%K~t zLo?EsTq$PnsY!D$VFilhrRJQ%#<77MFz>o$({F;kB{M5vT8CYItu&uSY5Qok%S%I! 
z!M~6$q;s%!E6$g|4gNi<(}g=>H>k?uYMXUg>SF!jPqCnWY&mt}Zo-kkyTwi|lcT?o zW$Udtd|)7l?>bNT6AbNjB-AW%p^3Q6n1d(N2E4)g)!Igpqft8@ysG0VDN~!Tl3&RwrTRJz#BxA7)fUE|E7U%!nKGr-=SiY1RHVRh=fug`0}fk;WkMVRG~JhKU^Jq zg&{*SKP7cX8zYygiJ8q_EbLIi5$2Qmb@sXYCOY*?LT&5t*7u<+hi&FDMPVuGaOab# z8Y>*5m27j|V+Z;+aFoD2;=^CHksluTJ|fkfOxir`=-JdpDduoGrR^(2M%($?92|8D zLsAYCQd1KSb4`V&VGaFLhmd_+(>16JSTx<_-pgsu=em134vEsxw{Ua!z1J|r8w@C7 zUz96|(xxpm+>3+v6}cN)jI#M}GKDvdRK!$APwJmmUK-m7dMi`Jo{i$+4d;9@d=f#> zjQe5el_hG9{i8zq@QEzFb*(@j!D-#Qkm{&&UXAH#ezlW0r|2Uf#amU~(3iza$rK-I zZHf29W+p{-nR5H{9;{)^0NjK6z6no#mWR9k>frH~y&zo0G$1J;<&b^-!3qaa25+mO9@6eQrDkx8)>qyGb)Y_3Y)?deh+KmwA#_}O z(?p*htdYCJK`X)F%T%5-6f9oPN{|Tsp?gxM7CI@TBXftA)+t!qsERo!B7z6p$~`G5 zdn#*#bTkrockXVmlR6Y9>~E>#ZxS8n*K=y_i85I}6N#ApNZ+`jk7PiA4sz^KOoNhV6m9QC8^Ya@`@+O_9WBuNbe} z5XJ7mE}|UZdur=i3GObOpm>Q@_YFCyPcB0D|8dk08qU^S8)DUz(RD3HV%(>~QRq%?%ZpWa% zd>h5v6Hjbi_sKMB6vdi%g1hMoMDGRu>VnqqYBDBd_?yJrK)p1N#b~ON@*@OO>}2=j zAH3e}R*ZKZtruUPO>ZSsY&$)+it&@Mz;-r!_o(s!ioQrs2=0q}j(UIuqG%T+J4=pK zfx;|07W)MK%O@tipHI-gYJQQ;B83(|z9C&xM)`}TAzI`Q$ZY(P zKwo%&0^xMaiqo@?Srea+99PYgyyllSMt;ZUMEao$yV@a&2C2a~lUrb#Yo8Gs1ILMh zna%^xx&9g*sUgu(i_gpb5FI^L$;)jxw9_$_)!Z9BBopDp-gBcS+O|uL*9vpUxr#jVK7qSpw&RIQ6~%FE?}A9FG~c#_pFNPC9BT1YD`P!Sv^x$ZC|N`wg|HJ}W^-3q z%Cn=oRxfHuhGNg%u@|h(Jzgmqx=1|LSB+;ekvu z>2yM}=3$hhZDGDLaodvETGU@lA*t6?3)xh@DQ|8S3SH%6n7@m zSc3h*%^Gd?Plo$w@M8G0+im<#v7OQDB075+D2Pr)YW(gkQ8;)hBp)vfn4QMNr|a-! zTBu7!ztp-MWq_S5$b?zbv{}Qki%fOM#Xrd;88x(mDsI2;AbFSV2*^x0xQ@ENDLquG zw<{eZf6`|LbGny;OI+lu5RCCW2y3qP9y2Q07 zmAarXj?YkU#y**ahhPxvsywczAe-p*po^?s3diFpN%z!ivgBlKnP%$u+8GM}3(teT zbLY;N(9lp-;(i}umGE@ChUZ@Db&_sS7C;e4#|z$w{2t|m(X=N>d;sMChCjVPzm&Ab zYGhTnI+s)|kdAa9zp^!`__3vB)veXlVx*mM@JwZ+hNMnWz|FH}^<8hlWhnxOBAs)X z*UjG_Br!@1cuX|8n?EP2%PrJm5Xa%9AC;fsnezvD75hg|m;97TA9y0HHIQ0uWW6uV zyjY}1{UjE)8n8kSbGULJom>>~2Q970@n=ba8akrPT`Pko&KU$EvgCY-Y;KWUFFB>GvyAICQY-#M;}8cMQIn?JKTK4=>ah{ov`i! 
zZ0Xz1(y!d*Wy>uOIKE}xFkDcxj=B3O>D4WH17+Q@SpX=nNMwrwDfiQzf;NF#Y?L;Z zw9*lv!@4ITs{E^~at60HEJj(YDb~`3^eQ8hkVF&hy3df5?V&B#-UDp<#-yCZ;r%lv zW?(qh^)35|AC;yCxsR87GwxT4%DDG!HNAV*D|782;CdZY?m+egpz1Sk#ot>gn=0#< z4iA);3_9v5JB~YXOiIrl+3 zL)eytDa~}9m>C(L;TgfKv#{?cCrN~+ASY2_dwJi1!S>21ME>hhXCd!jO}#eA$V_30 z8oP`f`|#{>pY?_9QP6G5=w`&K^!z-$iz0~2ya~1U)|o3%pKHhC+?&D5!q3&5JJJ;= zl6Uq3HJ3rt4)=A^qA%kJ7U$!!YnN)Oc}Hn8)+y}5!mb1}P2R-3zVyvq|5wms;emh; ziK{KVhkbSG`lr5lvN&$^{H85ls+`}nZUeypuy-@tMaN_IvJk6=Ia%*&1Y?_d;o-S{ z`V7f?rvBt9WWrM6vU04-HQJxj150^h}sK`{j>7u11%h1eH_{E@WC=HytQt0DFEHY+}Ov8s6ANOo$M_Ua=V z%oQ0w_MXcIl_4)C$*LV)D?w3RtsPg9hV%wgU*ROlguCSLfubu_vL-eTA5zLF#wGa8 z-)yPZB2oj+^9js~*s`Iu%-XZ%w|H3;PyDbS>4dMzzF*rh-UF8b%lydiA(X{j!knlD!4Nd;SnD3Oia#98G_sNL4*A6 zjcE+rj_Mi;h53W|N+Z)8@rPRQ7m!%H3mjM0KguUt%fEoFyflu~X_@$)8Vxxp$+n3+ z0M+L7qjpU{cE&!@Qz>pY`CV@mkb1`VH4%%I2hw0lu(hx=SUA^EkYr z1{$L*Sr+RS^S2iwMCX9L*XHx?Jf6y&Kp7-hl7GO>jbVEmdqkp!_CwN1Z{1C`ZKSyO z9f%Bz>0(O=V*bVlyD5hBHH-ZJ&;1M7OG`_W)IO2yMT1b7m~19t|3Qmbuyg@kA~V}G zwWgOteXFAH>mWwojaHtacl@+zxKYr{I6p~6!Zkbgx8gE%8P{K=5L(Vr_1RNb5ebdv^ z*%Q~@5whXJ0G4g${l+FqFoTkp**-llPZF3%0%)_@)M|~C=7t}yYb9HM-KaxL{tuj! z@G3T~w&q<*b{$4a=7gZj-w@xvjU(CCjQ@N<2SVzizKcuqI(pJ>**V5`&lVQISTK^7 zz36mA9Hnf_&-}_1_Mg!5H<99&Kexp)2qUq-c<6lQ2I!Hr=?Qo%=l+S^24bgzwq@rB z9XDS;1e!N=ciE`}X|3KI8?3C%l{)pKqp!$=Oa;PJ3E7;>e zFAHjau+LB|3eZ$k-faC-3zHSRy=9RAq3C$paRAtG;4$P~euZYKKUnhP2a&y&%fS9@ zUYR~7$KJhrcNGs0PkIvD{a2AQ#1!uwJz7 z|A8orQi71TtAaf?-(iK1U+sHWn`5uSx%6!~9-0wsNT?W|0eD5(Am^TruCA^x!^6Y1 zj8pl6h>aBm`$+ib7bA{r%e~~_5N*O0QVSuze$o2T{D#&n+4Y?7^Sqgj!bJmOysgw_h6G}pOT%B|cIwArJ>k>!UC`o*B-0^l!? 
z;YtNJMM8%c)IRAw{-X+DuvM?MHDBKH<{2z#<@iB&lJ~Q-vup@#Lmw zw_*)jpg^hZ$Hi^VaPPZ9-W;l}8Rji3(>qa5`<^}_?|z{pKFI;FqFoKPZa=)!uTqo> z?kh*exSR_Y;bD;T(eEogVM%$6X&u|Y&PSQxFfrdgXk@q8B36}`p4^r$wr57b$)3H_ zo{)T-av9=i9+Bca{d?muX|9ECd$iEZS&qja|zjD<~(%>}e?jFvc^z@uZc zv?;?<`3R*OG(jE;g3oR5<+l$kAPS~_VK@r2UlwkV_8J}e*;|ob-1|9JTlK@ImTgBV ztQ%0C)HDsy!7%T`0wGE(V|Lu3)xjT;uU%0rr;Xa|`TUvBatwZ?4+-eQ>h!bl#j!gm z4)|83_krC^vcGnb{l7AH%#W&Esw6EYCpln~?~r-~2xE*KCdZk&a26tj4qe}rF>Az0 z>I#5V%yf4CQ2#CI+#+xSgj89)w@9Jd?2BKVcR1hfO0?q4t)US0ZK^2Xq-mEC$p$+0 zyslt>4z~$6T0^sGV=uxwIL4r`(GtEFSX?Ydq;@T3?n}9|1C5w)zNBG@ zEKJZguQ-S1(4j-!ot?!=$(@M~C{dSf{GnecXooqCJn`07h1;<4;|DRjX0~)7Mq8vL zE!5Bj8|MMUeJq8}5B{pVw_uyKFIXzlj4H(_b?)3u>dKhA-?HUmFF|XZCH4Byc2)jk zE<#iKyICMX{z+w3PwVtK7(2`#PvVmd*MaFvs|AZuzRfW+KvM6aEHv@;`04Vq#dfZEY-_z&U}vva&J? zY^ZG}0XsmT`M&XNl;UB%!ukj)5&b|kQWCERG;+@*2;lsg^(giV6pJjR&*uGABXN>| z!6V*>a1~@*ymDmO*=}jx4%yx(UNM8*%&G4ybJ@yFUh|I$sFAwWy1+!SV<*cr70s2- znNANh-l*)p3a(R6>vqun+ZR%p^c6=CT4rV2l16kn0y$zgp3?ucZQXe zg}l=2(GkqnLLrjud#UVX&)#SgAzMP$vKtg*-%6n*3M0F+ zW)DM1-sj4UL4AL}_x-&6bsP7d^PFej&N-Ezis!O@Re#OhjcUsZy9Uil>%s34@HnA) zfFL^r{1e1KT^8g|+c+%w!L3X9P{Z5>%Z}hZiX447KpK358I0mcB&{}nbS$H`c66fa z`JHP?e#Bu)G|7@rJIWS$3=LKVYn{@S5)7-lDm%yv#LHyq*iyTlc9S?^E+Se94>{aW zNf7H|ui;DRRVvgHP6SWw56#g~Ta73ryvM==zRJ|=Pn{U>R&pIqxsG+6ypw)3p_i{P z+MIO&EQc+u0w=xc+`p!x(FQ*56J{B~`<8dAj!&NwedIS`_5R#K0ew+!g+M3=DBCiL zR1DvkA|z+gIAU|wm)Tq8SL@v@8x&$f+DLw5gD#tj=adR>M?lX0^+B)C*vUcJ%l&3jf zv;uBw{BMaVXu9*!{EMI{Js`OOcNf>AoU5xi2iPHyL4gzU)-t*^TWJQSU1Q||h zYcCrJk27dTD37!pn$J}c6)=$uiw_eDEWG5l5U3o8bo?^E^>Jo#@!qMn&lVR_liV8D zF&|7(_9l)!xBchnH`aoi(_wptJ!7xJzZgBsvs%C+mG`63kK?z<=yJla?zfBDo^<*hH13sI# zgvs@gbWUjkQ#5Z)xl9!7mk{KBS9W>V^s}q$=$&-F1jf^&Q}pV#>b&5(i-YU2MLQF5 zYX3Sj3sSX8X^MWrK=!~1Z?b=($rh=4FLHPV48VP#D*;VFk6sv=YMSo-!yK-MH5op8 zi5Gw$a%Uo|kyjsDdXWs(KKu42B5W%rg}T;(dy&qUhEygtHA(2TUc#%K`uXXqV*}B5 zrv0K8)gRqGtII>T*Vv~)#95HjQQvhP5%xipp0kyrU}aK!J;=<4Dqf_~mKoxq#;yNY zA!G{{img)vn*5XlW?0VkMALl^OttJCs~rSRNBNN{>k7S4`!Mk>##mdvmg8lo 
zCy2fctnIUK0vP`li(32++Na)h1Z=~<9BAR8*04=#L&%TBT*qEJmbM&MAp}?zNP|th zc>+yPZFKpr64h>gaVD55l zPVf20`ffpkA9Ux{zJGP}YJH-kn~S64zW<5$Qnyl0ftI5aGbPMdEPl*%cUHCeOB- zw9XN-SR3A(G{SseLLdVBm>@?eIINnVMiPj?7CziAMCkcf#vB#>U}$)n!?1cR2+Ry^KG;`fyn*LR9~Zf;_t=`Z*^ki1TO?e|XQhNIhUMMLiVT=M|R zvjoDMK~nd}_7Q0cd{*n)PJ)L`nX;dDogK7Go!I#OtwVC#NKxXucFC05-BW;kJvo4b z|7#1;v01?YpsSJXFS--rdVeVKqpRRrTJr{B#n*%eSJ6jmOG$(>GUdaMpf1+JN32<{ z=X!KwW7`m8^xLV-WSwOBPEF?@O(lt(k-#{@44}b}WWOU4Zp8z-2si~_SX!#4`&x86 zZdaK~y3FEo;V>(AN6AcSq`T}75iN*isqYsVKg$YwTY%r`^#w6~UtZi2yJ=5jj%1nt z%5qWdWK?V8DN4+065DsGsJ=wKe&F!h@t^=wTwC}6%?wU1J|TqImEZRQC6mL&HIKyM zj#i3C{H%8BQod>zpSa1bzH=?vG}L^$n|B2**^d>RMuF7FzwjMY-I%!p=hePuZm{dX z8#l+L_YIM%v7DSf@fXuGEZy{lC(|CaotW@OS|eHkq@j;Z=l`*$TP14{F{+~$#?UI3IwsEQs#T{F1;_AGR6o*P zb|T1|7$esxvGA%fehWT#N?0&=&Y(WIzw!vP7WeapOzBi4EL-f%{8PUC3n7`ZP{s;J z?sRZ}xL>I3sp(>@#?564!o#dSR$%T&=Cmxxg-rv&FCtsWKfrb#|LHQhk!e zeH+;{jbVh$6tgb;>(PrsJd#Ckxn$;_{(=ewDH`2;YS+mg%+3yngULT8_SQvbERH*kP){g1UlQ!pp=x_?$fcG&!Z zrY0J^%KJa#2quU|Ey@VRt%wnlOnY~8%58&i$96R#c=aXH8=rb`BatswG7;1$Xl1s7 zi}`6?noGFLuuUP_VrM16Mw$ZlExd9dm$f+YP1z{vD09p4Z<)P6tM8}rmyJ(RPacGc zj)AVJ%;bIl&xzjCs~s*;>XA_vu|eBTItHZcOt{!N_S+OjTks7K<~slm)FJ~S$!xQ&RpufxzTcGp)SPp7VSClmmbxYUlVVoy9NEB~;Co&An7OTn>Fu4bM~b=~ z!b@~^cYJ>*UAzWE>=B1{ZIDd_!O&#g9$FrJI`gyzA)i{pLLR!le>P<* z26-8SZa?i_pY70MBZ{=xEG!G93GEBx-+on+IS%;1{8>WAyWvlRkoL6@_;~%Fe#QF5 zZH7IXWSd?){%FHqXwW=ANKPO2h=A7)*sK&WM<6OodSXf5<-DYJb#tM%x%Ow~c_*e2 z`&MEO8qTiM^iN|U@$IDtJr=MM>x?q5scf{N`{MQXri}_ueZGBcnXK3&%eWmGUm7I7^`dx=TI{ zLvSC{AE0Pzv|WBk{^a{@UB%NnH$5`XY!&ch`{m+iV$%WT&!*b)=d=?Se1JgMXsoJJ ztwhX%FJa-2Xk0~-7?#$7S(ng1+h042x`{?s>yL7M=l@eIYOlVY@owkk;_RNAc8-p- zBTut}-;A&l2>l7Mw%Hec5R@f`@~FG@k30E4n+}}I4k9~~BkVoW*ZwQ4au_Fgp*D$LX#x?G&}8m4bKMy&9A(%(E5 za@1v1sA!AUB+BlW;s)J5g{#R71j~~^&$jPLHrwxEa&Q)YrhSz1WtPia$gI4FJ_e5B z`zkB3K@`TT$YBB(j|ch(mE@rd&A3=aX}0N(6i)Yk5*}%By^SS0SSSa;K{x(9p^A=< zBbzF|^|jxN`!%#Awr}?9ly<1ZO|m>?d~Mi8UlB?&dFCdvN^Own@^BEHcK%Ho-=%=qrZSH^loz4?XQf54af4AK1N%v 
zONoB-NL=2`GkQ~+9r^Zn1?=(Ai@&nXwjhlh2{gTLsmUiy>4AeLbYO{7oLQ(vZd!#3 zy;SI}t(w5e&wMOj-Qqvnow?MIc~P6- zH1ypsnN!fwy4dY>er=&EUkf{Erbo!;hFe`*|1}>p4FNin0tYCawL9eWxPMpnn*LQ! zKMF(9e0$5?CFs~2K2sZ_f0DcC_s{Z-PfjB)csD1+W}-&GFgvzip zc>l++W{WUC_ivN01+N7>$3UsoxC3^mJ?^bE#Y>|d1-&~+LDtNeW8b7xail;%m4*}X+4EXc z)Fs37A4h!MPUY_zE1WX#9s6Uhw*Zm{|6|LJ7w`|~m7Sg_IS+#ZLNCA>&~v&4-H=5d zt}8s@?mV@3r^YGz%$uW!Kia#v6n?rb=#>}DIrUrP@!Y9Yi^gI`8q*vHt)(QK-dQq} za9qZ-CYdMKEy)zAu){HB7j%ntOhS>8TRLz@#VA z)p2UJ^rTNm>!HwLq~d^{1^6b3VbUu8kCKEiY&-8J$y+5j@}$@3f%WV3qhF&nO@h%> zEmV9CPlVdK#O%~Zyi6O}9a&bGm7t?w1KSr+cQ?hNh`ZcXD~T zrSiF$ZGCDY6g&W1SkJKVQ%?2RAAT4Ax*o$d#iFG>w^;0%ftq(i!$%FGl3NK{%>!VC z!-IzoqFAiaa`KxV$|eg5-gsfU@6`u^TVtN|U-+K9FgkamHA?PYsHd4An_W@CYjfC+k_$X^ zxgNA2S$ZBovonKv{KP)N~%g9eFF$SmP{?M}^dm}YySGPD+z>?Aw0Ws~{w zU%67kS&p*WdP&lwi4!ihS23Dqp05zUfyMQO>}M*oDbb8M7k175?-pJ^$U79TX%lv@ ze(?d*X{Y5)EUK)~DwH&El8{A|cQ(?}Mh|Ve%nxN{1kG-|n8>@dq0{#C9SU>$gG-=) zL8EHZo5QLZ$t;BXypIqi`7;RCXN(rHt8L{hvL z@X7?#Ze*O|quN>KbZ66mQ_o`GTU0T)o?qGue2wX^YO6I>`Nf;V|g`CxDg!a9;ug|yXU|X>+%JQ)0n*#_xJPIi} z6rFiZ*2wW&Qz1J z(svxmx=H_B-!Yi2vOxSjv0mC2UUGsg=-@uHjFx0KigD!(#RqamMArJ0Ld zKi!ydOt%Kfk^s!bBdEfNIk%N^$>agNirQ%wvV;lBtA}}@(KmKIJ`VC#gWIE6mXGlmO_p z)|{}VHikbu`V;nhR?Gi=(a0(@A^F_E8W&BnZk8Gp;C1a|*%d2ABhjzMVSbk7WLg^g zZ{?9b#gg6^T=*)2OHX(k3L5K3_=*%cU|6?PoMLKP1r?06UX5#|Txften1t8e`qyYB zk0=!P((HjWckTCyACE98%gWSd8=t!}Q!n(2NQYVPT54ci6mg&&IKHd!zxay={W%vN zB}f$6oFUD$u-thx;{XR;+pP?hvk<3mld}F&IcUYuw;2`&v`E){^O^+xU;I)1y)Oz2 z3(FfC8iJR%e=kt31O>EN-*mH*m@!!~FnQMgM>Bc}V0?a&8Xw4MBPVs)0`rh&(H7U6 z0*ssO6Y50D9LrI$uqzZc3M?T%56tJl$F>hkzaU@h^&w`reJ$>v{kfTl&%z%6igd0T zq;Hxo*oy=c$g+SvfDzyD(UZq%8jFkgU+3qTNN3s!X#oCj zR=lLY!R$3L;8S@$x%baZ6jH=n#`Ashi|P0BWOitXi3l4=mAt-!fQ1aB)c4!6aIcrG zk3_|wxj_BoDF`f6o<(3oGPbSw$DRu;h(Kfyb8f z1qul#9KR^Z{*wZE+mJoLaG*+qGS6boC2!!@QcEu)^PedcMFXHcu(~QXAaCVih3Dvj z8WL>5q{6;@MT+E#%p1KsDf?p$^N86#B@3LGAkRoTb(9~f)!+cbv9!j;RoftEA5rB?lYV#u@{rl|e9ZAxDy^zyKz;2Tx~^K<($ef{QiG(z$k(0CYG?hb zR8}0I#gq<=y|8VR6gU#0K+TdRZi#X<%y4nqPog~^#RK>gD$0#z2C>THSj#Kza{=SS 
znC|IPTP&|?#2#gaMYbmBt@QI?EsF>QAp%Z(_=|_wcNY~9rK2?(w>H~s1KwDEFuy$m z>fk57OK#&_c|7Z~Nb_&Aus};0ckBB<;rSn+XwGz0QQ4#^s)kuBaO02bY7N8xYy!}aw}lNau)jmN z;!2kxJr!u2XEjWW{;tHwuuf!@``E#@2HG^~@|ByXf|LqjX8!+VJ^a=eNQSMvc>u!a*Q z;OU4QayAa>XnoLrp;NunSdjEb(z#ICi2pScM&f#-3n=V+Aq~Z&K5_5Scv4OK z*YOwVy}0{(J0Z6SKbCAK@0YJP*_o<|&rPX%v7KJ7yoD@!AE_6Ca0(50TE`6Cw=N-J z3G{`bexG6B+En4J%c2C=b1H^u>vUw>BzAQ zd#;3$IH|CqECr6v?Tm`3op8)p$BESW=Y0!>*9JdJ8lQKQ$5-8@%(;KiCcJ-_yW#E| z^x`*Okxwzuwrp@4c5k-0(c*fNHQ~5<=kUXvfePP{PjnKJzkOuCv3auo4Dd62dZX4h z$>KxI$*n9sars5_q9L2Db6Zk{TC?*kxJMkBnO!18Z6B-F52b#z7M+tMx%L757H+iB zEZ8T)LcA>DFHiPxG=JxH6v=mM$@8{VBqcGDzsfVl=DmvVO&Pf+iSO(Y=NE}g7T>k< zT8Xl=z>Gh&OjO{)xK)=35N39POH~>+T)js+5896&rcqTYNlkDKR8*?n^)COB5~d?;c%wI|6zbt|qbVEmQv|df*Qw?r>2<4)3x>qTf;-%}UatgBIfvJEdC97JRqq zvPA;bq~NZSwgeq@+<)C!4l%2Oh4hyl3->@0NG)|v?}d2&+lUc1Z(hKDw3BD-O8%%p zuF{%QxK6WjRuV9ocELOP)`{wDq~xDC7p0h(B3ElOce>RCBk;zufXk#WB`QL(pVegO zu6l2;b6TZH@!6vb7i2A;b{!W0Tf$9Sdxiih$!v=}fc}b*Osm1Gk(6X5dK<4QWnRKH zeQ>b=HPNJ8Te-)Hhd+dY6tacMCeDD~77N_*Uzf|3NdacRXc=!`#@HhC6&sr=#M4tX zFy@8=&MFn$X)!uDd!a~USCIqJtM4gfx&i1}=Zb#9yb6e=Wz)c|bA?~?bUq+A-xAq; z{Uss}Gb*V+a2`u&$O_fbsj%QV__rm=!jg)g@7O>(#L{33Pi3cJNorcxw=cW7MPz%$ z#{S&sjHG_sU3KF>MeY5tldqY%!u^epkQgKUkUP2^sS);VHROX=HEo zQhn5yi}c(!j~K&3j}&dgHz^}Z>ac~O+S)oD_p&|JEjZF|1-!IOLTi72)$8Nhu}$hf zm_ZzFs}iVCy`M`uRS6H(RSJBIP!mOO9#y}CbMUj!HlEf~9~f5P2YnE9Q#N@Y%`%*< zSl=^_ju*Ms?cWj;1oEcVaFB8)`U*LF(EBDnxrXckPD*#%dSlztmWJ_q+no{*Qp zMf&#z{^`>KZs@)K^gr*u$z&L9?D#Gp7sD*D{FCZQkt1!EuI%?2Z<>X93a>L%Q~XT8?0{_-l^ z3g5Z@FBkDFp>iYrXiJy0WFcAA_l=}vmC`I>+CQXnvbafGN1+<8k15n|e_CyylRI=) zGqj$ROBSXrxdIBBQ_9yGbh{E2oRzzVWsS6ixDx4*OOwJA+JQ_o>{RQKae)A`6>O89 zj-rVVU{~1mA{utO_2#UA-*&+hD=w;g#{P)0&U(OZ?I}73O_lD#x%Tq0rX0J} zv9<)qm#amWP3M-|IkvVy(pxcCVp0#e+RFRKTZ8YUP}N*Q5NJR#=zQBTBH(#w#9{iX zRg38;hEFXIFBzY|JW+t?anJ^&F8dfQNai3eqMF;nKb816c*DSHH&AuDZo5e#_`|Tb ze8C^vd$8#*BQL!i0c?LN+Hm>hiZ*NtiAuOQqdd_8K1Lh|Yk~phnuYfLhWkBBAO3lI zn7?NJ3D2Pqd?RAF)sB73C~=9iMuw;Fpr0fZ<*3Ce`nk 
zO8_a0qM|mC6@^Jr_2bt>a*@8K*F~{tai(rWE7cC{(w)D(Yk&hs9yD>dE!^2D+c zmt(3gAFNAJz;PDRMLrC<#Vh5QbJg8Z-T)Gz|&T`pf=cH=XX^;-Rh^m2Nb zYhv1;5++t^>JlcZ(Jj>aHZRoK{{#LB9k9qF-aFPVTl?tTa6Ov7enO}u2D$QhIT>5P zw`aDeSoJ%}YY0-V{~Ia$2}ntxxFnaxCOxy#oa#9viev<90Jm)~pPb~sHR}K=BB)r| zWz3`(fqEq_lJwp3{&GsdKqWCSDre}U&C6tB*}}ny!fud+^48_Ef>2G(sb`2h zLjaehJqfq1#?If((QdT)ZAYcrPTVT+pdSa;$|>WvY^}>@=D`8V%4WtgZj$nc8rj_m8AfF}76rb|dOJ8xcVbpw z`f76shAx{hO8P#{_GvliBXH+|FgJMb?EOzdLL+*= zrOEPF<`{X(v=RaxH9uB>Et`+MEU^X6LsPxUy<>NSl7zov;)JSZmT6;`CHqRpZ2dU| zRoXOh%TTa{rG=+bZpbG5OxwpnR02{LSv+Izv&BZRvWjhX@3>!YjKA44- z%`7x-;5a$uckhuQ-cb&LGE-HVJwgUC!tN|L)vy|v&A+-rSjZ?J#;&P;tnbjM&bKHu zH$icXqhhF5$q>n6J2*gjse0NiK{&V7Bc`vOb?j4a=Yxt^aoS(DyUR6a__vga!y!9@ z?d6bhnjx!kYalCQ3UPahn-S0$*g%&u92&HZ4i$Bl?3CstD z8h_Oj4^*V_>$_!mwk+}8;dJBrB&WIoyu=aP3NqH(%3$vKak;P)L#L_H7is?xr* zTN1S~MW=A)R%2tM7&fH+Fb12(RF`|4o1B(OY;6Q*`ABHSmBlIh&V%7K(8>Id48mIS zV~7^Iy+kY;|69C0CNhcf*FJoA!ZnrCe0~~24S@2<1Cw+|7=`|ht_n@d!iicvJ*CD$ zMoE(e3~}4V<;kAK)f`0L0! zI^uS=MZ0S=<LoXS`C+EA#T~9qxDGzoSHC?%TE3Kri zPN~k2FKMW0!=F6d=#VcWRH!uan^}WFN9PuEf+cJSJh92|H2w>ocK{FP^g%rW7XqR= zA&k;tWsUr}j?T|I;6U!lv^{Bn<8W}hJ*oS&V*(xDw5Vp{QcrhNcb@z$c%8C~DzLJ< z9^}_RDnD4$L@cOb3w*GKtZ;ir?EYYZ z8*ftXcKh|CGRva<_bnDmpIQys(71*J(pCZGiG1Q~(EDgXgREn<(rHtY>iHRFv!&qgYVsekdV2ws3eff{ayuFvFDIeYeG0T@pyd<)HoY`b3 zOr+`4^E06SBQeTRPj{-4_kFTxrxR=&b z@)=B#=1zMlNhoYEumxA1<%oSe(wJf3*!wX&+b2!JfFn(beFMbU`K{`#V=V`ydsJ`r zo6>?qP~yL!REIM%GTbIY?sydJm-eT2Zu(rN3l{T|C7F8azF@8%Dw?s33-pp_Jt}Yyz3i~qNk1(m z&`5emC~jtH$+(m)>SXj~B%Oa=)hm}YoxilEg{P7d#QFLu`-Yqjr6)nIlA>lgI@Vpu ztt=yAHt#Rdl_8UuEiW%W+}t55EbMfSmc0By#XFhAa<w;6T)y#g)cdxPVJSHo9(ix;EHm>#SF0RSzRP*N#msq$H4zuY|YA-9q%==-bQ2E2W0WfM8|dOep-lkx*X!1rgN$P^l@NrzVjq z-1X=TS9^|Sr)ho&_QY}B=s-s$@r+3M+KE67%!x}1ZtL2;%dg|LaZUMKew}{8LPnX% zz*ANrbp%hhmlcE8>^$ije?hb9Cv_JEK2TBj1umkyeLv@hb1Xb%d^0?e1KKxrOy;mB zecbBr|B$lt5O_iG zy6dik$o+F!`wxKfyTf%z7SvV*b&7h%^)Z~L#ay~#|5JzC^?{0!R+2w@E}WK7F~LN} z1`0jzpwiJyYiKZU&EvNYuz97=rEbq0R^Qq(Y=)UD^_TNU19mS0(bCvs<;MrRoK1%E 
z7l>A}Kf%5bi&GZuVmiH6|HFl_pb`1LhB_>+n9fgDGrDj-KG5IQTV!J=&CiSr=A#)p zX?yv&suQ2xD1Nb?T^cRHCwWepR?C%?fHR6P>=xxl*ohplf^+h&uY%bkYDH=1lN9i_+Mr-BVR zYgxw{Stz-@J8xEQt-73WZ_k9mUj6geN3HDaCTq-^vxauF+Z=+#)Bb}sUIvwZWLJ0z z5Tl?}#GL&ZaJJm$GJmhT-QoNsa+;lR`l<>VR&`_N-`T zY`^qF$ox36n3 zU3-HeU)%BH$H!;aKB+{>U53TQur(cEoie%qIPUu=tcKY*D*D(CFr2&4?vb43W!`rQ z{>)7zX;?k*F^RC+?hp@rhCDz>O4Tb9nj-2cFQfr69%2X&c5Zl?+SAG_==w+$*; zg*sw}12{~!@22-2T$#$YV`5;_*Xun@X=XBfd3)m3>*H$qL{e3f zbDbU{b=JOolwb8@m&jA=h)UiC3U>_#H}0*@>=UVn{)80wni_(kYy%e`ls%bZi)~Xo zZ*doIRjAm1#bn4JNYVSX_G9*o_6$E=aK#?5pN z)_{_eo2ySapl<&j4468zoz?1!|AB@cOh5yW0-17_b)BpoYS0+&LRb5@(CvU0F%U~oyheDvMcp$_>D=6-JBk*T2 z_sjOn111ZyH=h(Jlo6in@znpoyDj&s!l%LeH$)aKxRW)7SQRvcP}?{Eze?(~Yzh#I zp`X}5=50R*(MjcqRm$DqWy4_X8WU*6Fs7x@wS{%;>{KMq(Ji(>%m`zUf_3YA_4CZ9 zSik}XzYEM*yTVZqX@DY}90-OQcNsaQ`xp+T%`$HdJzCesG(IAecy@X+Hn8c7A)ow? zDTPPlV^sYV;y0|Y8ijG|bVD~Mfa*OITbK0oyS4?!#gOB9wnFU#nb@%!sb@bx!H%Dr z8Lv-xP;tC0m$f-cZ1&FOpF1#kuV!rp_Qx}A{ZNbg|3Y=?C@rqNyU5YWh7H3v>hC78 zzC3H>qePGmo$sXsKI@CA52sKuq?AJ5Je$)0MjiJ==Cr@s+OiWDd6+UJ0Jam>EFVU4N?f+~19H;YnM*WAD zmm5Cp-3Oldj`APZiX_UOY+#R#hzoR#Q|#YY)QdIw2*S(vUo3WE`rg1s1gATk15# zP)8U%Dr}u52XieEKlP)oGrgpN$GM&|I}AP1G%L^Z%*zQdWmZ*)VA-ontsDVsTKQK-nTdt=bBR+AeC6SAUhARlaew=rkIm9 z3lFB>-h67bWH#2wQ4E8h3xbwL#Y-}YvpEf|PH=sp?MRwTqNc`3M!q}Cje31IQ_P%< z6gKPPw>>#nCjG45U+E>8*vThz4SBT`DH_=Kd^^3%e{BgEq8BymmDjF!I@ zB82*?Ma!Oeg^Hc96c=uhKNE4r!S4yAnrKoG6}m3}+#zE9g&w@BG?sIDD--+6(hg6d z9yg4Ha8Uw~ViD1jNd4F^Lg^lmT;I=M81l*x-=C}Q(jFt(70N<6%*o0XmHng>IvOI? 
zlIzWN&L2j~SJR++BSA}4SmdU?#p5(ob2EqN?tP`k;v&ZfE#{rdyCKwh7GN!f!5Bb- zsF@;0JE)N@jWz7PU-G>f<5;EAXPk1L+bAFMI`rICPO|+vJ=B~nc9PQlK9+qbJTXw{ zjlCf2*!u(0qQlLu^i7*)khmd@(QgX&F=&YG_vx%W`&A=-J+Pf9ItJM8!mS;|4CZfO zyI&Z?|8mhNyTEo<9*kKF8oDOv)&kopo1`J3UYx{vTJOk!F7E2 z6}W?W<7e!h6hbcmU-F~hG3s|GPDbvNc*CKf$`$NO?cOmGsOURBIF)ojI%2TaPN|tm zKLST{(FM~!P^%$!$ft7z@6=++Y+-E7Zp^o{e)nx8IZ#g>&qJeXj1hYd@@6q#Wt?sB z)^d=Y$LB3CkR1$hZHi*ciBm^d4M3~czsONL3gPQ|`6x!9eD=9RcL*-_|25@d=osv8 zSE-R&T+=hh<1?d^j$ba{-;-(D_;SBH%gOUcu!=`)a}@i}#nvV6QEF!2+A)M{%4TTX z2lea{Z3^&Gx-=;+nasU9>7&~_@TDce<(@}QNn6rWYsc*}W8o%|O-A}3M6kHwIlKqX zZFp{ylFp$>vA;&NN`IKgHv{KtKvlM?Vbwz&)bxL1_(Snay&&KRNlNJoqt>1 zu5{|xHd2>Lm;!#Tx0UF!1+OA~+XH?R-1#=UW;1SDzwtmtIIua7<7fj~R4|{}Duv{OGL}=T!rYI)19G9!T0NLm6Pw%8m zFv&%YqlUuia#CMG$yz^F?62HWofm&l`t#Jr@O4g~LZEmsp&Zo!ev>xMZDQ0fg7{7P zdU^j<`X*^=M9q_HFg)cl5s2|9I44Ox9i&3XtKgJL!`)X!qkXQ(nL2StE?3iO9C{7t~a^{2>d`G;$iN&BevpgPguvE9+3WhFs(JJ zNR+)=6jYI63LJnUM6bgnW#^X4MdWqT5J_V+fJ} z9&9@#zwzFoXvi&m0GGhgC|O1;KPb`+Q+V}*+HhtIo99P*Kes)o+k0dZM?mL3*1$|g zt~*(LIcb zuvM9@GM@R%NG=0!8=J_i#6_DV4`+eWZX;Y%3X3k?^9dxHIm%X&aZq4A!H4VscdEMG z^YQA{lB_!w!Za<`orne2hBWQZ?HIgpbNJJ^@6w$*iJUa0c#Q$9_F6`3MY$tb@gv=@ zHx|p(W(x80UQAX=c4Loon%8;XJ?1A)d;UnQPmRXj$8knU*x%l`nKDQM473N>SNikz zL~h@B=DE>Kx(24_yp^v^tRifSIt;#4Oqf$>+*FN5{`;%PWsV4pcmrxROy(ao@7T1R z9yDrYkmA{3nb`e7iX4(%v~-PJtOCq8JJ4Y#EZPbT==+4eoOi)sajh1+nHL;^<~MKN zm@&&a;oc~JxFj>DaKtuTvA@A-YFG}uy8gDkGum&{mAHacDa7?!W`h*_i!=tbhh0uk zy6bwYtKE3blp)X?GarJTEQ@Gm8W-aFqtUUD6qMyk6-+$D>ztc63a_8AtJ`Z^>3{vpY$ULK|?^~RckiryD<*QXHN;Co1W^r#lc z^<`b8D`Uq%M+en+gvJnVwVNO1&k`eeq_Oactd| zKt(ZQbLM$t89Aca2Go{ z@o+llR=IVcBIA)*6$h@LPF16Al_}WTh@tW#b&fSZ`mcZap{~xvMShTV*G})U+3VM@ zYyGt8^I34QK86b$xEyM0p`2eZ(6x_G$SAW1VU0plN1okvDz0X?yL+2P})p)#_ez8CEkH zQGe4bhdUJ?7pKKD_rfSNA`vV4>^48*n{jY_fM&!1iLSdciHEVNvs`r3kZ2qLja;;Y zaw7Fx53UjUF0>>O+^SLgx;cH5My+#DGGTJ(kP-cY29{)0}AkzC>@{si)!L z;a{@M!e)nayU^Es1-bLyMTux;v4`YW=N3=oxV6~h(&`8BzCYSt_GzLCT@NLfpvmoT zEm)+!*c@tBT7>Fz;h6_pF%H!t-J943@if 
zo8HAuEjSR`^;n%fy7tOCkdlzIe&%ZXh}}1rePflR?h}Tt52)M|aEw_}zL2A%EyInU)XPB)Lwk+fb2~EE!E+RX$`8Tt$BF++)YO57a+^=p{G50 z_Ei7dukIJPPC19q&e|zUn_xq!J2y*EM&VuYZ9TstaNoaSaq}&Z0?|mWKVk1U%#&H2 z&n?k~cu-n(8d|8Yk2TGC#c7oS6}KqKh~%i7;Vl0DX#)k@C)0N0Jg{1_qOyO^ba%O!#_P)T184R+boot%K#b+(k+T=qH*v#lFL%GH?Pq--!JbFW z7R7T%W9#-~anXwJ@80X3Pq>#aAYg(C=$Tba2@K^5l2KJoR#vsOx$3jlG0R5mhLw2+ z*`RKh=N$Yt?%T^;xZ3*Iu2GkU#I=B_B3|cyqT|ZlEJmJe3@z zs$b&r^tP0rLI-iEw%K2~@_NoYyzty^#t0z}X@57(t70y~t!D1rK{B~th3RrnN&6d2 zm|dCMkx*AjdzR^5ydj2>VNm_3!Yau(=2M#n2jKt z*^$8Yn#cLxMR~QoX03780~!x_8#YBU$fe=B#dP-8zj|r`r3IMLW%&K7D#h zFj?^#Z<(^Kjd)2G?AWCAN;8;8*E5Fzv7a5cw=gXjgA0cDez=Cd7>-vrDfa6< zmXMYPMdo3^jZ0i|SJQAvoBr_d+?W?vT}x73V}Ohg^Z2OoR&Xb8v)t6p-%D+{UQTy= z`H%QQpRt^rd<2R&nKS;q>^%R=ox=3}t)@wZ&1lDybFjuFGWo(_F#sC$ay6y0N?7G-%eh$}7ppH=;T#Fh5eGVX@7xojrDPE*{&K z6d3wRBVR_qt^c|RUn zP1z5*YU$_gNGN&%b+!qb+xhH{Tv5TZZK+sSW2hN8${C;AmHVh?aB#32>Z;UQuN`nr zTNu7XS~0v2Qrugs5qa@yu}orG{`+QGrp!Gv29?lkQ>&4`w}Y(ztcgjVTJqga;lrvODGV{^=T zJu{xWnJedW%1pPrb;y0(%s$KA^QY3Z+Xl*>#Alu6bdYNu=rzR5>nlaLjJZ2Pd;-pO z{f6rbO%fORx<%uUv7ck)Rc6d|K#E;W#`K?}lM~_sHHu3zni%r!M}B*y$kbxkp9dqc zreLu4wr$(oPKbz@LFq<0m811WC) zc&pz}$`>oF6%Sl*iC;ew%CmnVhX^%QQ(AI?t0`UTRqriyhs$N2u{nBrIO|8h&{E^} z>C|tkDn%mWq*{l(s&Xh=Y!Mx(q#DRosP4_g%^wpK%yoba{#o^~>#f&6BYBwP=EY5! 
zNy09g62lG2yxOsEAml6z_+855?1P($Yzlvdb+rOPk&~0#gEQpQQoyuG&kr?uR(zNC z*SaX7J3f1IFuioM&}hjzZ^eEcZz=t=0%4xbsVXNYEW9yPjiR&+IPe+RejrHkrNK-sxntHh_?% ztz5zEqj|FdGNv?8%elv?XKfoA$DLGfJRh<_kt=xbcJf#Jf8(>c(mNmncovH@ z9}3^y4!uxC0o(3Hn{gye?}5a+PL=)qEGm00n|&g0Z!_Dk4XCCnX%iQ5aNY7XFj@J{ z4!utWwpKsJZ0y&)v9Ft1Ft^$-`+)Sfh*yGy1oGe8_m`qyBKd&%?n+A4`AL)6!%&AFlw4q<3$>n#!FP{ zi)J5GxEW&7td#wd>ig4rCoeyou6(+wPWL5FwAsEzCtG-|V*kh`_8=|(Qh9;d;l8NT zA?ABL#iWxxvlKOEJJ~rk=Sk{SW`Rx3X%&9RIiK!Nlm#bXhqicLRH9o5G@9#y8zdu?APC?;c#^Dv$^~&v7i{<+Fh7OB`T~u6iSpdw zMBl`SQD4$+PI7;SjuHCwy+fL>YPL+PoT#FX$rtphcmQIe`Q}$xwR5qd==h%ATF4)r z9ogdN`#R%7u9nUoa&eWLW2WF{)b+Sz{*GwfLUr`#VAtt32s@lUAr*;2;VPpqNxKJM z^B<XSghjr>TJ_St9X=7tYWS~XdKtMFGeM%!#r-+F;yv#>c2O^ehq18 zjmc?sxUFQFteCj_NnB!c#SCS&Ee_(Wgoc6OoQ~(oebIq4)O3~%^2Y>d>@zMOrzwTB zTSn4oI402QYq*8ehvGP_9lFQ@iDD1i=Qp?RMCv6MXvrC3>y$#{0szIc!xIxW*D2zm zgjp3Xm5oXfys zA{&EsJ@sd`NoR_fQa6p6gQ%f}xj4O?Gdh`SRl+gP!w0K>YZFBWmA#cFQh8xvH2| z?9rU}lsA*Y@FS?Tpe%D$hmqe0S43@^=k`iE=34wg#p|KiE_GxsE318or7X66_g&Xu z`PyDMl3ctFr}d&X9VAr($>-Hte;UE`X*aCayM;{*v|xvq^TuES!Po)b9wkoYs&Q|= zum6a>GN4ZVj$zbMg6=+87`Afx>p)>x;`~n!I0*;@xE$$ z|QB@c1X7RaSzOrN^j+TGNq#$2&u!Lx|uE^Jf|2 zxGGtJgQ|Q`8{F(rC;VeY@#Nu8?y4T{u!a@pY zIdh}E%B{|kf&%t$olHH8&RwJO(E`--Ud0C`m=#IHXF~_6-1ttU+)+QFQup-ux#?;d zQhpkN74;#U|JhH-oxB_OSSCW6MEOr03L&=v-O24bNPZ=VXHC%NcbZ#&k&4=3Qs@VY zq;B4rgG@kJ4RW(&ID%`)XkP;&Ne35;J^hH5ZJFf&<_xvTS+)~UL~Gix(58qv7le&I{F zD4n$3qLS7Z1r;gNnx&|~=S|D^(aD#7oryr6{yImebkoPW;VG~4s z40C{Ti&>C@*&n5zcTLZIIw!W}r2~hri`@NFf&S?H$(89m-}jq6D^`N*=;U)H>Gntg z1nxJBj}@V1uK4|rPjdVZj5_Cwq0YU(nc>;Ps&O1XfmE*tHrPQTB&sy}2a6I|@LbK0rU72_G*?{mZ2&>rl*vV^dO6umWBm-5U;0{O?8CEZjxd+7Fp4 zggOS0OT~aTs>>OWgxWbM^%Q%ZyI#F$oS7Bwyd6 z;lY_u4a~+142_aA*Phg~I)PyP0R}&i=9)6ydlE$Z{_^bA7tJL8$pKLt-Rg6~A&osivzo*W=n1ixC+_n||5BPu{4*m^$ z;44G0-GMLcr6Af_(zR}$2x3VMhp7g6+7di!qpWpI8Z9rEQJvT_;H${nN>PZKiyX0>4VvOqArFQLg1L23Y zQb$QB%(;-}YgoINA(GcmG=|OMiv+=2br#LT`LiIp+VqshNq62&_eoW;+UkaTJ8vS= zE#cC5?Um;>h9oJ_ebOqVWB^zU$Lwd^D*c?sZ34xoau81`xYwMvw^h#)^_o;c@wG8aC|*q 
zSPiFL6c1L6nDQj>=X(r!m~I!9!fy}*UvytLidrX5-}f&b9clR(q~sus#u$<;viwrHt9Fg_)O63>-Z(yf6fj>A2>RH%tNG zVxH`vUA9V5B=8y%ck5O&FRgpkH8m{1-|buSZYNCdtwjp*lkdt8?F~4KB2h9@Abhx& zzLHr2uB0$T^Vd4wXC+vWAqWUew_X=k{66J=92_B7c0a(yMqPhg0!XZz4L8fw2HUtU ztI+yj7r*?yistP@;6hQFaO5NY1B&PXQ)SBLu$Eg3P=T>1iO+%0rS9JFhkT(@iCSe zC}~e&YoZOdEG!{gCP0L5F@nl`Ez?16>)^y&tKt;Aje${c7z+l9rh5$}oU4O<&Fe5S z0Y`gi`nut5!%Po@dxZoa98SCA*^9xR8%i)7rpEC6J-m|^@}IEB{@jbX!RpZUWp^O1alfkn5#l%6lr6e0>EzGygA%x8^YuyHNUg7 zHa%OdwhqYerMxjZA;wG)j?*_&^3gS;lf>$NJDJ40Be}Oajg9upcY0f5`fS5Bpp6EY ze<$uRZ;CHV?k}gQ{U#|@o<*4pLeUMf5pffkmYWL0uN69_g=(eVHpb>fpLYHi!+(Da zu zvvjfeT#+i(lqPS&AQJotBiR#`?u(zBKw+!82Wye`m!HQGXDH~czUq#7$h9PO!4DkD zDTZLz-_C=A7TA%PIQIes;-Fj>vE1)O7oRRnp}@2c>Gzu>e|&vgq|(Qxm7jK8MEeQR zcRBhE4`F)d;8hYcO{SF#`N800c+a@up%LsXmeQFDcDFGaX!ajaJ6Daudqh-t=)SjU z!AxmlT7b!)rtN49CLTTAW$9WHL-vW}&rnyQ_6EU=Hpq)F5nlFK1(_zDlceabL$hX@ zT+mK$(f$@K40)+)<_Gt^4-?~rM-3){29#v$b32kVBxOo64Q!Q->Z+%_VlGhqEQRXV z=i1z}8c>)M(IjOk8J!$0cFaHIFab<`b|BZZ4|vx9j2?=1Z=MPedpx?ue$yiW9Qk*C z;;Rz_Yb5Z=R#&Hhh#67P1!k`v)EXge123dVZTuP2@m)cd?Jvvgig^QL_QHC(TQX|y z-sQHb_UO~~Wk(}Mr1>t!n_iO^7g*dro?Z`Cs>&4Dy@ETE`7SOqM<70n&%A4_xZdpO?Ci47Fn5mMcjsuMy)VZPSNe)}#(x%Q7x)c|U=p7rlf|3~KW&rknL@bTr# zZh>zwq<$AX-B`DT`-xRQh!MKPIXB_pknf-T`I+N=>i?L#70Uc8J(5Xmu(8>Y{;lko zQC7U?2p}!ejYUfpnpyl~LV|)ZhP^o;`RFak^qqM5zS#sinN?V9Ll#mJt!F^k2-y zHZGXsnZ|`5_c^nM-XH=ZH!rQPExf8^ylA2Ow*R z*2sU~`1If91h)4xoB^9~;G-dMFEYTy&`G`s$M;tbBW>++De5*hvQt4^QEN*K^1T>~ zk}{d>(WUE?YIXnFenGaJc4T@2Lys%wI^bJ@Qb~vcj}Py;PHfb0>vD$9RoV1;WgWe9 z8om5OT9GG#DG``CAbgnyXKoru*e;c*`E ziJa}ZZAZ*kw`1m}nQuW#n&l?^07A57bI`J!yd1D0w+2_z2p=KvviCKA0oT=y@a!7V%aV?BM-mv38o;2PD%ng|l%U;~Y)mYh{%c}yeQx4cF|``F$F9;4K)j!=SXi{K zXYfGEE?UhEk7I-M3VgDxkedC1J~S&`z}UHz_UcRB;`Unb6+pnGXONoq+=dOYjJwPuw8$%%1n^27->^5Hv;SKH4A1!E~9# zG7!b8(dgkVX*-b=h@$8NVRkn!Poc*LJ%=loI(|M!ZZBpAvWU>coQw&$wsQGA;{>P6WvimzL(}RbU9pG!Sp)p7YL(;|L>Jgf#mJtw`1f^6!?K`6am(38JO% zG}@uPx=nfM2m{_>=H9aV-*{i|lerxC8?6+-^YSb`Nr#+hxj%u8un*!~11AE@cyBc0 
zZ<$M5zlgwW0J&FUptW@)1GPy8NJjhCI)&7FWGT&JGZ6kSz|ku^-5TJBj~)Op97LWkgs`j;T~aq?sgkYxP7Y@YhZ9;f8i26F~6saq!-K2}Q}Y(n(I@*}%4 zMEQ0*`sj(uU>50y1SUzp)L`>qrL4(i;~+4#ou|$R9f6X(2-hjZ1s1^&lF9pr6}c-(PZ_v??}OV2uXW?H3+t?Q3F@hwD)ys=>2n#P7Y z>Ek)m$zRs6wddX9$1H(+VCEpmCw*F1VW2eduSiFIv1wY~#bp$4c!QgQ*~4E^MNKLi zMk_-O$RIGMB#PX2*|p3a7B76tD_yi5_>E>J!>rP>>yU1rd8=OW>?nso`k2G`#4AGs z8024w-^GldFN;HY&Tl%nfN;)$koEMbo9!Q20W+zQf@Rc;Mb7;jFV?Sg{MBnr39(%+Dq4~*4GHu4z{HO!_o3`r71^mow5 zJ>T?yULd1z64{-({l`SHb?P2Fy&NTZ)BMDQgjX_a4Xy zlAqC6mR$M$3bhk&HV8WH@$|uJkQs&^;;s_QQP%i7xSLzr_^rinN#u5Gaa?OCDdrj& zjLHp@@=!Wi>)kRTfvb-s+X=-=}YCn9wR?9vBQNoDjQ=q z)hY`oR?hT*_)P*>lsI-YfJG+@_~-?t%PzFOH@mJdL>Hs_0FO(pf5&BL@ty_qwZngZ zx#CQmM$(*fV^RLZep72NZB!p4sDRlZU_GQYa|xAa3!+1yrUo@}xbT5*X(h?y!xj`fDP8$JyLk3b`%xU3^{nb_IlxGOFK8A7|K$>fx;qFqvR^WF!3= z#dydHb8gxVuGMh?Kg;cz{@pGKmio_cxM;QHd^EEHcFIM#e(v0btS!8Ea^3aPgzFm zod1o6TTxc}!!pF`=FvchzgG&7i-Y5Q=E8pgS5r7BIK}{&!lLpZ9VHD-1bnyBT#&-(mU7qifQ4VIFQ~A@N@05|WuuR^BuZA#EvP zo&fBz+he1+-yNAj&8F&3AKSfO0a+Mpq~5gtt>V!Rdg5`lH~%yD>$)V_0y{6UTNF7J z9P8@(L!7T3Ue`QBo)gT?b}^?A;eGwwb-B}8&F)&V3RD6wt@uq8T* zeUYye_q7YV`@6oYG&Vgu>pb%<=c|L!%DdnNjtCfK9i!H)$B4{(=3UJ|&NxHi z`}e7T+z^&1e-k|Xxcw0`dtQ7)XtJO3h*u&;61gNO>@Aap6(SFwWt6&XAc zC%zlrvqNFI!uY8l>t(uI-^iKq%39f^vIdiqawC5%7+t$oowB!3!Q6k~FySeiZ?$g#1T#6dgjKjW!C)`Jxfx&BzOoSp(8 zG5ST>7%Db1PX)70cI_UDhc12I)eUw?lV9d8q4=V`JJ)orfF*txdIe^P z`M9*SWFM#MhjvpQTpplS=FJOy1SKi0zGC&s+^to~svDiETHrj+k4OVf-(~@^eZ;2VQzwt#@7!{r-d-zP0 zvcj39-|w%PXOyxxUbbR3(eJ~Z7tv%A8=6~?geueIe&Filw3=P$o5XM0xcK|`?;wcf zGA~D-FfpGa29iZn>{DOwi&l{!_yWX;qoBbfQQeFSUdAjXRM$pa`)znTc zbsn2e9kfK*)LOXFO0>REAx`{40`v7C+F|cU-rn9W_V!Daj=yMP;M@DsRBNU-$4T}` zc7-dWMDzd;$d~zJk3CJKqi_%Op=itRf!Qcg{(S1B@)5#f=v3sS_~&7`uf919S%HhMZ6-KRR9OWFAiHoc zaj>V_BLCF8I54xI0A-CVT-zF%6Uk}ny!)zSVar4x7}^d^msbiaHsuiEFyi?XHEicl`=$A?-%d{c9_S};28E}V`C@*&q5$; zNyB^<|ElX0PA8xjBHxazih`jq<&iW?ym@AZjjmWq(epyAtBva@UCfs;7dSx*SL~FY zoM!a2=UJ^YYkPmp;DXUgS&hKGLR%Ltfm|Mf5P`KiDRcPypv7A-gKCVXxKUz}_F>3r 
zKk9bMHOqf$$SC^Z_;Jsuua2C;KehIMesGUS_WkC5S}CP`ON#lOr=T|PNs)_3RrB_5 zhNgIXn(hh=q$`iC9Bic~`?yYc9w&dmBwx4zl)g408k= z7P}Bu3Jf~TUbt|fyTj3g*-oE|BH)E&Z+bCe6Uey@8>C0NTLHzY{7KV;RizicyOYUP zD4@y$;0=9QN*A}O@r5F*$`dUyF)@CeG*h(!w}e<}6T!TkbO5$Z$nja;*icvxqBeDE z5GVtBYcExJ*xdy!lPY#7ke=h;A9cecnnuJ*kV9jM7->+1WcAtVvXEbfLqyBjs_$^n8E%0zgn3Kg1X!U2oriO)lq1ORuC?6<)PH=~Q@;9e<|_F%W6i0@m(H+#(K}g0GK>m5+}G4f zLTveE(h<;4#W=yu^oRgWG-bbgQ)B{-*G9t^;#F{no`Fn4OY=KMUN1N^;U?9T1fRLK zU8%!iq-KBpo2R~C4t|bwH0r;JB#00Csjm3Sbso$ykhPoN?c2Xj#bS2n0+nD?#0Iik zW_e|0nlo*nEV{T?eC7pcwMHvZKg{DaHK23U1W{_Cqw~0QLgV_Z4$f)%h(jodKWF2} zXa%dd7)5pF_X#P>ZfYX2^dI!{W(Z`7HY1v+iOn11Rc5u$S-t&PWkm1Q-sfk}o>i|j zU`FP(jvQ&M-2}G=w6m)=WD#69HCmDWzFg%@g4$m7gz;X#*n&cAxi`#&^Yt~dmqq35BsOq#a2 zpRxvmnGtX%1J4dJ?EKUGJFIna5K4 z>hajjDC}y`yMEHX6c)GOf~p=GOI1CCH-@=A7AfKf6HPRPVu#Ct3EJ1t2+o1ScJ)uf z?QO4xRSUhr5lXFFZDZ(>Tc$t(t+)80L(!bOb4w@)?KwNdX(2l*2*+|k8jZo-(~pGa z;?`AJF&wCbQ8o!6%{JQ9rGDPqu5B_#k5qS$j2l{!CKS=U%K=ut-M8CL! zq*tV$;A0lF+{J$v6&v|>1;sxd>i_)UeH0n>gZDcN_LR%xRFc}DPHXO-o~}*0=TX=< zb+AYD%bA&4zLLL6T_>q##!g0ju1Aiv>+Bl1f0J2jqd7Ko(iDzV7rIk*jvL|@fhS3g>Uw^B+&Vwa67Kn`};L3XX0 zOP+;`?=II~J!U^Vj9QQNX}IRbh<>$Q-ZU?2vi2YUROfs=Ode)gaSH>vT4aP~l zL8t_$Xm3dboT%??|S`gUsa~Gq~!iaVKRMES1MH9w(Ys6=b>nSj7aZ3 z?ZpruPxm09GkHBQFOg3k04?SDsVWfUZI2KujpkwXm{h+MxD0a41=PX4y;lJ62nX9z z?LDM+8MQ34!wbbpe%6Z#LN`v<9#YsI`9n{Otj#l|O~${9Lzv>A8gW0s#lt7W6cg}W zuf#yi^0>PJBeK0Jgc>B{H_kqQE<~NXbYnGc%V|}7uug$P{f`4?dMnNI}=aCx>iwv5nO%qv|aKg+K%SQOiPW z{R7LNSGp|WCp)LE4LyFPb6)M|2*Q?PJEDiw!+rC1%$9m4VOF0f-*hk$To+W^?tc^w zt^@96Tmc5+#lm{v&>Gn17HhLNZeH1%;!Wy=#`q3)=D#7;dm4&|g%e>VQ4+8J>U;n9 zpTZHR-uPa56y6EWOP0qE!2x30MxNG)X)J20W+DvvN-QF#@ zU@~E+)Y&u9$QT@m&EJSCBkdZT#YPvXAV9XOMnT1-M6?9VJg6%qd5Z;enfkh)>t1Yrw*zIY zNtK82A0;;93U*B!pZ`b%S#99coX)$jk`z*3nd-NoLNuk0F%G~U+oehj|QvqRYx91eFJ#vtaE#er(KZZ0G6=FU|!8tHMK+zU28C)C{(E1Urn5`6C6(32CAnCf-TkeLGt)Y$04$jJD01|c1i{%^QOsrbG~*610-|JgGCVGi$SDLSR!%Sr@^FQZ869}mD+ zEX%RsPh;}MZ$8Ng#fbxd-j4hg1i!D1+(tSW@|iUo&|W+zR+NY 
zTn!drn_3yK-F7SIuw&mISq*(!y5*DZ7RnHq-1c`C01+$RqzM?8I%galYE1;KIW)Gh z??Z~+<3s8=+}6nJy6kH<6Ow@qLAvrF|29T!hVAd~*A{9Q(lzM+i)*&b@fzzlD0`UN~W1= z#R0vsyxe&A)$3(9Kv^z@zI_`(f4_z&uT>5@M0qFldEs&%GpBy~C^%)Cezyj>vL}xV z*=d{I)*9E9mX=mRZPeY|3_86> zP3d;}`bfzHoxL0b6VCwQ{f8Or4W?!e1e z|3zQ_U88>!+J8Os#f6M*3|2?m(5 zhv%Bc(1NdI;cse?H=`i`HcyBfSEd20_CF`4*-c zOnDTIufp0D$bHMo5DSl5XVPB(K^-|~htbMf@nN8TuVgKV1WG}lI~hES@37930f^8c zox$qD(Xo8e?u$%s^oTl^trdPa215t@C;q%G_a?YwKWavy{sKB zWwFw^GK+#;iCf=KL~4 zZi@20!)UNcQjA|ebqL&6OD56AH2vWV=k+i@@KW_n0ztb&uu{|6$&@deI8N`Z-@Y}H z1wl(F!dUz?RJHBct(>N1au%xgaU53k-m8|Czi(l82UMkt_x<>K#i@3`KT4TGl;#>i zOI-T%rx!fYa-CJBZSgm&+J-%6S8!yC`B7w7`QjE88~Zpc zvf~?L5IdK`BCiYI?v-WG@qI|Lhg=M>m;7<|Yuw)X^XC(3!@Gjsj6XBsFqx3%2jEYg z4)Cyy4(7d){uZiAiow$w6yF&I110Q?vgaBtE9?DF08ZMGF8MZfQ!jTOKUa=e$N<<# zlfFTM@dn%^^w{m+BQlvzv{b!v(kTE{>SQY;k%Q~5!I-~OEj?is0n(o`TzxO%3T@w< zADNz(3IY@Zz78px!NqH1*>k)V##sC^v{eFP5HdGJyxD&N{jC1<+pwg8K(T8zX(sTi zJ{niM`uo7QFz2zU&!X-)f+DvkzGJx7J-Xq6d=Mq536m_Y(WTJd*H?DBHrg7m3PDSo zzycH|2^3Hff+JD`k*3cTl=<=&>h-r_czRNJj~X11g^DiAO?!ZfpHj~<71Rz15?65h zA^w4H0l~d1^5KN|y2mz{xJdO?-Fz(Zqn(Z1t3;17^`|8LiY@5n7eG3_{%eX)$V#K) zq@j*bj$&Wcpb8r#qj^23({r|*gF$%kw5LjxaSev_^bWV$Hy7U&kmFN!%jFQZpGA2u}$=m8e5oHDFIqSe7m_b<3b#Aqpz?{wAX;&wM9Sd$hhI+!k zn_bo{voob6liF`Re*84xubTAi_tzaWnVy$18Zc8}{ks9an$iGj-BjD#?+=bY^J$IjIIpL{fNz+Bf@#5iqJtrX>^1j+Nrd*R;PL|eB zpTWBvj~IN%+r;tBziFjWw$yVya*5UZ>OJ}M6Q14AklLtUoCCju*qwU1}*Ih){aQo-cOkw*iJ-vMfM3U5`pDOmwY@8Pp5nL zIQ`iIKXUsf&oBMUYHaeYPhrl?r>%?)YZ@h0?16o<)4CBVxqiDL3*3t=V}k?_-kbED z7GXo*+Uj*+g6D&yfyYa<&R_!NtHDnmZ0e6~tDE!TBA(P=6@OkloU-NiVslnG>d5c2 zj7A8-y+InVccJ{C4?ici)FAl+(Uq@Ku-}0uH$Il$x9C;H*EiD_0rO0sbaW?Yhl-W8 zDMR&3<5j4YwkN^$snmenH-a@Wst>H^g7aw;x?ACZN=YI&QG?ZcmfHygK|7-DcIR(wlA?cQL!B6RM%&zy#1Nc=+f&kvShO>roKd&H;MfX_wf z$BbT)S2x^@9nuz&G68gAFK30*%CQ~~U0q#wLEf3m(XNegmTl+47OtWvKLfbCzU;w z#oX5Yp>d-uN?kwsY_oyjt2(dYmP_cH(qpVnV8?={wpwG>pG7GUTHm-MR z2}a?X<)E4FHkZO$fc`#6j~?QLh_!drzXtEd*zTdycl>1kP=a=J*Z5$+UsD;W`#dJW 
zpisZ}3TKGsh7of67Fw9M%jAjkan!TtuQB^CEBxWtYBYB4Nb?=ZOn3@B(bDgbPtaDp zw=Fzu(s7=29A;jOYDImbf7`O>vY3MgcbYZ6a_!nNR~ecNyH^6_ebqk*;_NRF;Uyge zF{hdO&5G@n3E8zqfTF@~(`9;C{=(|H69m=ub;a_=1Qly&6WCtmW-p2lPC=-7@*_O> zMMOBRyQFI3w!E)X<-vcHrjGJ0C+n)*rZO2~R4V1(L3e$wDjjxIsM7HWN5J(&f=gx} zp&zKhSeCg>qNPV1?Xr3$fNU-$w2|T#rEibCra{ldU^~6Tm9pnSB?s{CZMfCATU!&7 zL(``sfI(o?f>_)u&vO0lO_#oD`E9%sI zm}_#Xz`J<9Dbco7$BN0T_NmHYd%7v^q7~lMbmCCegjwdoKjQN%y<4ZU%}PqimjAOL z4gS{t5?~~#uKSkMpt>U2pWSq76GAd6;y(Kr;W?swWpC$v3C+3FATqAqr2QG;yc5;4 zd7@OIB4WOrwgP!zfQC(Vx<3<$_*>)Bz?;yg?k>MR=pVBVi4&op$N+-hJCLdTjk@>v zOA?)cS@UALgoo(Wt5>6xy11y86ZW&pP**)Qlr=R>9z(Kk3I$Dj&f<7Egs?j7FHI-~ zzf{;5-ceLiid^E?)lp&C&AL|rwgAI$soGixcIlqf7bTRT>N^hv)L$kP@2Iq^Ch@QF zT)*hiVZcE-i!pg5(SDw4%C#|WZfxOp%tEMYnXq{z;~y!}-lk%D;z>gwiFCY!FbPZV zyxg|Ax!KOxMGwDRzO=p`*!|_p7yOS`d;Wo1lw`;6RM{W`KqgJBudf(V9Sm@V2s zjt`S7ynYj4%fXj5H$rk9*j;8wS*Vc%mxz6kLnO-o6{0MHS|Gd!2zCms58tC1 z3u%XBY}J*SWw;1td^gs7GlKqhnmd)UBU4Bw6Ok=?z&}-ohGH5OQ9E|{^2Rbh6_X1wCJzg+kCTnkP-8h!HQsQWN#eDPxk0?fj zo&Pth)ujTi2(Qx;*}tCx1=dTR%=#j{>w8zC&0VUPZHRjY!6!=t!=&{;S{wMZItH+p zYWTRO5b#*4$+`0qX@;?BB4(-Z2=Y&!vpu0?eUNPITZ$`j7l&gpV8@GJui8Qt;kj!O8=&8rF++UeXWDzlZqbB!JqZ=IEUhZ(x`4u<2LvLkjTM~m7TaZI zW%~Ds{7~ze^8-N|4Go=PZL#FrIX9WhZ0;5SOfxopPe!J7q5T4@iMhj*ys_P|B^HbA zvG0x+fY|jP6jaOrB~AAJeamz#ujrr9Q+YPAl?#%!^}x0Cs*_NI$>4)Y=gw-p(ar-~ zQ4&sB4p@A^13%5q&JHVx&Wpc~wIT+%JQcd;+cvU$*rsqr)2qvu4XryQ{_+Z?)-B#7 zocIuJ40l=BxgsFme(awRBOg3xv~mqP&|SM0p1db5h&z4=W7*F@l$i2tUxeJ>jpi6Y zq#WIbOanvYK5&baxoFrSuA2`wxRk&DMOSCjIrO0GY&p2xH}92yJwY$HH{>PStm(I+ zWwH?cLDPw{VuMCXowISH5&9dG!jLfe?0W)vx%w@ANPQVS6^3jx6!x`-J+ci@#SnB+ zowkzxM+^iysHx?+HF_ zVN4bNt_HLOe&!^gPQhqLfMy|2C209xMS$-y-&=Lb(87&3y%v9_1q$Ic>9oLEWLFKk zV{(7A3q5T*q*&j_c>}=?T5o z{hr=Fd(Ae3zvGZDgLxE$xA*>GCZZ6~F`4di=AZd51^dO9EM6Pm%}adPp`!+sS@FsA zVHajRfr2&6r-`{x&Nx9=w8%BOLq(YMjJrY(+u#2k=&SIPzWV!2_5;dOuNY3)v~lvD zl8{knO?%{}OdCVGhB80r*v5Slscz7=T1=NxJ-r+y!>Jr4EC}q_hklRpWFWr+hlqo! 
zapl^j&eO71_=$;BuHrVtRsoeN_$`T2Df7t-Uuyy^fJO;)$Yc(+zOS{1^0tAusaiPE z?LT{)XJ=51m z!94&&{J;e#u`)F{Vifd1z0>%{g|t2Oi(S0q&t`5$=2}&zp6P=qv)3RPb5qB3Nc4of zmXR@foSR`$W?$9RYQfvkP|sA$zT@=+Y7e9h{!z!yfc4<7Z~^lS+X&lk$tVwvGgONe z7+>XT7;NmtqGxkV%I*^(^ar=k>DfTldqfoglHFXimIXZX43)?iJIc%M3}kYHyN4no zr;{;LwQINgA6!oqiu?_xG1tH=xpJj-PM3H443&iRfRB(>P~JzhWK`@7y=?XZFJ^($q!J^f$HH;0V5TVk-8r9s|jW5h^S?c+p_&O<}cTv?#c zevIT9M@2_7>AxYp$)_%+g@uvEB<#Y)}%R2zU}xabPzn{tPYg3lZX*RdcFW8 zh`Yh;5LhQtAn5~3hds?RihNO^$vtyl!M;KrYGx39;dMdu1QwgsXMNv{yt)I0(I5LP zlr^*g9O*R_K5Qu{7_&c9vzMTaj>PrEA6||&NAw&TJ5_cS1I^Sq`D+_rqM(a~z-PvxPQ(MS zL`)re8|6$Q2O>yv<51XAAvcxj2!y5ErxG!9gF2`Nvmab&qQZl-_BJ2c{0v_~EHUH! zyZd*eqoNqIhsQ$we1>xrroq;URZa-$KCy%v;g+BD^Bi)Ui+7xskR0dEA2&|`%{6;? zcvy{RpQ&4c{rx=`TiU;ZO2=~1|JKFAOyXKJ7v;gBU`%y96z2Q%RwmEg63P|?1sonraR2Tx`4#+1UAO5;70X(F`tw z=@Q=E6oh<&HQocU6pc5M3OYK-Zf|ejED{6>nLs@t!=wg0dsC)Gy8FS5U2=wfz$BH$ znEFLv20|UdI8iSG$XJ8z1g2;6*WN)o8+rfXq7?;NkdSKyMBj!m@cZJhJyJ1fd%{T! zK8cS|FCZm8fL>0K9v{heAXK&zSZjNVmKwO89U~0$$OQMajpmicu%Vy$14l#5UcXc4 zy-1gdcRfX!ggj|FC`Q*J+645q`qVa-@29x)uJTBDnd|VBQGM#K_g&sUs z$ctXE%*7#nG(X9nneIppng+GjlAbZmdE;vfCh&cdi61HTm@bBV%BidrtafF@+4OAZtAHwEA?TuaLZel3);by z5&z?rXZG}f!x!>|o#J-#rMrGHMB}MPi#IdD81m~?Md$6hqB=kuI~*s7TVKNq*DzwdRIe9%*!u>SLtl%&?Jcs;`!Q;5(x0`m2f1e)hOJ7_;(q23Q$d>_0~So zc7c7e+s28WK|Q;>50KlWl4nkDoE5(~k~M8R)!id%fFY+c1}3IUe%T5GxD))rE_{Q^ zl>2&_^TM7QkB!;Zvzd%6$3>4VKUikZWv-q@cfWhY;eIEwu*m0$(m|l(a}`JjB_~xp zdGdr_ZXWJgAEcJ&Y{u-yBXrzcSGRi&F3?+Oneuhn$#7BCC=;EQa;z&8?xYjlaz5LP zwu##GA6qh$<5c!d(KG)|YZCtBrS`H=p2sdj!rLC(>cO8sD}1a3{sbZY;;&j;TknFi z@U@ogLr-t-p&~+ufsl;D_5Y$6TsZ+YuRlWW0vK&fsi~5o0yQrrT9*kl?Z4<>W!P;1 zgBE^~{(saCZg=M!xBC0+(r7%`tmC=XVQ%dEm{e0UsH}Askl|B8Oo`i0V0IvK2~93m z>}VqwL)9uW5XN5ig-l+Wo$tUe?*YU;7Kn$a+<9cI&^4i;anIQ}Mwu5?r~5m)y5Avu z;A0=Y!0wR;&-!Dr$s7t(=N3P4RI0rTqh1~c^RkUMeV4MUp`8ydo9PO{CdbD;_W=vD z(!I^?%}7r_nPOLUiYXBDHF3lJB|Utr-zheS$w9W z(JILucK8h-6S|UGUEjPOuT|b94Oks*Ur-sO__-1>;;Y{ zW94VCvT0)So(7l_p7)tsE~1I{NLO2#)Vo#)AcbJtILHsf08-MD8COv+7u((10yi9J 
z8wNKbQHCa|6O2oq^~0HoXQn}+Ee-Bs`Q z`F+^;C$U132_u>Uq@9iEafqPAU*cv&5`|4xI8MkL8 zns%tl(#mQ$U2ERcpWoUNW}n5|X@*xI7h?>CB2hl`kH>>9+%{Yil`>{L!rI_>Ia2(D zEP?ZQ-X`L2D6yst6sa8({eIBq{%M-;6#yk`rQgI)9D=(eImc*god*RLN~5WQWXq9N zvY?Df^2zj84bA9aEzYAbnT=h1-$W!tY4r%&I~+yiIqbQTY3ZeICaOEIVq_ zpC#mA0WKzxZg><;jH`M$6X;cDW}%@7+}ogK5B40k9EA}1?2?j_l}d1~p~9rZc--bc zg%I(B88DEzf6!_>z{p;uvUvSE}z$S4+tR5#W1y*V%T*r!e43N}M z?66FjL0~<}jY?2msZTXHtjtYC$}xco%HaYWnx-e5Rwf*gcjnpCM4k7SzoKAS zkJH&6+!F$-Y>ffm`V%O6yaypWOj`Oo3vjWHEeCk757J%VFnX)ek&RoJ#C>|e*LoKC zj^ddFnMa+dn~ z6PJS74AA|ikXRdh#Gbj8OnW~K^#3vT-hov2@Bes7MMI7)BOO_ZjF2r3vP(r~=0V4v zA*8{v6XBSpB;^>{yOM((BrAkM#|lNr{#~bezsL9e`}}uqx1;kq&+B=O$Mv|b$2a@t zTTZzbk6j*K4t?ror+Y?I>(;mB<># z^GTXzkPLW&9yIGUlcc9IUimCeFGo!J$LtZ?IWlyzuMG`UisJ*NQsgCeiykBh- z&%jpg zTP5*AXLL=)Y_R{W?$hpe1&8sUnXUb93R~upPV7W|Yf_Ax8e{DAv6j^#7)nq~7Ud=U z0fOx>k%5W1ek=5!Ja+N0MIiq`FwO@Sq291H^0|yZgx9-lGf7DSyep*$--0atDpgSJ z)%|cy8aiRc3fm#+WLpmwy*q?2G4EI{I(6&M-{KM}g3FL%w8_HpBU;?^rl>HTRRwYc zJ-2tIHY%`SrgUIa7$0$QhLH|(#w%kgJmZBArej?+X-2Os?;Q=dG3t5K)P!|Y6{E4U zFS@Cukdblj7*kBavFTCdvjF)P@Onurm0VQLO95LMi^M3?G<7IVeG=2S>UmvW-Afc% z_ymIsjc?IX2AVb3u?lxS3`ckNcxyFpUob}3K82Z%0`T>UP86@&X(+%ZO@KSJJIwyn z#fG04-bmY6B)Y!Oj(5evkz6q(2~cPy9P29u;<^y>P*i2&`Ln|^2+^Qrv*GUU#kKCI zhAAG*Kk~7kEyAu_V5VL#uz%}>nbTb0nyrm`J~48l%3nUqr3Xy7^}rr`E?NyGfkkpS6#Vf%NFJ3<+nuLUe5bJn|S6apwB4Dp@2v%43iwj%(j0>&%NTq(Lrk& zUwQC$+YN*hWuHIGjlBqAvyn1rTgCMWxJ zP~Lo*<9G8*NR^&XdscOkM~h#20(-lSxPJd_zz6y6E^efhI0$GBHw;KRzrQlRx;T9( zpZCBjQGo))6zDr_E$^n(K$_HIVu>#L^tp3S?wK9rZ1+UUwF)&KFRz+gSOw!xEhmXu z+XDrWb}vr?+E14o41mOabGuw|H1)3wt;c;PaNxFEy{|9=591ax z$UWC4x2T{>l|k1Y4QY1hb6lFPudm6Be!K(JStgMVp@pBnr^Asc3 zn`TO4Jr`hRVMBD~po+DE>CFqsjPtde{c`|ZwCZpICzb#7>kcYk<7hNOp>~iI5k|@= zt(N1X;>yem!^Y^P>s()2x9BWeA?XQA2|e;?2AYq^E~tR1Qn(Qiv$} z4mG5hvo5E244`JaVG3(D-1VGe&RvwKkP_GQ$!Pgc#+X`lk{`xg+lV&S4IMat5R+Ge z(?&l8xMg&$b|w107Z^zc0=&>63u3E^`{F zE%;_VcGeP=RMGA=7$lL%qDGFL!P$^3*1W2gP@rc1P1Qb9h~57CQ)~;?<$+_)&XXhU z|LeDZeIgMJxbx%U3&WY%4^q27GygV?zi;;DhBsH?`AIu|y%_;mu3F^st=qTBREcd| 
zvA{WIiXC|Sj5c;UY2`qI?O;b|XOVmNlM8CELVGAF;G^HwALhw?P%i@GH4$@b%aYsa zL&zo=e1_r&K*7v=>I7sfLT)iy%6mlrSV@{T7>TZV0P&{>{AaX!PV34(&W?~4y-pwx zs@!1ums0Pi|0pJ-Zv74tqVP~ZkYn#lk|{GK$EGj2b|-gaJ^z59)I1QDX0ZDG+wI$y z_(i&HM_`_U3ccyZaq+zl3iK#vi0gd#MOO}Yr0P!cV*pB?h*M-KbtSIv>Gg7X`I1a$ zc&<%%Mw9h$5YlH5aDDQaTb;Pkal+qZDA<4xzQL#Yv{QdGo!?)t{gFSFC#slh#OLh| zAVZbLdT~kpmj^nL0uHg2H~K475B;&*1hW**CBHlWwbLxYwnjXzS}Iy}pcCPyIR5KQBn1%GJddco6)O#0`w6DKe#Cmq~l*z^}>uHkN7278JWvNyjTOCV1uCn zXcde}IwvriOs0DP4mX_ZJ~i>93Ae~~`+_EpS^H*fxAzp6@w71VXqUCc*groY`Epg| z9V6}hkIQxYCgeSOsAB(%$9@*^cgB?W{g;C!^1}Dj`lwqO4`0uIo)0J6&LZ7Eemj-4 z!j3SyWckI8zQZJp-*Eh;BwV1UFtp~?7U9XI1P_Q@z0kt7 z@{h{&e@rT{Kp0{~eH-ULhQ8|hv+?yJ825crY|F^VUwR&H$Kmyk$BP;iq`ksv;?{9%OAe3>pc zw>JzR?Ya$A3c21SkWuitznH>0FN zt+@ja1w8i}wB&Sn5jU+R8f_C0a^~@(| zQcPtY#{6;AKU&6r9@vQ-WlK%;wW%PB-ydu$TowZ}$=FVJ9aJ=s?6ZXoS@tA%Q_^8^ zdA-JUmksouMK|PMEX>3VR(dsxFBncGmG+;X2IV)Hu`xYy^5jW#r#s#=y{{qe+rG*- zL>1JX#{}3^KvEGf8s){%hBi2FUY)PKiwPaK@*Vl?1$wA_5L;I2Kta{Tw;| z>?UgNPk)*VT?uOp?kqjJ9q0`ttd{VG5yiL5j;CWRkun!B0X&GE`&liSc%44gq(Se_ z*2ti1aeWVd@naFKtR^^ZdwMYKI{J?Jo1LsZ#>X{SBL4I?x6}R)qiUuExf#VUdpF9P z!0_JfDR$()gspou4-iu&4?VsB#`qhOgl=Vv9*kpFQfv7gB~_qR*^1BRiY5> zyw|hpTXZ$q>$qZKi+tm=+|YAJB&{k}P{mH+3JxlBAYs8Uyd0EKYAs7Bn;$aJoPnUF z=2~Zf32>Fo+e-q*08G#FXK`|8?xcQNqlKwae|hmH{KG%P<29v5@)v5I$VvZ~@bc5b zf}?rv#ryFf25EOHp@yb>hhk)z=y?B=YX4@5%&lMJAy7h{m`OFZu!0BJtavB?3hG-C zXta~oHl0xC?SqrgD6`SQ!5C+kn7<1-m+znn2eTLG3pX_Gh%5}{O>cI*g1PWwI~d`S z#cdrz%;=%=HSTcR5}9HD%%iR3LsOrd5CKY#RUe4v1lk=+^N)qI& zb(-v1?i5iy#+RLU@`8atYd(cz8rZGn+~ zq^RZlBk0lbAKp76%-)N|5K&RTP+f!O_UYOOhr337%((b*Syk1*Nf|yT^5{aw2^P0M zx1Z}x4jj*P*WJeIKc)XKHwR6YlhNW_NK#+Y8-R7lixAq5+%)>ni8RkWkVDDXW{K+k z$ku=tH0Tb~l}x~cIMRgmFh%;^#3hEoMg>=Ap!>A#-@jizfs5%d*P9a*8n3h)tUAn# z&^mNyjXS5(Cxho#t}0Mnk1dzE&egP1TgNM$2YX*jow(a%;&H>;sEV5#FJDGBjezqo z!4Sm-tH7wu$!&6&#KP1+7h9;wo1PUDeDS~@K8uka)OK>n%Hs_vU0C#pLvr5#$Aia` zAH2!7@!@|wxHWW_YEPr?A1{%xfT^|41N5i5x&Ie6p#`u# zbS*ckc!#(wQf&B@h{q|^rV5G;J3)=;drl+;^#TI{1Mu3M{gT|Yy!nrDj2=bndGmWE 
z3YD)5ePN7o+^X;j7osKsyObvu<2EJtW}$lc{6J9Y66RBl)2t{|XJI0x#Pw^_l`UvN z|1Pk5kt%AqmH?D}Hw!mn$CuP{smdw;{Z{UAlO0lQOSHznpS_Yg-LGX7=|PANign)p zSd(pWf3w=(*DOhSo3gy)ZgSFhTJPFj{Zr?q(3_88^iwzEpuEz(LdrW>+-t_~$Pw`( zw?68P#&0*fYgI5CH*OXRv4T>{DO6DOQQe=!-6_#v&^Qx%SMD|%ff`dKR#&+VYQ-6% zmLOK;od`htJ3gJ8N!on^C`Y8KkRJh0hwt_eC*I?wu^UYyC(n}qn!}ue)&U18s74@R!hZHN~cYo&rVpREK<)`K3G)4`Fc z^Tt&up}cjIf>fGYU6%n!zd$`|7jW_EoKsepH6qiHY7xCb^NLc*Vu`N6X|05K)$1)Q zG(eamBO~9(o#6b}OFpu>;d+!A?x|Oz#n9hd@UwUIhKW*4GiNu}-mS|K$Qqes(B4VZ zTASi;Z2pt`Ac3YGI`gAlF-c8!t7ixI+`4{{jcws=w%P&O$9sdd6*A_8Y6_h{!eTKG z+)4$itC1*;lqnVFeyFX_&p?WbirSE}ElZ#_)Z93L2$0LX2e;V{5qqM*-eX1@qFjRO zKE=4^&pl{GZ(@H9K3+^w#)<8jH|-S9`z|W*v5ytEXLYfKO@u$hH8wVu1;8r1UcM-G z_TNEk84A();evxy@?+WYKPWj|W0kmm+VJuJaNT$kD?T!g&#W%)v5!mZ$ z2f7}w1AXJR13R^#BwFD$v^3ZxHosN>%4IVU9l>ta#t9N4GBPp;nR~-wp$sG&_#>HS zf^H#a@)>;&U8V?bJN&b1v3$uo3|MBY2$SVsOMKHo_N5b_8pr;6o}X#id}y|byKX`HQhap#C>pY*4a>!_CRNzwP^-WDpPtbX%jB=&u#_u%|F9Fc? z5Zz-CQ`qkWm98^SW{qHB@R@v0*@U_fRT1zcsEyo?>S` zn;FK&$+@X_=eruAcmR%Z@HA@emdFj23*1ul?(O`+Fu!yfm9B*Xz;nWqv0yI*s$4MMEVJfAK!PKi+^y9(}Gk7 zOoe>E$YAz=*a1J(WD?=W=KpgHr1;=3?2UWM4`jG@;|-*y{)HcPAQeYI~94Ah1Jj$3&D{{2=R_Szg3=)OO; zHL?qD@<7?}r?duMv9TFTe~gUR&p4ckuJv_J!>%V_odF?g0~}>g3JRW~^JjE8iO|sr zZ`z5dH_s^vw7$_Y5*!>n1Ffe^sAwt=^xmA0?)b%&`YA!#U4fBU8n_oarhWUr5`g+s z0!_PTA#PbcOUgAscV2<~H>GZ|$3(Fj zmJx@keHIvVL7sF}z|cUBm^mBhT$6sBYti~4G$w)@}w;~t61 zWDMe^s!H?U^ZFS6F`Ha)_xx{QjgcA@oJOS<{LB!#NYsEE z2ckE(SXw3#Ao;?zOzaS*Rk zh}Y%5+|gf;biZqBr08L`9PRDP4ohGCMv|_Td%j=nQOejrg2%Sc)TYe zrxGrmFJc9CUH8&kV>U}4m1O;?zN}Z-^c~a}_PH9;Qf7az-C8@knz{F{=R9!}3hwBdnnVDp;jR7UD-y?0C1 zyF0b&c;QBfPh~n*7X*oF+I;P7WqyJ6;momRuzn{lKCXr2UZGWGm$hvh*ES8*sq_Z!;-a2^B3I6@P49n1&NH?zD9|y1UvG(0I*P6?D(=bddGP2NReiO9=bmh&`_shqbP>!Y4)Gw-GfxKV)gJrgxZxM&To$!k zd78`do^#szE)T2|;8r?8TNY0<((eU+a%62Y+ zgw9a^J@a3eS!tB)7^AVNC%uj!Dk`$G$KQYpY&>{ryV_se@^}8eEx3o=1j6L*cmn@3 zNfnSKm{qA)(KII)>%>VinVq<9j85n8mbiF~X{!?5nj9L}1!*_TBTko5bB5PCZLY7Z z?T1kmzUCB30@epLhC~Mj^w*vn$=`SVgZA)p6xCyAJRp=2(Bv-KGxEagma=6TQ0ztv 
z&YgpsT!EyUI&b&B4VKoFzva@*@i}OzrL$9e+LUfThXLJwvkUF1MJEkm%}}k0TPu}R z>vH}2oF8(P86n?9!~o!c#irL22VjTAYb!bK*BH;LEFg<9lC=@=mjo8wkgFMwl~H3f z?F&m!)C)BTf-xRD5v-Ob1D|L3Me{1WVRUXN$qTs2t+pgRk%jCj({(c*^;#`IKR;pT zn;#x=%%*vf%)BZgG^)^25*l)igF&7C$LQ5m=26cR&NT_PIDZHE{5q)R_)gJYEgBk+ zdFUaP7frl+&DKau^bb4{cp~2kDNtePFRlpd!h)JQ-Y!;@%Y>jR8G+_CcudEgAK)Bls4JUJ@5@D!>k;U_d% zU0T=)L+DOu)VQ^oYPg>dFLd&UVRq&6)1i2VmE-L;Vuv_bS>4Wkquw8dK%0XGst10Q zpS(juO@y__xzjjh(N9gVYGZ>-S3=Q@h zXG;c31BAtcM9_hCke!xt?qhGPP=`Y4Hbw;!^&V)Pxuk#}UiC^c;TKo@07DX#rp8B) zKv(!2lY)@*dCL$TjcuI%l(Q&m@|nAmKCNEkQgisu&S9>9jgY20Rsq>sQ4w03$XLpf3-HqPRQeOV(pJLfQsol3!=UE zVan?q?AlpFnGT8yyXbk_k#1l%gmH2N={PKB*wdW#iu{t&0SA|X(v2dj-Z#))yK~bq zovD<(YbPouX6`w$1?MY^4lKWTY_kH^pyfx=?YrAMAA$1pl#np&R7VA(P}w&Jd+(4O zL;daER!GXUmCo!)T3J<#hLO-8L*5r=SfF`$1WCZ-&5!TE3fiV&Q$P3)eNq!-Vm56Z zCquu$t+`GJ*xW-aeBM!Zw(wt7W)I$+Xh_fq6zJ;(=QX*1%qdyCR*8Z%yrKQE! z((-yWuhOHQ9Ps^fV*MEqNoVQ^Nr(T#)Bam+`TIY$q2b55L`nYjhg`JF*Ax+Hk4rX< z!{aZ6g3!k))Nst{GaZah2q zY^I+1{>>EW5!latqGM{@onoz$p14_EC~K|RpQbbL@@^MrjcXqtr#xkm=VG~5Bz*nT zp5CXu*ro=EtjT`Uj3#tQPMBOOd%}3Cb;;kZ=44MYj8*lDE?g)fX}sCug$a!`o$l9G z`lFlZHcjykXTY#ofY&x9eWm&Cw?!U9{ZVDp{H2u%n^M7*K;sfOR1&lT<4Q}*zJZY=YhtJjheD~(__MBN< z^UZ5)huz+K&1qvPA7s)pGn_&4XHpzsW?*FeNPYA%R`6N5V0a7t`>)6MICD8uy;h0k z8wvSPzZ|I+s@;okJ59HIGZN?7yRzIm{laqMNoBX0e5d@v)Aq)gxxVFe|Jkz3m3;DE zQ^btUm^7tr6tvg2QF3kDPWAtMx%c9>pmnq7e1}Xzw^2UZ3mbG;#JBh@Nuq?LIh&Z& zfRMR0KCz-1k*;qRB=Lg{`#LY#WyQ}}_$V(i&f(lDMM>wJh}jU?YXC{lN9WYH!j|O3 z68*0G@z2bzY|1<2Bi*f7LY%yIp4aTV-tx-oI$Z6o>Pl-^;G>RG=P{M7@NRV&H^xm@ zXyzQE?T8zfiMhF0hUn-<&PN&Jmp0&%bL3Uz85KPWT3ectjU&Jm*hgUCKbJtiEsspl z*&Ee4KS9C_EN?{2thp08>8j9esSo0x4K6tD&fW1=8#|#xNLrSS)MB@asOID2JFe-k z=jg+cxDqvz(o1`GOe}CU$$axOv9eb;N>BXr^UAe@){>Qk3z{4ns7plsH=hp5j+Q%} zW#9mjs7=1n>PPi=^~ZZtQu{~VSI1yfUujLgSr)P;)_p2;ebDrM4dscouHJ*FDKf^J zTPltg_i&@WArXah*6ByiWCyS3l&Wj^gp*15ZBJyRSqod=MRaH49sO+e~+)2R8 zU6Kc_ms+i1@yia?G4I5S-a(cu{S?te)r*5yodmOn>>@}b`yxemCc?6IJw&XQ*|BVb z$b$F{u5!tiRMxq(rKbm(LK2OmI+E-n@REoQVYG0`yFRa@xAZ{O>64YD{5Tw#W2Zvm 
zWbIsCJWjD!xuBiC!)%e}c4^Z0kEn0&_lZVJ5KQAF1A)-uzi;2ZckQdLX~*^rJDNMP-3%X#0wCJHlT^{ubsPOe*B()uyyV- z$WlgYW{HVeYi(lgPgSe2> zbpBnJOKQkln8pp5aSD0@wtHEJQe9_?FLcGJM-S3kLphYCe4n=MlyIq5dJV7IGzS}S zfYi{Oc;?b|0?R^g^?Zl%At7Wh&#S8Hp@yjj>0G;@giS$}<%jBQ*3KB)LFd}ilAiJI zDuWts=Sw$k-0*jLSMVZzt-QY0u$nLD-Eqr2AtpbR;2;8ac&%F%+8awBijpopIrjkR z-c7W^ffH!}xKA%Ua{}<9JGpan$oW1HY~jvK#st=^_DxpS=|)2@{uIi&xgaE9&TlSL zHKJB#p0CuWEn!&E9-$=EPe7^*6>q4|G5x zAXUgwm|>5)1gF@Cero5VuK;h9qoT;R`h;6?!0X|eQvx1E1(e83#n{OLRs9OFHq6 z#F{MgBVMoHy-N~0UpZcIiz9t}Kj0VHeRUUyj%9= zIyAkyqrWE80HrmDcs0N{67U-NGUvfcd8gEt>DsxBdg7ce%Np96cuTmDw2%byeJ9!% z^dnS^PrOq`Eg2Vc%*-;xiXsGHu`S6cGBG}-XLTQ<;-vFF@eBlIuu!JM^l1}2zb|SZ zyL0fMwAfoZx0Y{BIzX5a%9)N6x;!s8Cx(={e}N5I=><7QDDjw*W<;;rZV9$k9zxRb zN`|+6&h|h}1gyD64MlUh1=ByBAw_tHA!HTw>rOYfx9#KPV0}^&Sb$yk2^r?jJIKy1 z$CLHF%R8@RaB5RZzyYm*q`J=5DfuYAKlN~(|I4r@A5f}wfcaM4TYW%MkIC)6oZTx+ z*}qlJIe;3&itKaBRv1^o)aau0muJ}v8iIWpra-Z0w_m!UH-A&f&XySX_wxujjX+?{ z>%I)kILqPC+S>=IcKkVHr8Wxwt`=5?kAHshzryV|?6y6Neq*S^I#f#cWTD92(m?5j zHIoin6AO#(PQtY0E|90$!W9fb5RHS>cA(5_+F;=8OaZ$`)ms+py+_43pp8NstHq~a zVsqt@%Ewzm5eudCxdL0~dUj+yNW*2p@1~f!Rf; zLX7D({&1T7^G8eR&0ExaXp%J=E(>dTE|lQopQQ~e;0j^_raAV_c)904_*(0$6B2z> zo|A8RzysCsWZtp9Doga9DVk%JJ(z}N?v*uGfK^jy zQUJ;(XTt?)JatCIr~Mh6{7P+5q0lzHKCMScc{V{jODe8|=VblPoA-Z*<}to&$~Ei9 z5Y+$wNfqrirQ_81&g3VyF}dH`MoBVN0x`w=HwJEj%3iy{9gvIQ@0M-j@d~PV$FeZx z7uy)r6)|c)bGw(|*IP3x-(Hk9r5_=d$!~0$bDo!dWJnxps;f0$My3TkoB?ZXj_b|8 zq&Rud-^Rwq3*?BRu?8~4iXIT5D|XY&3F6DtsZ>8CwK;@EYf@>)L)I&W~?5kZ7PZwJlC>=4tlcmpS&XB@1g z6Kt3F=BMkR63aa`1R3_!11I8Ce==AFt~6V>PC-uYsw;!4qL=CT27Hmy(S!1iSFT*y z;tx*_ObBmke<@OY;Na;hbf*kYo3=yBxNjp2#-}_R>vwc~P%!MBS;pA(sOrP3Y}nrp#esF9A8+ss7qZMfEH_5Q@2}+AX9`NOi`kz z7J*DZCk^6KgQ+r;I+vDiuKM?0$ZK-g86rON&iQfv-)YuqecxxMa~)iRJo#S5+98V4;*)$R44-p z6&1m|;ac~r>q~tOK#k9XoPS+R4PT3x(+pt;RbV31)32*#Pr9dY5D7F6xY!@m6j-E2 z$_%=3B({aLD)%8@r*sarIF{9LQ90hf-zj?HM6pld#bUEd){2c-`xMBcJld}6$nXT8 z`g7P^r{J*5<^E4j|ClzU_(H^mXfu^<6hc(F#wC~YK_aw{PdlJklvK*3Ty_GaIkLqT 
z7OCU%-6{%)m=;ve7CZYj=93d57&FRy_>u}T8J3U?;+^WbH7H~|s6A5m2#t`k87;I^ zepDYkpJvaTzk(AbjU8LdPynGhN<>A2+vXzJ_M}K6oV@~4bLz_2`>%Y7H!Dw^*0juP zt<07^a5PJjkFgXs6Mt~8a2(z(c8Il7RktCqLX;$qV@?*mpOQcGGY@e5Xx90_;OPns zz&&}#Ox}&q`26`b4vcKrr-1jDBYkr*Z$5tf_~|UehNzRAc#s^`b#ddF^1BRmAPUv9 zP@R$e=?vcBf^nh#@j&lKGgWcp( zc`(3bgQ0n1ys4AumlunfZHo^6mp^(9sD9Hz#a|S8P0RqPy~2BXn2p2W{SSx6uvnY- zzHT?mId2Sj!t648J+H22E_r_~xg7YgyBNFhP&lRvfct+^+3#O;hS^@K&qDcrH}B**(K$^byid_d%I`PB(loA|uTRKhK_D5@A#Fj13C|2<{Ch9(`Iu-qqREx-pqt1~k_eUlPNqQxBKrL1p#ZcuPmz2Y?t!NCk zAM2s+>|tcOx;q)}){O`J&l=Efjgr>HjWGV#R>MNpzY@fr=7(1!MQ<4_WtS+ZqJ+H$ z1;rIXaR_79$QLzD!pPzUO&{F&h#p=(Zc9p(_IsCSk@@;OW6 zMbb(&7uC?+mg`QGXEoC<<*Ps>p%$OpjMhcXcHg#|@|V9F+8ch9M1EsZ5M=f#LE{Xv*MT8%1d6pV z;(Aun7ef|n-KjTkb?RY3sxDfgtU&^p?hi)7zc~_N*~yHAwDijI&F{S|*3n*OmoOXq z5QSzJMxs-qMD;wQ5>M!YXC4USaoG36M;bOnoh7dVATV};Fn^_h1C#uHhr%I;tJuB+~QImj-030s>Heq`;&u(Tp_pb>;*6+rkQ zn=W?d3eFF9?JmEwj@h~hD}rkF2t$@}rkUv&4~HSEweD%No7i~$16-MPN1_;il%%$a zS7sv5YS!SeLn*ZNOmAlI7OUk4c$bdeZ8Xj5BE`204%j+*^(=?Wxu>wI2-qt24&A1$ zvWomFTe$QsDJm~X_FRu|h3F%P(9@y2cREI7+L!C+9N9GG#n0(q>vL2P&wvdxJb59& zQw?hpG;16`IlBZ!2>$$0dT9cR|9272bz1}M0#`1=Lb0HE&uF`Ox)eo)h0frycu8+k zD@tQDZVy#5*#siDG}kp zZxcb+utkZgpFAbI4=6L5s5{QZb^R>XRhuQA?_P4EOB)QA^Sdug*-h4VzP-+oG-feo zI?)yUt)FI6|LO(2!7}V$AC9#Y9r#N0CdeMGPX&BNU-o$g!Q-Thhqn`i-bN7RDk`rL zSzr}VOgwiEI{xBNfSoP67-iL5^_FP5R}>U6&%)TVrM_JI0_P>+MC;{*gox>Bft?Fg z^(@wOTLGPzjI=z-G?wwVzH!*R69l<@`f0nUocL8rM4zySTjL@jDcd$`;PsWP_zj!@ z4^jT<(A^RTLz}Fwto^5IVwfdx=QUPbkyqYD zeTN6Ir`fDGrw)ocr1q@JAzm_uN&n^o_^Q85dex?-nWIPFQO8GA9AvSM;7N;t=yI`r zA#hgiDo!fpva$cX7j?WQNRd__BGEt8|%otaICkPRGXq z8FJ-A`&|O`*|WOt0$Lhh!U;;II>riskvs*&WNsdNx7472nr5&BK9uNi;!>Sn;!>7z((IY<)ao3c_edRDm*&3M{@ug3YPv&DJ(IqyOxKKzADJ&r;E zmj2W@LObzTX>@$e)xU@-d|pO#kp4vx!@>Ujwv$#TIk+$#AG=H$4lTX1y?Nx1k&q#n zU}1)d^P;%H%R@VDCDB8ww&$`bhXXu6HiX5K(yna;HV4|>%=2HE?XpsVTTH>MbbpAV zZd*YBmC|!FAa!S;zQrf8@^w`enSJ8KZ^Y7Y7$yt3L+W*L%_T5#9}K=6$;6BhmX?=~ zAB2gsNfyCo7auCbiUmtZzE4=9BIPX}f`8l@h>`^?|5v>;wbS$7&J7OnHjA6;XY49`N648tE;RSBmu;}3l&&M4dQ@Nb(PF=y 
zoh*8$P3TuF5pWY0Ask_BC`$=hxEhKi8VPiHOmI1FopwJ>OR0#fsQ6W=%-+-~`?ey- zm_MYmsED2DZhPFiI~|Z3q!4JObosqlwxc7|oD9$0T9TVHMamo&GVh;8&YOWkh*q$# zvGzY;)qVuPsLlxiH&Dh>^xZr&BqzyGLG4ik^A=ZUwVb z@E?(WOz=(fNWmn|JM)a=BUJXD#z#G;wj0&A^rrCT2o<#suz1Hw|M?w=5QS25L#`w#lFqeQc_K8Y??>Q5;n{T@tz=N;ut zGAw&h+-4s2va;LA12j{Turv;m$HOxlAqTo-6D4gp3Ncq{rkQ$rd}zOO>GIUtCTw5# zd|#Q@@c3soS)A5)XV2r=l1OZ|;J*5W`5}*=#dkzgZ~wama@($)=rcJ1GJYb2hqZhF zYmpmz4ucRlT?M#2o8_G&^jg*lnb?E&*YeYNr2Rn0@^;ku6xNpIjl<}n_yPhMur!e_TWp`; zp3dlyDi-r0q`I(LZwd)T>rc_6JpJC6LxaQc=w0fyRag(C35?mBI46W<=Wq#qPf=Yq zK#>a#&gC?nre9=!C+K_CpMUbtnLOyLaL?f*zjZe4MDFwR!JN+P)4AfAD)*Qv`-MzX zD;nkYu#Uc;=XceX;VarGdt+X%aGU$*pRf?7L_bXubZ{v;JHc2Mn7#LT^7^Ltchgk6 zTts)JK_2RW6HCwm-DdAHLrFr8!;?6;fFUuE&qh=CBpo*1B?S6MR^9p%q@ z!rCiK1SmEq72Ng-kZy>+Nu7@$c66>PI>HG<<}(A8Ueh-H~v?eET3H9H4p`S*^ z?Yv&Ho1h8|6Ov%=#Z2tv$#NmfHb^H+FBbAe7dv}En^a_9MmK#*n1&c0dmUT)`F(}Q zC!lazSINR2=HcO?^IXW?Hia<0@B(y_^Nx9#5`M&kRUhPPaf?E7q;G~ONyv!wi1A^! zi&zyoM|2P^=~0hp^3oVhTDb=!RCk%4b^~av#}wN&8;zV#64@7oE6~-0OmFJ$sGOdF zSBBLy+}DNg#5QibQ0Fk+ooGDi8Bnw~X9miGYKHDEu?mRhIe1FdwiiGpv+;m`)#?3J z^d=~_w`9UIF9C8Ph~BiVFfBb0If09=F>2=^0Lt>C;CoX=VN|yl_GZteV{8=a zD9@xy)wS_?@7PZD4Ke`LJ4UK9M(1%7%)MXJgnK+&AM`ff=P%h1f45&1H8Yo4#Z9!0 zJ)bUCWaiqfJ9`DIPWDE>fB+>L3jY3Sw=o#;@e;yj6Nz0Q{!LB=7g_DOwwZMdl;9?$P;KcLI>LDvPCC>xHODBhb`&_Iw~8~Q*|>8GyndqF zYi2L)94_}Z;XLl@_l(&^o9QXO>FxFD0MN*hdGg&2oF3GeTUn_w?}V$Oe09^V!@b$D z_ttf$*fiMm!D?0)ShH~B5NXQzRpoCWKmqZ5Qa)0 z{LWQRM-Apl>5e}7G*n5775eRs{=6CN%ii$$;>V+T2TnE5e~5h;iqYN*%X<7&Q_wrx5V1mj1DXsJ7F~^9PUP<1}*4%RFDGHjtq)K#kq9WZ7)8&F_Tf zlt!tCcBiiJxzSy_cQ5xBSgLdiyV7usd~RK;J8<1%t9&E>7yce0Fz8jfkA@~MIVq_? 
zRZfGWNWnVN*H%HP3?F<_fcErWSZD}`u zWU?mS_rTdzVK8`g)R9H@h^A&s7IVn}kS0Z#;SK6u5_j9Rs-VniLp~-JwRx}(LldxP z^#<}z1w9kOcGW1=-Fty4me0rU8i5rZmUgHJNr!(t&*RCnv|Q8Y|>E*l5{CxLmS|C38wN2EQD1 zbCx3H>V!klauktvyC9LiUPSxz)z7!ZL;fTb!Fi03iu9%*JhCg;pOWZd)F4a&SQ$C ztl*D-x8M>PR+H~ryKZW6h-@u>XTe%(uJqg%r{ofON)h$)z`&r5?aP0XPKEyO;;5`u53txq^XYOjuHie zr+i81>vqsw+@BRM^m6U|OXGnAVYAhH#j}PE{^@?O zusY`92=m$roMzViCDxHX<-EjKQF>G66O{zVC^nE%sk(UVkr@UG+_g3X-85r$NR=%P zcwmE0l^EB+UP{$~65|;ha-KSLBP4y#<8Lckzkl`d2W4fcI0-rbL}cH&AobG`)^L^Mx=c5rpLp0nYzG? z(OKBNA6=Xw8~U;t>fs&O$E1nSzvQ2L*orC-M+(T~l;HD#R%FpxLb6+Md*{{RvRmHch>ppkv4YlxYU|ETH;`uKMwm_9?z})q zdgshbvycO2h(H$py9Oai@Q#&0R&*}2={vTi0U6X79)_4|Pv-3f!CD$^yNISW4kk^Z zVtvh;B&@<5Ee6&lXrjt!JLXs8G7CvwSeL`>pBKh9oEu^M{$l9+G8XHE3X){+=+w>&rToin#d4YaMjeEKmI@gdRl*MW@>rx=O@v(1o4x%gf6ON2k1^e9+djj*^ni`liYTmI} z4_|u5SxWyTV6J$s%4pBQc2PoguWcdNf@j~k?8e+*+_E|yL7AB5dM20tylN?t1t$Q< zDJIHiYpVgImS+OB13SEd3CtG)f2bytzjC}zzTE@_+yxR_mOfJJ90=8Qtr`~v{A|d7 zE~Z$2-8o1qSJCF<3ea(3<*;=64Iyx&2lV_uJcmV?Bfx_5(Ih^VYxno9Fx@mMa(ilk z@5jJe}ZbL<{P^+S%4730gVVN$ZY#8{}O0rH*U7JaL`9sKgiIQ1TrTUvIj>0 z8pDDvIFE(dcq^3!6xVb+I>AZr_-n1TKt(On$XiOuN_^pavU@yd4D1^pl#9%0fAsnT zpZx?^YIts!G~vIxsEmic%zhv9j$!X`#kU@b;xo^jK$L#= zbju(1ayTE{dMHX_6jCm^U&a-GvZQ{^@pABSO#8Qc z7wcmyjp=EcvzLwOUxkj7d%+JwwTDzJ)&PCy{@(F@d!3fa4@@ z<7Z3$>PKFord>FLiO_SUTa#vRsiWYlr51d)G0N!aT`Bpl_?;DNAG1I2Q8nkySNc;f zFWOBX9cNVe+Z@rpsX{U-BDtio_G%p7NLtA=j8aaaFV~M_;9%VGaoKcvR#hAF$g{~I zl2e`>vU15J6vp_>10s`D z=ZMZ%3_vGud3RIOQxgU z6B8co=p1<=Pk1!WGRK@LQB!l=I8S7`Ew^}dypEQv&3+{&F0M%M3vFZ(;??az4>L2f ztD1^q-zrpENn6L4pLZES=T4|79e-j=sT4~|*z5!8ezjNz-VDqlCujkmDxpp@pB~Hz z^z49VRlQUbvqLwKoRAV7ln9Vt5Mk(?ul~b{xpVl?J$g1@XFO$CRTJcAVKB>~OninL zmAEMD>cWQ_mwABqeY2h1a`&r?-7(u|=n9?FB{qi#4HRF$uY71WbAKSC&*Tu+h}o0v zD@ah_la*!pn6}wJ8NJA$wKQh2CMQG0@QtnurRvvwJZH7zP_e09E~0Iw!1OdU*0q>V zJ~|^|I~oBgvas>N?nW4&F6ejp`Ee@YQ0-NwsVqVVl9n`d)3!$3Ke_G)P^g&jSsgW(ICwAD8*us2Lf3c61br zLpNplm_NWLsJ;NXa>GWlCtn3OGAh{F*oR*W-E6f%H(TfOB_J)S5tiVL^rIqq`o*KM 
z8e%=Vb>pqA=$HHuSRoS~7be2VN1}H^!NafK&X9W@jT)}1VCz33aPE2c?wvJy$3b*? z+1j3N_C)WIS4v7Uv$J-p%8snbB+H4Sep;eXwY1ZFZ|0mVS)r#FCvGv*8esy>xGud} z+fNmne$7XCb=~%@i!pWic+bbWRvM#=#R~JIWd?qARVU7UY!%qNkC%TIT*y@%tV?PX z$b^AL3dK8>KkP=x@G!0Cq@T@FobkXN`RXXY@wl&tZa2W`vFS{HWZvC-ay31#=(c(7 zxQXCR-{jt05bmU8oj8`DvREkd{gZ253Drt@V{{(>%balMGlXs_GtD~+MOYI}fXBo# zrl>4>?QWME#PqZdHqN1pt?@Oj*qw`ND_8hXTizM2z=U;>+Y@;(sgFkij)btopQ~rq z&^2BySG#-w7?+n~Ntg@+eZKsDkCJrgthG#%j(ZCuh*EF4Hh=g^T7snclh20LCCohM zp<$x$sp?OdUNT~$Jb5eE-k&)A{wwu-UoPpCUqs4z6;JzskA@ameUn8C*25fkU;6ua zoN!o>^(}%|pFs9@LaXzBsTlFn#y-iDGmx8CqoEpvm~AsaH4n95DK%c|rwZ4iYQ_Fz z?XKGTiO2NsYZN!{54vr1E!6g{hE#HxJCI_fnyR7R$<12 zDg@y4TD9#&=tifT;Bi%?zyZDVjUfdPx z^t-IoFdstAkz1G#+Lglo3@$-~xz5&dX8H;NO9JuzLFkg{Az^u^L83Bo1l+qzQTd!m zpm5=Kp;mjG%uI6q_SsL@HC5KPel7fRE;oWlOJ=$p3c@!LEt_uHQQ{(6qe!N764(%J zB`7+bq?wZm)eg5|Ay#THeosZU{khgd#9e0MPCquio0`7}WP3VHsMkkq*d;o=uT$(G z?>AjeSFv$=#xsl8(fD9K({G7RYxb1OSh&dFK-Qd&+4hOn$g}~z(JQ7 z)&hf#Jy>WN)v3P<=6N*T@yZAPBmyfyaXlDpq(mCXnLBNh8|l!w2cBk9{A7cnl@rNj zfq;0q(ARA(7ITFIjN*+ysxQe3KxvRS*k+Y)M-ofq-7kzWP1aS$#=u_O^U{9{ku|N-1MPW6dMueioG@ zRS$N>&D37dVJ~+y_la)7hJXk^?Q=wY`FA4wcMUevHoRSffo9cI1Fl2A-PQb*Kl1>L zxi&^m6it3A8yY2DqVD~$4=ng9G!n*Jnp9k)1C&*A2Opl<6jGbA0b(?5%*LhCT4fe0 zlxHi`eP4>T&g!y8&;VeVNR8o&?&cA81b_yK8KLp;D^~p{OJZFj$=aL~u=@y~u$s{V z76X4qPzsn0_8VtgRF&|JChZU++clgh0njy;*53ji03z2En$Q5G6)bI2HC1Q8$N?|J zyw@GkVyK~^S#c2o9Tq!GkbCG?E=J-I=okhH3$wUS@$G39>SP;L&hnyw8&+Ce=rOUC zhcwh8Pu$b5TzIPg&|~0{FyL(ye$DRIPnPoB01_l?tcH=gSAIegq8~`{VFaTP=zU$< z=i&q!Nng-3w{O0}?|LJ#{pESl4m6Vk0Kaoc3#$8@9hXGndHU$=DD^n`liI!_j38lx zX!`EeAdWj&&Slm_QKUa@1_;vMIHTK<1M%z??H6i?{CKn)UKQp?Ss-5b$f}H0IlRZ4EO@988g+5TVreFyHo4LF) z%LdBO$E1CKoemp8LI#bYHJR@VWdo4oR;XmSS}txlH@&83*u8(yh`kpQA{rb3`P-dT zm6(T2gTUbiq_dy_r>VR#zaa>cf6H-@MjYd`4Oj6!_p!qoxioglLuXC=VP##B8*^&* zjIOf>^{z`%LnRasJr&{dhv z?7OTU=P4*Owb@D6{Mp`;gjxdgFS~^3(dlmU0sFc+RR+zlw>}*0`)}y559iAi=Je%R zw5kqjt%*#EhsS}>Wy7L4Mh(N7-n$-doGV84!tbX&X(0)6?b2}=E9Osk6u712YCl^x 
zF?uxzQSICXym3CiW_yDXrY)?=6Yg2HwMR~z4fg=91ic(Fh8HgDreWXv`B0Qk=-%Q} zJluQuNch0I6QASl8c)+Hsjp$|IFN+0Ge}&Iwk-#4ono*UNFQ8Cjxy9J9Y4Ot3i^+S zNlzv?5mk=m=6z@3Sj|GxFB;5&CZ=5DVsdmw=%&vyO+uQZ-0!>!iq%_H?1qv{KoT zXpirG`zZi#vU)p;g}|Q08JzDFE);8aOcWX4VZ1pH9F{Z7!^LeirM0}N$9QfIX5^~@ ztrO-lCPvVNa4nqA-04daBGjARXRaInbmUfdU&Q*t8QdCV9#EuPgeRB|h83imSEg4S#m~C{GcSDNOCw_CQ zDdql5QI*-NFB5+Ai3|mzAsEU4VT4ZoRk02eHt(*fJnA)-qqHTfczobQ6pFW~)5$0C z{S1~f-ofJg!~cTEp-YqQYK#@pICcXy=KiJep5S(H6HW|}N;ugtsM*GxwW(V?C*X={@)3veTQ9pY*7J+`E zJg4i@bfzf;2Wu1j%;=xDS!emV#>8+fy{%4%X*k1#=A5n7s^46Ic7MN3(oi0)Y~rV{ zR^s|P^()B|uF6;@o|lp7JX7~|Yy~q>9MJYeUetT1h6~9X%Ea;N8F$^|Aqe0Xuhs0W z7DBga#P3b7nYE$#MN_q8jR~2Tc*BX+U^3x;qHhr2D@r-!^=Gx9rQc;}pPuc&QmlHa zfn#W3pftM=BUH8YBP1JZdriEkQp43}F^M#o?;W(pvf4yP`tslvsdeB6#yA-eU~V2{ zr6SQhYt0AUCRk=qXU;Mt9NXJ!G)I*>+*z<$xD?vH z??~uGML1+(gx_-ewn4zU*_@14;D!-ley!XXsA}>x4tnjqiJzuuxG%spLiou(&rKe0 z-K=Sna=FWz0JGLy-#v{5-2@3}p3_=E7*7Bj_&YSCqK&`ooh`v<*$&xt61Hu;IRY9L zPqizgAuDk;_VB1(nWo&+_-Gg$lm7ZP7vK3yzoG=)b2Jox{w+5#hB@|w1C5uOxLD5S zNl4be(v|SW4zyd@SP zAPIqUPB4$c2?-u>1O*ar#oKFtOU&YdDFMgZ`RcAPQfz`p9lgT_)blF_uCA_f2;m_H zn8q+`Y7z~`gB)GtTRT@sie}KOaA$JDj^*1L`m=K#TMX6ghG%sWa{+>R$!JJ}tlSv3 zAf_F$2)eo<{YndH{C4s4jeYIhjSogT<3(uWjI86_)<8BeXU?g{k2t3M;gOb%=*y@% zr?3r0*OJ14G7c*Qc4A@<1cP*U*sQU4 z_&r^sT3g19X^ zCF*iwhseyAoRH;m_aThu`JgG%>jLYZ_<%G{7Ie)Lf=N znxjD02$1d76}#!$O>8Aj~`b`X56KCFzfd0p)NoG z{Qi1dCdAbi-$e6Nvifd83+@$`t@8GK(RnS$G9U&QE;fXcT~TQE=q_RQS9}9QK9#g* z|2m;=9dXISQweDO~G9}$)S+8!FUv`7B7;X_!A$<5wZ8Y=uek*lS@a^16Yh&%`dIe6hgK)Fna3 zYVa)dP%JU8UAv|_!r;Mh#V}H!_@n*#z~uRF(3ptQ&?IX6Z2@l|L*Y>RSV_EO+}e}W zLi&BsC29#a^*^BY-#A&cxhH82n|W-qTek)Q=8~a^M>cx?Pfr~32utxqo#iudIFeD= zFiXwA=fOfB?@Nhv?_pfvY`(0Tqsh-LENpBUEx~tOuiDSGM4cTWCLln{ex2ED7lEO< zR46;0j2YPlTz2s&v$B#o6s5kgn<<=%dAlaztE9iL#P4VRZS|6Kn9CCvWTs&cVpaUT+t~xb zG_&X?Ls=fwZdUhxe7j^5^bI`3`89(E^ib5|^c@S2aKPsi5t+NoYxZ(+tT z%$G&PcNFXAE-3@s29(-nzUgQn^+f-2%guMdZ#$VVkYXMx(cKKCA$%1;!%6j*asGag zpVwDX?>+W|{llURJpT>S_8G4rc$PvsI1vrW#&hU1AgzE9UoP{=Anq!$EIzb4N_ud> 
z(v#$18c*)r$1^w!09l`Y^+Y)&?sf{ABm#4K>Za9ke%F>gWFF`;Xv)g$5hxe0Cw8<` zT~ofBe*|*3!1f!aUbukw*Jt{Q<__0I9P@O=(zfK4BCB>CBShQ8uh00OhBZ&^8CS9N zh5li+ZEGmY59pJ*cWE;0NiW*o+R2XRY=BLeTJI5t-qR-^7uMYja06F_Uf3ir!*p?9 zyOs}VKxAH6n}3cTMRPuu7SqNjXjXc+G1r2svV$;`wy%2a;$I*2U&jZ~3e)G^&xPLc zuhjFl6h(NS0*ZSJUVx6h^mDjTP+2pB5-38ZhNe`|5CQbuLDx}{O-%SQ_s?r8k+!_@ z1uEtCrURtXACK&L@%X@p*}iMq z*8FJr@u3pv6_|J77yd}`-bqQKIo3Ohqqm6{h3W2NV^&;;to9m11vh=_2#mW`G8?2| z{Ne1Q`Hv4wCTDoxej{j_eiCov`Qx@OLcR%N%=Z;E!=x%OviJM0|LfCmvtewCs1ky1 z=2=SvDdMH#uhdfk5V_>j#8*MRD7^x}?`A}er>c4Ue;-M5tz$S(7`9{FZoZ2dL z*B`I3-RPv0)eV{e`ag#ek_IrfaRbxVaG(=x|1{Jx9ZTE&>Ftz(g~G?%e;ntp2YcsA z(5-p&HsZdR>nRc1PVr5&;Y7d%o|UIrLf>X}NesxO>XeCt8q76hRDJXv=P(%FtI~HD zg2#C+J6(}-2E?mi2*_2f%i`R#-xQj^jboU=iiJ0phQ`O)To1Weh2HfL?NwfpVNrS5TOpg>lq@{`OsX9_ zL-{ZbuL6M=GP0+#)0V%;AO{ba20+|_fdSgLmMSvtYB{PNKTwk}QwD+m_S&x#S-D=< zEacu+6gWechd!EUdi1UO;ZEqlAmmwYsLx+Vw&NQz60(D>6|r;k@+O-WYYH>Ivkl_W zHFYAq<^6|6eJ^{A%>O7>DI-@+Bk|?Bgy3=C^eaEYaqyZ9K06Mp?VA6M69N*KECf3<#Y)C402lA8v>b-6I{!=<^acP-8 zQHM{=-7aK&wBll%d`C5^Sz@KY0KL++k#_$}?6NoOeY9Qtr*+S!OJ+4V!+J8ds5Ze4 zC6Z-aw5L8q6C@{y34fe@(zD0?&QJW72TuFObYXjaMm~N54gwI^4SR+w67+565QBv+ z?k=pN{FZ-P!0%s@R0uz5DVr;~OX;{Tr3|TaC_>8g;2fb78)43)D%=vk<}B#Mzkspc zlsR$>eSEEQo0$Wc+baf54UVdFp++C8(YjP@@t?&2WECY8YVqz5?7q5OhSHg3S=5U|;@%D#dhvX1bR00jrBuxGj zjbMjlz$^Ft9q$cdt^z{%NQlH6#9{vRJ|7js531PxPFK!JWK~XIPhO~Zlu&-Z!CwVP zjRHBxn!gkwDxq~gWm&mW(mToMtBz-LeRM)dW_p_YG?&(MW}S+KpWW>hhqDVa-`qCQ zAB(`)8=W)OLh(O2UYj4lSO|Jc6Er0__EVhYR4Y+*+zU2LO;3; z-X^}5&=2AjXikG`R36RXGFnYz6 zSnlwX2*)84W-*3pjbOW1ta;|sH{>kZ-m>dBwsw9+A7+@)en?9z~N*=ze5%b z$Jh0YKSOpQ0|bX@P^$}jJ4gZqV?)OCXb1b?+*h|F6+>}GW1wcEK+@!H3o-eVKVt*x zHr*0!$-3N%lOv(t%?u1LPbD8{*z2)p&)%O zbnU7B_9MwZeL9__H^eF|s2{A#*Fm@cM+WHgu-tiLQ4IDKg1Sxu&z?j-xvhd%&;jQ- zDf3wRoAypAwd(#{)`_jI`?L5RaUDO?6;oxNtUK_5PYrtM5(+1^P(Tay2DDHW!^^3F zy+RQk7gy!!ch>aWSJEz&Vn^jm#@rTdMQSg2ssDDvk}{kVLyEqd%@Y%oUg*pH8BjPe zGRDB4oP2Gx{-giSg_+gucJyBB+)e`?v=6WWp}^`;Kh9gYPZP(q-w^&Tdj9eZv(LaM 
z&9*$N7r4Uk*|Dz4%blU{mkdeVW3($Pi-VyTwP*9*G02#mGg1WF)8|eg4;dk)g~`cy zV+MviaHvFY7sb&c@IQDWi6=cU3CC(fzi`0n=PIbT=+**%4_@}jKA`4X1NnTY$B^v< zi%O|cRReYWF{R9tf5rpSlE?kKz6>^H0bXvBDo{6SH{M6vgr^TPylo%H{(K{}{o7uI z4+Bi#^QEG|amMyK@r*eiHBOZpVnF})Gv@EX+~;LZ=$0ynyI4En#!nhdklzz z)v$v5(0wl6!NNLjfUnMVLLxM#w)f(foyBqO7D90`XkuVr%t)nd7V_ogk*R~)z%w3izmqXe6r3T-3a zaI8L(KnEz~hXsD35-c}gF$>{j49 zhyw!k9Yb-)+3SV~BFy)eJr_n@>VE|y!%m`54`9V=X-pA@uEcGbF;qMU<&e&!W=KT3 zek3BnQ!CtB*xMKk&YpgwAu+W&?4-2-`(F#AZ(Mx^^Ctm&yStqhu3b({ggOgO;{%YA zNZh%<4!tg3LmEB9A$Tm?=$$7D>`G_rz7}LV6^pq4WO0i``60qd+YZNk{5pZJU4iwz zl+1-zMjy1ppTKI3hWGE^n*iUY9B{+>TIEy?0jDvoibomMjieh}TVz>%Ca(aK7UH*n z>?uSzcc^dR0iJ#IpM~9HNiy6r`#$9C%SC)Y%>f@RNqO`un$-r2v319Nj?QAxDd67W zzzUIlm_4=Z$Bz)VT<~t5tn`uGLht237u5!h?MnDwpDV|r#CXv)%YT+N?LlZS(@33V zUwW)!AvKXUXXAVJs2zazGR>6eg6;{+ zpyc@*K)(85kh`{L1RP1jz#R10#!(=aM@qOe5n;8ZGdtmq8Tr%n#+hXF%+X-F=i1Z@ zE<*DQ43(uf3_-r}C@}e<+|*IY(cW3Yp@#M&x@6t+NV8KlvB+hYGcc<+L<3kX88AIL z+G|q*PIG|>XNzY$=Pdtz>)=B4jtql}Oc4{|in2s$fZxL#)sBzeuGgl{IEnBj(qg_Z zHa_vPIyW3Y6#q&w=x#5Ku)F}VfC3V^_CIqn$s`x7g8iTqIvW*Rq6~a+ z%Pqj_1xpXr(KFfayokCrX}K75m4=1*YU!~t_IwSg;LB9F4|60gG8i$Y0x{w$z&~rV z+zRYx{_^R+4}YH3lij1fNiHycI{7~2LV8p&|R6B1R!_XN&h^c@fG?Q_spe}WOiuZM5|=v;|~-zDCf|C)Kpck zfz2&zdL9%#u=?s58+*xtNhDvX^gbTilwziu>~njEA~8euSxe3D+tVST%d?#~y>nBh|0WNMkbRw}xS1dkh5|gSj1IxgSmv z@4uep9#;3gyuB_$UZ@-^&|~joZtl;i(bl`eeEY6^5TP=QzOA_!y`y-tz7iT$&DqG3 znSPN{{F+&FdH3RS`QVBEKa0-x60lrDEvhBE%Ev15pKN5_vkrAyqKj-pb@T{pV*j%? 
znALBPc|^fSD{9M5ctFOd3!A4c(-EeRj_u6^<_l6H6Qb=A%b*x05fPK;aQ>yhQ}hHz zk_OLYbsEqSdM@o1`Z)i%J;qD7Jb;E>IXIU7?zW7oxBg)ElUHJ7CCuw@*q0tqxuo*U zI%&*azQTFv@RZZz#us1?hXwJ}w!Em*4h|9tF`s^fo|@5^1>z1Z9HSDPo4}?fvYk&W zZX{yhL6*yV(LdZGoR%*5U*WG~7F&r*4*(KtuR%Za1Qpv4Aml3qm_pf$8OZSs7*OVL zWIiUT7YtqBAK0Jq-9Vm0s`O_wf98@pjL&EA2L-J62`jS{X$?>w43Zx%sSVJo{}#}Y%rPs!l;Tk7w=c+- zcb&y*1ZeEogt9kG*;3u5WI~L2XVG}Vr*8>|&f0QfgibQ#Pme~}BiHuG_)zCH`dfqq zmg;a$j!}j+!6G*+Rj=izIUKd_qEwEucwMqR?<-=?YL>!MQisX9tKC^*aC!lJ69t zT-!F~`-SXkc7@d=I}G0jxldb~lV$U!Qb%v9ZQ(iR-1OW4UsA2yL(P* zEKkfZJ)8Gf!qS>>I50L9-Tune(HPL&1>MV!fMr`+uY>N4UvsMUTz+k>ZcuRW>Z1-L z&L)0$#b8rW%ulhKJcVKsn@8hr32C$&ws!*9E_Z0M3yWF6y!t(6Vl$)b{>g7gYdstx2*pyxm`eq^+U_*pKm>*)$x&3zo39uha)yj6A4DS6#1%E+OClJkx7 zYw;r)qSrU7biHu>t>rLfk0;8#&LSU-xzZV_x=Vz z5>D|J6E$viude!f4a6QW#=@}lL~LSW*Qj6KP_uQMFFDfYs;jgg?6;vEyK$Yx&UO*i zd=2Fo@vh*5d zwDNb8VPMleKK9Q)PS-wGX?^sPQL5TPx+?e>M%X3E+m@f~<2D%J`>tNHntaA|`03lX zZ}#-{A=(26htCy47U*C}7p!#1VTVN^}y9(I+9? za97&f3S7gkAa~_MFqhit(^k|UVn;QX+8BM)rZX%cDl!sLT2|($1~Lv>r8LEmj~99z z9VXdmExigKE>ziH@<`uh&Q_cg>g);Pp=}irObzFPb$XVS@uk$6ll=XC3y!J1Im&%S zGKCiRj-hso%v}=0Z3GTt*Y-q~Hg-wI`aimkS;tWZ@!(%C)z#Eg2?z*447od-sojx3 zxM*KkxBbNugh(=FJnn$JAgT29(|Y!{;i9*1zXqH;2z)>^QX;{U>MGrNn5dFBaP)dn z7IxfxYCXeZ*}AZGLURc(lgcu_vxPNEQumLO;KqRM%gcO?55#o?l-y=erNRSt4uS7% zq8Z>#yuE~hxzX<(BSj#2>^SCs|3mK_ru@lpO=8TAOJB01tN0_!+IZ=!2aQ9?%3#cA z991Sw~EkF1FAW4pKBshi=yr6jzPkWqZ7mkv|m zwr`pytGFiHOitF8;(_dy)Vo$;byZ>&ZpOw7+m6@yjHo|Z#}z(7z0u=(ay7SCGf|^o zk9dS7)r)&pL)expo4GvJF4x_?iG{s*COoQBPBLlQ<#7_ij0=S! 
z2tp!YwjO#WCX1clOSm{Vs4L6Lwx_mHCFZr7^5?w^VngWK?e2Gj3O<=UOV7QpihY{8 z@vAPtLEF+rAZk)S-(8|82%r`;7}-3P67DhT+DVRTfd+|*8h$!2F4?8_1e;i5RbIF)Y=SYJUb#?V<%2x(F_rTKh^uY2?Uc1@!e;gqC zWVwwo7_Z86kgoEI7#lTj=atIvi@kSB6vfAF?<{3sGMU(#yJMuFV`|=%D6)_e7A#^j zeLd|{9M7x+*cqGzUP|~8fdsU+BIlfRZzYBXPl>jWhf(T{6dCupSyZhzj<^U1n2(HC zofD{>J6QAZ*m_E7(R`*db(ifl9S;3!dhy{6t|?f+&E_q{G#3rZ`n0S)12{}^U)f+{ zj4;rQCcU`Cn;`9$M0Lr+?K7(pnoFua^=J=bxBc}CUEt;|dYRiTZf*yudZa6RBc>g%8HdZ_?Ne&d;IPp zPIyM9+-=LE&2S(nz0*S*S%1_x;s3b3UYOX63f+cg^4)TmD%Q9PPc~VXkR#7d7gfVrv_xLF z?>*O8>*J70@a{nSJCW0S`zn3?9p~jEJI$rBC%ELBLQ$RqYVufpD>Sh*^;-w0 z29`Few%Rjir-V9R2l5;x*w$2!+Kp9bEsENIb&e0a*OBP{htJacj#)95q#!RmSFsxW zQR_S(Y{S9KB%^zyR?Q;zzizBIZun`|;zw`T1zs_=MsrqQ-UKPQQdo-osd)SzQ<`blxvd7SU8vP!K+KmZ}@93nVtS){(z1&w@2|(+n+vs>i^pj^6j?T+r2TIBS%4oJY-nhp9Vt0rWtpLzY`V@;TAsUi@V!j-3pcWT z3iLQi`GscEpLre7H&;S(xmpn=P%#Ce`kp_a1#;B4gcnwyOc26=Gs~U zi>#NFeer1zWrsF(hWmU7xISunxSq?LPaUV6Ad)t_348T6T=4Ml22`g<78_xm1^}!b z3tL=8{cG89-Gtz}iTt&R$$N0&fXq0iyOlae(84nF{$cB_63~(bw1ij z57aYKW$f~CB=j2}?l5+_c{yi@smY7o#$F%$oaG4-pR@2fwmF2 zk{$@0-UI944&BOJ7vq?ka1|-;Z@Y0NG2IVh4qJ9S(@n#w&P$KBaDH4mre3^o~DxX)qh*s-)H5CL&|(!)dYvI_vPm>gZ9YWaNe=V z`ke$4SS2PH$g^j=j7|Y=OFdjU(M032oH%M#cSa7Dj&>EygZE{k!aC)CMhA@rqgbW`gNtWAtboicQ3&4P{=v| zaS{JMp5vI9Qeui;H;9>?#_uR_%}u{GcPR}2rnk$8Y{!F1`;aN234zAA7q;^&(te3} z^ubG3oo?Oag>v#SXG`6rOx^5v%g(-%vW5^eWl=s8dDYa~-2r=1tO}9m@j6PF_+BrO zv^MB0*~Zjd@lLj(Zu_p07No+8tzbVq;V78M*XU{m&tCwSw6gWP1dcgyM2MwD*{fA2 zwjBDGHrO%+V4>AB5Y(Lh?vmfX8*wkF6fyVU9x9#oj|7rj!80Z%Mq^@Yi=-nVY5d_T zTj8~1o{JE*?pex*{IAIW3WM>)O%7_-K!y2dyoPOQv4|r|=jHq+nrh>oG`dGo?p2DW z)onjReD@Ld5XdlaZHRK+@qvoSmjdav=n~WP?OTD{*U#^rBT&qXSrO`vpx7W6rSV8)QSX8^p)nr5V(x#9?0alcXI!s|qV67w~!9h)9AUu0gG%1M$F z5??A-Lw3(2M?;iBcb1%2D;gev}WexG+fk9_&Kl(x<_OA+Q=(?=H@%rm+8y`5cGyYOOU zeK^77O_zjdr}%1_W1cPN=1F_;E|<;nXv|?|$pQ=RxR-O>`TudAk0oUx-D34hc^OK` zjG76I$n#ZJ$a9oSRbkz2)k*$n1oaS^^ey3F+pALdsvqUicHpI^QvRwRkL-7Mk{8{T zEBT80bu|NsV7C&wkZC@@6r;I>zsY;*S9P*W&N5@FGvMY1hJ{I9TYPSF7?Bp@J`DRI 
z#aU)-ZGsQpK&udB366%8YQr7{s<6I^bx`d*lxVRVQ&KhmBGjpfY@h2npg@k#IgO(- zDJ-iyTQX;rB>wSWw_@K7Syu2?;Iyd_WSH((5`HU6d%|;Q#;!)z*5z}fQne;G6f~Ct zdDgWNl~TPmZg<(rqn1nK=Wjlu!Q7`nnbYodOib6(G4i-_WA+VdF3 z6bsaq;>cTHI#Qwd%76RzU6<$jsJp7qF$_j9*$D6C^M821f23&L3XtXt(j#aEJ$dIu zbvx&)mas^s7d1^qgm`ghFz;2QiF|)F(%aj60$lf@BDOWc*PgalQ=)Jlh9r-mvX4jY zzAWm`Tin>cCod+Ay)M8q!*w26Q(aW_{Zm7OY~CFSkVeF%r-`(o^qPfj4U?R*Z)FhFxPuiq{$j#8tA06OO9-ZO}Ynbh+Y5T zeyXM&K4g2LH$iE(H~moqstSMWqtZA%)KJtAl`6t{-w>38zG{yB!BhWh3;C3(y|p;> zhulqHGZN;o`bgQ_220%1!oT4OPhs^d<(a(wGil|{#aNL{h>qa5j>RD|E6o37AH<}2 zyzG(;EB3%4G7S^q=(_L@@yM0^E!jzoFwg4E%b&WKnz*o+Q(3nf><7Hxi_stCT|QjU zvbD`)su#?ilURzS0#gqw9LOdOmv2ku#s#!A!;T~}gbJ%})^IY#pjoaSStG;KcJC|6+ zgR0zc=cfb=cci!nEeGsqnZ2>)dBWYULpneY>A>P+6bwmojod&~&d1?l_}~ZCS_GEw zH-}#6s+ft?I-8Sr)JK&Sj+=RF7E7&bK-1$PzuO$O)?Iz*__B1N64d`&KB(7=73(Rg zK~(8gOn%{mzvDuyZGN0K1Kj}T^;7f1si<=@EuRs4sn;0A&eN9mSx@PDt%gPHzEHZz zIy)2oy>0e5QmUr0ECbiW$Fq(uJL(v2LNrk$*|pd_b?Nqw>O;{?%b;D#Cp$`kk{{=2 zc^->F%1?CSgEW*bo>a#<4{bF6ah3k8t4VMOdb6-!SX{dXjrLttRl`MBp4V*_tf`#D z(p5ZT?c^xOE|izNbB-KA_s)8VN7X&-bxJm=BB)ztUB;FW6Jxa=9&FbQM>g?N!#!Ru zTGobtJD0PGaPERjl+&xG` z_k4No+__UuFUT^3(jIWrZQ`POd6@VQ_8VPeGGx%Z+mcZ|7qLT_Ghj3Rp;(Aafy?$i6K+1zQhPE)ExqCrzm zEpF5B$* zBk#Y_p?A-5>z_E)@aU&jIUdX5S>twAB{vJ`SG}vx#a^vOWoG*6L2yVd-Rt$K;00W_ zXPY03rruZFYN=vYk7}oG#1B+WH0f@giy4j6ohQLxHaLl`tJEduFKk?|mg_v_ba$5G z(-vM&thT43przIhl?giI49lu(MPIJ3NG7cDteTM&Ic>DUQB!_Spl5rJ=rj+TH`ie&W2LOMr!yj6TOVS zWH&jAE7IU#MQqhN7f)IXW~{gE=<9_6{c^~BIAE^OMC-AiPbp|ol`|r$ zhH}o65*sN=WKG>4NahLRu?5fhL1e88RVnqD+eEM!T?hj)1~3$L`ug)7Q-Ia}sC&%O zJ^1n?K!-V~hvPk<6!5r=Q093k!pDcQn%FwJoENKI#l6c#=bBX)=WiSIK(9sKIbRR)LFC@~ygsc8T|%rSTrZ~>43`Xlry*Q-^9-x%j~Dj1 zUddfz9yFV;+h&a$CQU+=UqtI{d~2!ak{ z<&DcC>-9u^pFVwZczJI&kY@mNAYI0GhYJh>3ahHBx^r~CI}`M>-D}`{s4GAR_e_@( zxfvOEwX37G^Fsf6xXNOiwBMbZr&Ym}^@{_GMfE<;J9IT!m@ zRrATCTaH=b^^a!ao*$~^UV1B7w8n8s;5Ye)7>tw4y_^gMCOy}#|Dluu^a=ie*%VT&;a`!VZ z@a1BPg@aL9jnVHW%Xn8@9@=L1?=I2|GsRkrSM5Z@Z@M}b420A7Q16P24WF5WWHd3G 
z*meKNX7_C7&6>jYXl1((c~Qod!vZjPSD{oWZxB<{zZA#LCpi3%2jmXKkP_mGjtUK0 zbSjK#waf6gUU*?OG^#D~mTa~~@sAh(FXu+^MmuQ)Yw`PI?YFtEc=J|4n5}V3aT7Cw$lsx!cFyM= zY;6RsIj0#HBL1Jnp**!M8WfPbsds3qOjPTioe=;y;2!K|Jhsr)-S2yi0hFSbEeV(8=cW z9sB)c`x0{x75N?Svj?tHrqz6T-E}5SoVHa{LQ23LWKLhvHO`qpq~XJ!`*0QaSGPv` zN*4IvkOe})J#8}3y8aJa%TASi?TjGkM{V?sl1qYT^|3Ck}Vb%`AP2nAf~f z@pbzycTRDMI%9RcGv%qG3wE**-|v%5sc3&;GhkgPz7&1^t87yf+o{Hj6v8#h?+{Zv zNxVsBYG>KAPY-GvR!HmG9GK^E8%MKLMP+o8jeypMm;6&Hb<0%w$+An_-vAv)-bf~CIkJdhN>$@_FWlDw&Tz&^ zol#o8oN32cyUR#COOv&QIu2>Piq+tRIcyg2A3R$9+C#%NsiySEM;pj*Hu zU&Lhsii!#J-j440p4Alu4A>%vut;%Wrd#|s7a;atV7$vbyyuXZ*w|fD`ua6I`c;F# z9Zv%dS{t3MZ<$St#S&056JuVm4Q8WuJri8~BSDvy_FH(9rzi2Gsq3`|4XHF$qzdPG zWMu5oa!;2x!O_g#VJY;~J3{XnTs_sd3?eI6y$+&O zXt+{G^#XTjSqum^cb^eV)^fCI#w8}!<`+oGWkabBNa=WJf;|_>#x=8+poQE4VPSvO z_P<*kdi$6Z_r9ClkK(>BP%5pJc}^=UTx;Tem5lG1Zq;MCEhC(vWYsi%eK+q7n(}iw z)@az|Qv0^4VxKk&bFC2zs?tMbf6!%6kFL$FKi`utt%q@0+rj?omAlwxQG=!fU)u7l zqRE6Vz7=#|GgrOr3dj4@QYo4cnk^wvQhgc1GxYk?gZ>p?FK|N3xnA)3Lex2C+k`tD z1;hpx&>YZ(E`r06*n{9Is7n_G@u`H8*p|a`3CNv~Lyf6}!&P(9D0$8_N8<8#JUy@9 zFBkoLy2B)s{Sj6Vh)>@Z zQzsdZ>K96?%y8$|1d|``+4l&I7GzI=2PrG@ubDsLNqFuJODd^1LC=ElO%~dfiq$B& zha9&^{#mD1V8`wCm%DwyKXIXDP(A#5+bnAGM)9?^XRT&fB%3enCSM#cfBQmbIrQuc zeqGkn%?5@vG=z9mm-a%uyn`wpb;vtwJT_}6|9^~q1z40@)GpwFLxXfFsHk*_(gF%f z3(}pUG!ES@Afdp}B}g|2NH+)!AQD5P(k(SK+&#w=p7Y=PKbHr628Mz8zPYH(g#4!>04+t69>hIf5&fe7C(c-++B>Og8q!@|pez(ZWWW(?4KCB%Sp z0;pmI9)|a+my`T2P({<$_X=iqJ2~!Z(Ln@{*8?oQ#sE?WnWrV!#^!lIU5<*zWTqyu zIE+UL$UWRBSn)R@rIzvq&b5N0w^qbDkcc+Jb39NCfldqe+3c`PBlr z7a>gzay$G;Vyx%odd-~CKE04BZHnoPIlh(#)wdjjVvT;N!w|$;CJvBz7}kY&5gg9rQ_O?a-C}$gV+p-*;4FztWUb1*Ud;*Ecml zHY2iee^h6S{wS~<^;T|db(<69k>k~?Fu&q6rwQ&VAotJO%hRAi@w zu+vysE{x;(Sue2mOU;>Md3PHcCzUW5%0)R5RaNf#OkZ+8O8x#~yvJ)}DSB-~R5QVJ z{c7Re&RP2n<|VSxxg2~f3aCRZWo!5pD@HaxgoOGfNRDnac*e_kghBuEC_4ekvb z7)?MkxWK}#lcKtjoUXSj{WZmPCL|_9=6CE`ac`gY3=hZfc1}H$J%RyF3OJ1K+!BSKYd;S zWDLzlC@jR0`Cgp|H`h|BdPjqkQ9~3QGqb-(RfHgE@iI|CzRMF$TY4|7(jX&}W7AIyCvS3GI6*GB+>E>)?l%+#hMAaj|s$NanO* 
zLkq`vreEY(DH>fW2w!t6B8jKIPh0vdu=&wSLq7%992|peNzeTZX3#WGGi93JbD!B$ z;RD!{AyZD2;s3u4k@QEwEQ5igiI)7+PhCB8A+pJHAnsXo#S?&L3%96fgA8<=5!6LN zk)5ML76`tgn?uUk9M-?zMneJY{sxCx8>abUL#HE(i42Hlce6gP!Dz%`;Msk`ub8zz zkJL#@N7x-bejF8=b6?Eg5NVWOsIi+V3eX!?aPYX5PL0oA<;r`&joSt9_Aw#2Q&Vg8 zjXPP%*aym*Z0=OE!f0|=Qu!Nq5`y61J-?MjP#|WHk5<2@=hii{QM&$W!3tVZwE31R z%adV3Mx}h-G_AGvr~hVDe+TM6zsDVc#VjcwVn#Mh*HvLyj~w2hUrN5Zb!P4eEqv`W zCm}QJI1Fs!t*ea{ z$uvydv}k8-k`~X51uaA)m3@12)!%LMKN$q|Lth)F?7b*l{_yWI+>3Ir{v?+qBoCQ@ zog)Gn)oiZX=#pW1(2!^ZNR*(=8#S`mQuh13EIr36vT#!_En=|k#X+?ms@ogdlEC+P zB}$W<^$V`(`&H=4a2+`a#UnlV8;Zxa1$`Tyf8l(_RNA3l9;Gi4*b)JL*zqKh`Dcc# zOOa>KtrP832G*6p>mA_xh$2+c*fpbueQ!%Xe^Ls@-1`}HT^~V!RsaFI2$#?Qzf1WW zNN&Cf1<*M_FLj-tHH+6E(jxQ;%aSf~nrB?H5csux@~5(TOaajRmHpjM)6~1B1G|l5 z11?&jSATZ}fAEln->1_sKwb z@j``#{V({_1Wn9E+>2Q-g$QBqt2cUAJ+^G1y9W#*9nTltG^w8hw5H7*E|6_|%r+f< zYw_Rriddrl-BPGIeWm$DZ#ZGQmN7cRaih#&FO1sSGhy6ZL$+-3x0hP&{d-`~@9Ah3 zGaxJpkc#EZj|UL9-2|U8aUXe9MDI{wO+LPxJ+#A0w79@s8TD#oKN}P?7>!XLxDAQ$ zwM6c2Lwy%j`-Qf{-tE-HHCg&JU8lMXNL6R&;h9H809HT76cU4c--DIE@1WBf@VmSH@3pIf8e8~kKhG8~8hwC1s%Wh52SO}p3?-m}Fo9FT zahduaoZrb zdqb$(C~3HxUf%-?Axa_(W_vFCW&f%`SW&^lz-APCFAjUOOfL$R&34K^5SrZMUDHcE zDPx4vJsCCMrypjA*?htAwI>Ic7*W@g2yBoQ#3wbGum6*t^^ZG~uNSXod`25=IMklT zdjtE+%4zH`VD*8r9Vj3Ccioq<`)f>R#aP;{gq=p*xl77EI|TCI2w?6=9ccOy*3+sL zzUn5qZb1;o@tsP$%g+_}RQ2Q$cS(Ay?yFrDuvhUTI<3!dh<|(08#{$Yk^_WiNq)H3 zxgl67er~!4)P97u5b>3^=lFd4mNhiuAYKZ?igx<`)ImUC6TkNn*j zf7t^BYia%L)r}T7$J;m}U8$DB3|Y4okA&?5UHk7^aPL_AMX9t&>kn1|`+kqRWZjg# zxCuUWEMA*D20z!b+uh@nXQ*9(VPxM?V|reqTguT`JWrFD?5$i{?&xu=lyO<`OPN(c-<|XKcFHVje4%!?fomh1@aw&3C~D5)k6lZz~Y)t z!+f#J`(5T1P6BM7lhskF<@OF|z7d_}D2RARxq_vPoU5^qwzhvF)Nw_c7`#_t1-ozW3;zwif!xn6FjQD%DvZ1kE5`p z(kSH65t3nS02wuuiKfqh#XIFKD?TMav*G?R&VN6eLo@7?ouVs0fa|nZ{P&aKRn6LX zcm3R2!M`*07nazA6Eb)0JJ<&34-aw%J}twfKmTYpv}ATMSW zNe;zWPi!&1!9fSwU78%EvkcTP!|K9gjQE>#uj`d7>hby@`GeyXP|UG!3v=V{m?2+v z3o|Nfa^5dReqJzRTXoDQiSI)$Y#La8l9y_i$Lw$-;Yqa12KB4Y@jd*~SljzlYEW`% zgy)>~zz5o~^jXHe2Rm2{Vr(;+l~)f3#@<~v#RQ4`DAKA7G|L>bohB; 
zCFQb%mt|4S=BH7*R+s-SeOOJipnDU2ZyGjyir2L$f@0~%M$%wWWo|ZdDt5>}->vU) z_7B}XNmd_Z{s-jyAG$NO6VvmmU*BA&*4dRgmrDqR)#T`aDwgb5=0tP0OWB=DY6M{# zk_6SF@OrBSdjB5(ZA1N6XB#$r-F=;qa$nIKjYn1Zhsl-*@2b-J5q!||AD||~$%`iT zb1PWNx(rfQdrYr))Z2un$A7)q=3FR<2lHng9;(^1Ln5BDY4MdP$iulmqA*qDAKZnh4(pr(RPM9`{k4C8g+b-t!2{+4J>~6a6m6+1@=-FZ7z(Drterqv~zI zMHP~B`?L1oi-(Ovv8o-30gu`Fi1*TMGyhvZzvw|5SZSZ;T)ZbN$c0M|CJ2quiNYR| zJKuXMXGA;yWcm8qlLDv3Dcmiz0|{;PAU$p91q_5-rhi@O9~Z8Miw>BDT!=>QE4SKw zbk+xz0;?(kB9`CZd&8MTA-VZx;DNd^m#zZ4^O#OE$HF_JPiA;)S^l=;b8R2AyL(^> zvOax%GB_+`m^B&AM#k3Zu{%EP64>1dfG2W!Jt@9bE&3EG) z8qmZ8h_Bb$Oc`)k-T{jYWn97eBar)k6sM4=V0~!jPw<8mGI#p;R}Asx#Zn8pb(u6k zSK-S$Lz4tPpL@R+PCf{{x_fL%MvvK`F@8A3eEL5N3N369CmH4oPuD-ZgSW`cI>4Mw zgI!~cSyf52iajKFmN9VIsBQewb*`QzNBl)8d|TZ1IAs42yhtRDNa$W&pV+m5!$v5DE{>K+@{)h7}Aay~^ zc+WOO{g@a*`0}5yUK{I$^r6sER3gptSO8r2CYdw>r7u&W$HSQ*EEoIDa_RdH+aj}{ zeOW{gWHQRPKA3~jN{;gL+SBb)P)B=pX;2bdVB)cLa_-wjhu8Z zQ04K+bWM|21K5T`Y;GN_aH|&$vr? z=B%!8cnGD;^F9agByNLJvI^^Uk&kZv6}@K<>Sk0iIFp`Q7TtgLo{K9UNqZ+}Hd)qz z@bjarx?F$Nj(XHbZDxWb!e=Lg`Y&N=cx)?DaB@DuCScP$0l}RcT%};l=abh}?xgQ% zlb6oHm?x{cOWs{{w{(xor@q2D&#BpSjP3_92TEAvL0+Cz~ zm_K0Vq{Xel>kWyqE8O#yz;Azw!?#QKbgM3Kq&SxfGWOwZ#+}cjpTn84%)0TcjT`*j z+&L#XO!@5M73#slUB*v>C`&aENwh!W=5p0+W6{|zK9jG@(6@if;VLBk_z~{s3_M=| z3s32ks&0gaKY$a`R|yeB4j5>-XM@)(k_=rH5X#-4KZt5w2YsWohD8eMBU0d+(8(M; z#W)p!i>qj4_%MZsRP&*)#Oo=)tUX$XOqEI6R15;Z;{AQ6a&KdqT?%Rl!0%P%obNpz zNpq7*+nt{-y>nQ9t08CG)5JS}_L|$HbWqxhgc&7cAMSJrY;gjmSOB}@Q&VSD3r14G zC~hN|?>VH+idh28u^O*Y>q*pemIV#KYw3(=(t~cNY8Rn4kB(fc)4T+K`I}!G?UVRZ zl+{IxWRp>7gS2SPQ5<#1*aEfpasUgr7M{g3{xWjIA5x~z6_ zc&*H=Gt4(U-e~}85pIGjKjm1j>dqT$(*#E2>Sm)KDWp2K_kzc4v~C5*3y3+U(l9L+ zyPseSyJ!GYGAH}w*= zt-2j~y1m_9aNd_#{dyk@8kQ-ZFzfAp)bJ6p<)?Nw!xMZKe8EX}>ip?wwp!xiAghL0 z!Lo?oIhdgxXK1M%U0jJTO8p#oWjN`xwfP{h&kVGE=78ay63Jj|S2^xq1B<8%k;y8q zw<_lkUz?<%r$rxl!hUbLEBH!FiLyAEu`ToK?5W%TnObY_4m2sv!p1@4430TjVbBEl zSc3EFvC$czsiXOdd%gG(XlI2qy}cf-gMfu}a8KgawYJnmT8CNrLYsdrapiAj-T&i(r<#r_o!wh>=sX-4D 
zwa`)jy?6pw8_IH|H}1s=O(ufM_GIP0CxYACwK{Zv@G{ywo0swTb`EmAG;~l~24|?* zwb{wg&hJ~WvrUXHCLi&4;t5TB`#a8Q^-W#v{w^&dL3`%osfIQ!r$@q9wx}RIU@_Tj zrFsXieCwsktTgBwXU*DV%kuC8YE5`djkg019}n!>Tg9OdbMQzGBCV1R2(t#VELSo%$`$Mxjb91 zyj6eQBd_#_?hd;=&hkWs%;mL=aTG}B4^?z(Ul|I2rYc!Sejap$u6Ij6Si$pGF6BEL z1*DafVy4`#NG1p#wXYnCKNyp7~UIh4hJaRJ5^QjmX#KMP1co`gD$!}|T<&$HbHg148kF(Hz^ z-?nB{r{}E1)9XO++m!wYrdkx2?m5d`?H5JUdqvUQ1aB}95#0CFi76nn^1R!wzqU!s zdc1t$ytlU(3|5FpK(|RiGfGNSD|Vj_%%MG4w!Il4X}F9H&WgGu8N4fp*lPf~Trn@= zUS`DLwKl~8qGB`Px?EEfuL>Yn_y<(-VhJC3xu)F(buwix`4`v{h1_pPMDBuWzrjl} zE+ljA!7@^>N(ss{Nkl-)lzDWUYPy&tzOGmkqvYL-H_Rk$53>DUu*5kPgBC-`ce>OB z0Yk;6@hdLxUIC9HI501kwRlZ!UQRK$#y!uqlFY4WnlLU6Z&4~`K?7=)CLrb8_U&lx zVsYGKAT><^IuEw&+($He5W@e74>5nD;k!sh{&=2qRuB2`66p0BR$^9;1P$dwKah&k zRaZpq{{w!*{KcDmJ4SHGahW69#@lFbJxN!#SF&ty_Zvm1kS?RY9n870|JxJ%o-JX+$*PZbL z^kN<^TdY_PznZ71*8>D9-wbNpx0!P3RJcFwaBxS9SL5%D%XgnjztA-kKPl+_#QsyN#S^+#0{dJVX6knsH#^>bkZ`R8w?+{&>d^gU`U^-EhXMc@8beRH79;AqkGStT+DkBz|>0Zvtl&_k-8Hyp=gEXVe!g>uy}Ld=Brrl$#l>F5E=!@I@QqkbE6PQ zJAgsAd!jIwg=jtGS^IL)9UhaHlc<7P>?pj^Ly4k&zQ-PC@(&&Z;aR8&0zDiB&VlsL z%E~I?;N+a)XVLs)n0j~HZ~y9Bl}0Ntbh6{taK0O=U;&KYv<* z{r#&%`MJM!1NK;kYZlvH$I`Wdi>y@J64v|xkPRADH|zf(AAn#-@5(nZ?69pcq7JSt20o|_%_?wc(hh6qF)?uy^wYR~fFSzBytB*C)u2!L`oK=%H z?ttbl=-K}b%}(BTWq(chUYz`k`&rLcXGqc)9Qa&pT-*SdVw|OEeW@Xo1%~Pj0B7^W zH3f$#)=cyjpwExWUhZ>J2Wk=xBtQJ>?f)DkzLa`t`X8E4*gJHR{c(U=P)Aj!MEQ_I z`G`Z!s1G8+lYsZRyM{~GWig`C*g|#a4eM|v{^t(X73N!43O*n;HL`CSW;zxa>+Oz# z%sh&Jp0>-J%-5)iuteReh@s;RHi{SX^VsR`LEf}U_75YA^Km@)?)~aGEzq!X@i?!I zI;uZ)q_Ll#B^lN!IKZ0=iv^x^aDwYO$RjMdnx!f#?;9XV{V%vxI7%ismT!L>mHSkb zW=QwG%Hkk?<{$6(=c9C5-}3LRtA3aoo8EPvBM>gy8+tRtw zNT7Grr$Uhu;!m=;H)Xw)cddJh-yg|RoIQQ0Mo2k{Lmq8j+0~kDrJ8W!n z#-QkM)5KILAO~a7uXz@G8IDCt!2njeeMwC`;FSaA;W@U?8=yP7Ckx1M<5$ zm|}l5F(t;&Z+1<!Tc}_dci7F;(-A8{_=*j{mL&2dSjX$Cp~Y<<>ORkGK1h zHpK7m-a?pud{)SIbw;PgrooaTNy^Zc!i8Qa5&Uv#*lfI`GiH|R>ZIX@Np3E3+E!E> zO#l5c+ip@4ka!g+8fqX-6bl%Jel-F0GSbc@gNbQP7Q=b)E43Bsis@9zcID{hqDilS 
z_QGUm{4*hiaq8W>fCMTc*}M8OzsmZ7yzEh5J(zksZUFiTmhP)|@bH|Rocn;^$QU5N znNy&ry@o7$1tt13l?MnRp2Rl=7(OV$Pl`TK*LCHT7&$e4n25G`RCB?q=&@s5Wwb+C zNV~t`_o2KV`@ClDwJ)0AxNC}YUnM_LYnQu>(x!k8UEBW1HehzF0NG6}sk(JVlvG&k zb}16Hqpn?Z`@nXs>e6eD6s$Pu4iOsl{@&VlHpsmkIvk>C%+IIIGyAH;NpI8cu~%pS zvup?*1pUHWXew0v=3(jU-5(P5j1I^`TI3eh>rMwcEz18oAT&0l#(!j#%33f+@p1!K ze>rDDzrJnJ!<}#wI}KZw<>2S|EOi=!or4L0Au?lKbk}C4f1>Klm1UtnIpG2vdKjdv zGYD9ZTY_Me=T>+H0}aiK6*nHtq|t|U{-FfttE zmF}9+-B)mmrPheAZmtx>lXFyC&&HH2(~_pM7Y#INO7jwB9I9jwva*wiXsf!E?%n-y z>?{(uHBP*$AlY<-1ElUBsw2B^_%zc(f8w{(GM0>xv{Ck325m!s6phd{3;Z53|0+>s za&j;K)TbChVabXP^v?UThE#R2AxMrerijay$<-cYruZSNj%JtKjchx78y0Jey}MwCsb??F#tf@0Z+=GIHv7R*d+m;W(0Lju}y{Vx|DlY)shFlb1BLkbeo|EQ_P$E#qQ;)i(!Be^lMLrC5e}3=xf$+XlZaz zQb!wMaL|0V2n7|4aB~&-q+e36=k|#3$+=a%U{5AjEqftGJSbo9u0D0;RQ}E(xB;l8 z`yOmRk#1rXR+czxLJ$<5=$t$Q>i~sIkWx+xpKJSF)fz+nf~39L46>WB)OiTeGqNy9 z=I+gVO01wmuS7BW?4Jq6Kd>%z(<;)yMQb0=pKIkQi|t65!(Ti8UtXYD9D+kKHAyD_ zarTS;_yCvx0D~SNc94(@Nf|Prn{7A59umu^PYyWwxn0UMQ0tP-+X4ihEa5WtwMKM8 zr>C}L>GAVVQn^sfeV_Rp(nXRUhb(CMh7`|K!_#mwPGA1aG?aHM+`Y}uOLAGK6D#cPYTrN6t0t0X! z+8Y3ms~%9OS~}|hx{R_92L(b7GWW_b)6XferM@4&xAQ7`!_ch2M_-Y-L7uz9@oK=^ z1P|j*Ha!|{*X!7Qwk}Jsqtr9V8Nw)U0o)pRbq2^nWYW5uE``n zd`N!q7VUSDNimMv$XfD#1(WhGW6c)y2AUKt&x9>`Lpc5KWa)!D{7kW;`bCie4*|@S z=%2US##Xw^p>dCO7&Jf2K4=oR)OWJdNfe>BrQqX!&8b|4HCzo&=HCy@m712zwQkH8 z-}y-ugbM?ZIZY|tfMm>aX&x*D^U|!ohWj;qwtjx0*zq2JQS4pQx>f^=b3mY!Fl!m! 
zF2m4wpD9xD35n)|E@oM>R*f16#Z&RiCrfuw5SMsg1|--s+!n;>GXNFbR{8FMqM9Fo zv!r8LHdGG^G$nrq#>4Tc3~x?iP@ZNDlm+|;m~a1tFbpvR&odOB^3%*wxCD2Va0xdfW(Q5d9)DS6DH*(I zKooi2Yqf4}N)|YfS`vEU`dQLz7SXGs3a_TcXCH z# zvqUp#Ol9W+2VS={9Ny4kDmk<8TBui->Ll9D$(VV15p4Ks?GAtx4BIrcm=I}6un;7+ zL+J&}*xyB1%;qA-JaC@}D?ze@{{EYgU6RwRPcq6T_+05s{mDZOs4D9hjxRA+!+f4)ojTlO1iuJ$eY#V9+y~dtbm@mZ-(c zmoIxky=qHWSC_yqf#I6OPg@HMi#IVlO(4}(u11*|q@|`3p)Q<1#7O*Hb?GgS;OU$K z$OxZhL1OlRDv(6OHPu-Gz9cn%q&Bh%=5P<;+C^1-%eLskg8yaWbB&zFOH?ci;0%>% zFsZnhk1R$H9k(9|!UTXw#ryoxhD5KwG=Q$0viFF59BnZ;6FxsM<2Qk^{{x|x@!Q!} z0}EvIBJunj11Tn2d#oA!4Z{ufHH|m{eKJ)}Bn@_{*AL&KsK;^KYm(|GFU1dS*Ya}s z(heJf-_O-`Ep#-2ptLeNWMd}7TkbMVQtEiS;V0qm3mwa~W?lfU-Dz_5)HwL ze=_!8o0wbESCi3J{RydhD_OQ0X31n=Q8FE@s`zf@CHD@h^;a6m1wXyr_;rTclO=c6 zl~?`rtZ+i`-<9N)JjnKz{Ap7LEk>rUM|TKH~%Yu5AXUc5hk}u{qZnGl_^?#$4w* zpkHuS_yVXCN{IjDmkc&{5Z0HaK`g0(z~@CFk({WK&!b_2cnOB}7s5`IXAcZSf!M~v zi5JJAL6N%UsrygvPKZQ$=R?_=Po;F{?zhpd=Vz`7TEDB7APBp^W@CeE1`w*RzaJrb}t+B$=oYpK)YcL%%v^u?M`a1 z(Wy^*t97wuo!?$sip~8cUAYu=m`|JZn@*jy4r5`i`ST>_Yy@Og1^y)zgo%=9Gld@2 zWc)~0qS8wJ2*GglUb6xLbs}a7%7(W`XL%HUs^l}^@ne-C2&aCw+W|p9jre->4H{V zdkZ26X3JL&5KO{B)Fv2LC&~}KirK(4`6ByYWL1Du%dnO<8(uldK#h)!N1wQA&$%UX zW#qel56ndbepqCX)InZ+rCOYz3Zw11eG%{edT@&HGh{bIy=;k$YLs^An{?tnUuRn_LJuA$~^nJJd=Z*dDZ}>ax@OieV?hQR*|vb;s-14}VmD(RTK{ z07&;)e48|yIRC_&Dw+ed;!19+cJ%r6IL4YafNYCO)XB4%dKWmX6Ieu@f~Dt${&C#} z5OOTZ1S6kUnA+3!U+E5`5L+TH zm6;kCz*WQm<>4r5BM=NGj#33+c^g3Ot@MAgb+JRa&i7ecGr~DD^X=|0gdO1)?MMG5 z1n81N@`HjIxw`y907JQ}ZzO|3;cDCS;ydtu zbPsuCZUMofKhX+z43^PTu1`UuwQVLQsYV!_1YuH2kB%yr-BkJd)v0gd%QjWA74uUW z+$xRxRQ4Ph3M)!&dr3JpR^AX+j=XTI<1u&A>uxrxtNs9UN3AS9^KDUspoBC;5LTvl zv+n$iD-MXya~ZRmVcm}t728tGQRInr%3e-QJ4A%55mG@W#|7z{l?r`ZY9%a=NX zf%S+Pp88x~yyP|~?SGxw(;9KUcnz{jy~dZSEuXvdvM$-=2|unXz6K}!WEM$mTbdyA zC2)wvorw*`^wD*iZwpI+ihtN|P zY3LcS6rwlqUFPdC^gRne$*Q%n9SOe3YwUEstvP-Nyh*x+X_HM?tdvHFL93F=#YB^i zoB_~GD4{bEi+TpPth_N&IqEd7Xf-**o<+dA2QP|`MLLquiwy17C-sKa0SJGB?kA{W zW)sapD6qF~2S7K|cw>mORG(UNl7+^oqymwpau8m})c1hik5KNb+dvxZb<9qO9_6@| zLCrxTVB2Z`Y 
z{E)WqEGVt0q;wHsrbW!%ImWiM7OCNdR^HrVCbZ}3^yFeV#P@e%U^~j*Q4pq+yG+Bfl zc^Bj+QeIoEO2-3*3HRyF)Pw}rPg^GjTcgFZ$iJgoQy1~EK-Tc(KoW_L+IApvQp}54 z1Dt_@KH!jd%QCKb$HiN6QBVRs%rpwgWVMXhas?Yu`X^omRU;vW(jM3k7&r){SMg3s zipuIhAj#HeSB_k5=O<8mi*YG$&7719+Qd*}G#nW}A}QPB@Ug9@#;nzf**d56s&wf@2CDk6= zttfQkdXGJv)^GE8_qm34)0ojjnFoO%e|&S68cwXyyw4k0#@E!a?^hvks}>AOR$pD})fD4OHsLLeBgCsx|7wv!(C=8o z^Bn7Xw8ooR!Mm#u_*!ybqiUn-QmoP%Qpa`8`U!Xh`Znx&B-V7FqHCHj zm6#g$IsM%H-Cqyw{_XJrEr=2n+eRg848wS-?)550*2%@eGk&!r#UJCO{lx|NE1rV< zZ3G$s-#yj3Q8W~Q%XuY;>eH2$+V&zwGhw?pw?>Mmy)#0*_3{*uwMVOmWAIO-01174 z<>!jsUEdk@hV#*L8)zb(076gm&s_S3-FS%gDh15x2Gw^^I9Pckl8of^igz%^#yw;OqGN3}p^vF((Li9Xt^w_qx$eFqD zU=Ut9<2pUDP~W&}m#xD3>APG4Uf|02OVNx0&K9p-(|~@~(~`Av>ytc{9`8QE%}njJ z2dij910v8A*ZpPRg@6{^A4lb#x}h#yU*68Q5#_iQmo*{CG+KGah_O#E7!Es9}O=LoaKJe>^o zJiJR1cQw3AtURYCHx_VOR-unul%qb&iW>8^~+-g77ECgBD)1yz_u=u5Gyg^Bn#37 z1a(ME@*FMWae{O=4DzdbTo`Zu@&Ztc| z4@97_S-35Y-jYyFJ&_z@#WnVkNZs|cE5Wz9-T#x`Rov-s$ zuiqZ7k+W3cNjaiKy*=jkFp2J78kIj&j#8DRwV3$ww6=F+j&PH63vPqS0&|K$i~+9 zBfToW9UE*C3JXtZ@;SL~gQG|KF0t3Gy3JNP@f*4XE19h)S137Nq1eaT@(T`@q%!rp z>n|XJ3B$gINnR5vBSXGygT;H(SC`Nqij|S;Lds)t<+`dp1;Oq&=9w2ykoq}a<%u+4 zCE{Nzkwv1#E7d6d6U5SR^|{X`Z1m@e=jg=kk^Q2rt;b6g76OG34P@{O+TDEjFJF$H z)**vJo9ZaBYm=6gqs>rUO2MQ`xRA&XbA6#mUi1 zBSEdnGR`Z`U3Jcqjf|W-ycAXj6Djaagn8YRiQ?klpzPY7qc-uL;9h(v8T|Nu(cspB zcOdLs)*>us=R8LF%hk!4-p$QTQS`K!_o}3|r{|f@*SlvzJWB;Yl+anra}b4hGn%2{iI_9z?evP)JkXL1u0BN35#T6I2p=3A z&nzWB5R!=^uCx!IJM<4LdptfTutcmBHJ@p{J2|s;aB^92srt&?`4i~&GwWTQ)B;4n z;m271M%FBkMLU6;;n9u7uabA0zzf`KJE^+1cd1QEZZDqIlKi;Xs-3ypb#QV96I?=$ zy8})3ptkRi>ddQYQo)ce4bkiQrOCTAhSnq)n16lx4nmVJ^4DOQ`cGcpGef|N^u$sS z{OdJ=9Fqj@9gS*U#fs|nb5N^=0@Rw3kXA#|++8+&6WM}q=5|Yh1*O?*0qj3ejj0~&fbQ{!@4v+;(aW>%Z>>pw0`^#`b@9@px(6vv1XUb<;i zh}iuaAm7hiyB2=(#qhlG=9j_J$BFwBqrn+3E(P|Y7WTDkKh&*aa0_rulWx_4Iq$08 zGGPL{$gUfXT)xI&C9y;$BI$Bec3^u?G z{w_}6)zn3Oqfc3xJf*6k;Vj(8F^nm-&IJf4ZR#Pl=eNNLqA5%LyX}V~LyE#1g=orl z)%qEzlg{in`(C=3ypI~rd-hr}Rpe-E{KTF&l?;MxqqUH&i 
zMCJs}u}s!a*2|R1875|CW{9*hU&=SlK8S{S@$oFCNCw}F)|*3NE_SEfRq%dcZvL7V zZ_)Caki(18x;nWxMG)_VX`xDS7 zG|kCDc24G1q1a}KAuEp{ujh{Cdz|-(Y7R9c-)$$Bz}67>M6^_Q6zQ z&PwV!W#PV2q-wg$x%bauVHE1o7-4P&uEn46AmWCv(5{NUcNAV5y4&tI&sF--CJgic zVm%mGDY&LM4Wh&vGZF*uLhiQdSIZ{KUhi4Bq0}Lh#dgGICDbXz(MICii*aSxUa{|U zxOD)Dmbv+o8qOnJ-6Q&>9Og%>Ydcd^^=i8C=s#Ku-Y*8-@|id_`mThRi_z^T)hR-bt77n#UcNM99Dks%Q1X+e zj@kr?&(zNlXhGj~Ax#6-*nkZqU5gKMqCJl71Uz1Ma)r*h#Qk`gv`kx@*kCL`-x#Bv znI$|9w9+4}Rt2RiA)URP8+TKh78a9i&lm4FOQw-^*+0iZAbdKhmQKTEqD0J-_kl$T zn5;9U^4sX>=(v9lA-ip3F?l0mu7taD(=0MtB~^@`NU5r?ddHLB1i>>e!@|Wt^qOm} zeDq^iaY@O=)5L7=hwCsvPeI6R&VKRao4utlv^6n4{+=Kun&6Tp*!ezxeaZ)mTMrkE z;dZzx_}MGHMulhaga4vT3xbmYQhfl%(x_LeV0WAkSk}b%K3SBZc@t zK2hhIYhSbxUgA_{ARLA38}R_&G3+wOt(<=8Kul*vV{ z+}$V5hneFd<_`D6%9coNIj0VK38foK@R&9kT8n$*1tJVOsO!d3+XTHmqi%97ms587 z{beJ-SP*gIHp9_7rNyroNL}&8Ms2BmUoV^Hb1%(1@OoqRUayypRN+mc@eWpep87Oi zQN6DlRpOdx_@;@UnlxN8d*3g+|M_Cp_5B8YXj$O9uYCpnigJr)lI2M$ay>bUNp7h}(mgc9o0+)C1XpM*~!G@>cq_)&~n$YS!fS>R$5C345GAmSMR z{X~iS-YA)zFwH~MuG9c)&Uj8#n74Ct(6x9tLD#pypQmIjt~hxGPjjKnB4sVizY4;o z5zi)#^m|_vsTR~SQB7@uD9Csk)Rve-X&6khLW_!drqrG2upqA;tcq#ZfA9QwM5%M& zIqEFu5{3ga$uUeh|CxR?3q-1~HWku2kOF zO(L{pWsf$$+Z_%U^Dv0G{Je@5OivVzgq>do`@V)L!ji^hiFC_=J-|g_|5(dB#W8Nu zwn>za{_J+A=5t=FGf;;x1umuBG2Ps!q(Cb%d$y*cVs{|3xbDTS$8%1Wg&P3-Jm2GO z6q&tGvjkV5Cct^Xo^=X+SAzTuP}em8h(bR-_D;LmeGpLIi%3=`KO^Hgmu6}|cwIbn zx0tEP#63;9t82UsN%)or#mV#tMF8>S|LLI;b#n_}`vX0&`{K7MD%llCC=-3Oc3kCc z(W=?42Y#?~BjafXrbpgiPs|>$&=BAd%MI;uFP)OA^xO^2IJhOVgDT9MAxE`E{T?2k zU4QC4T(QTkah1<8Zz}#VO=F@1ZzfUJOP7MLYuV9(R8c690Gl9|y!KOMt<_MV-| zc-C!<%$cv(n63ca70(NdE1Tf|(Dl`EQEuJ$il`thf;7_74Ba3S(h|}=bO=ZZ5+W%Q zDh&fjH$w^14N^n5v@kT%4Zp*^?|WbG_ud~L|9Hf~d6?&%z1LcM?S1(7V^xHWS@gcv zFfKOl{(e*cx#r+Qkztz>)^iNre z|8%1Ut4-m$dJIc@0O_Ia&GJ0HRlW^kVY2iu#Mxeqp3&UX+_nfaJyB@p;+x@RC-?Jo zxYt@qmHu;EOcZ&AFD-q~)>DFb(LJL|?BDM^+9#j|w|kFbO;iaTq$+Z6vJRhSqBkfd zi(tHPca3yueZ6-Iw_<~2j}15BN%bhC9TA?kf?<`A+)MO^%xuQ`{mzm^@Wz={_J)k* zOGh%Fw-DEGkxKw&b~+;&%EB#}cb2%oQ7l`nL{8O%D$*i|6Ryv+?upnLAky4Gp5yao 
z1hPSe!xw$bLcN%8R;a~RHyJrRlXJ(|v9BQo!LEhg(ZR22PDNENyM+>Y9vewpWeXGW zl%nNF-i@S9P3C%4!yoRG1Ub}u*Wv_TPg>JT)UlIK3}vTZ)&o`%@7S+d*UKiQTc9C) z97#yk8b35cx$gH{v|(ZQ-7|36q>tPvUdLAxjOFx*rz0;bXu;4u4RaR|K^lUl5DIAG zY@-IOtyXZAkut%~wk*_z0-8Jo2v2lX^YU6!k#`({+z3;AjJR;pK6lb+l;L_;DoNk6 zl4o1}NSXL*c}xi>`wG*9#mp~-^W=+NZ~sTTjQ8J$^GSWvA_3@z4sDJ!n2`b+WGJ4R z_Ey$D#KJH0$v*;)xH!Yd_mY{3#(h6#@s`Pwx_LLW_ttmzw>JjcYb6{=3nP>7lga-gc(T~!u=h?I zCxs03BKTs9F&FxkJhDvh_eXl8MDZ4rSlnvf2<_4CPu-%T}U$0A}#Jx6E^J zDc$iR7j#~E*ujdiK@Sg-&szkf~3qrxHT-!dvX*y9yepBJ4Va|WDmM8ocwhCBXIm^t`FZj} z&ac0}zqH8jjRD=Dej|@K@%v1B(&N97e8vfgl>R)0+1&+5Kw+EXhV#TCtl_cjXXbq# zkGh$s>5r>@=Z;QkWetoPX4VfVaHR&2j~}Uam4Vps@Tu{EN8Oo>%D%hF`5DkzG4->p zei(6l6PJ?}VZH+8Y0~s9$J5npALQiZq}csEdwhu>b=}qA9gp~GI@&ddoK|(k!WRp; z3`bM@E>DAT7i6kic{WQxfRa*`%gdh+^2d!xKD^YT{0%A?xuctsA7q#Ytz#C*CCVw? zV?nh-H!VC(Va{1f{Z^WkMM)yfwReyg2^&1h>p}B_$%yT(nTq-)x}FCnuR=<-uXGqM zb+V1uzORV)`>Jfr-X@`MlSBq@3g>dPu7I@U(^Qs)z6O@1L1xoYAO^S(2=pDl z@)uym&3|S7^0+KOSHpF|$ywKIT&)>aViXOga_AjKVF^D6cbJZcwow$Dc#zPZHaB5; z4zW4CUO}SJ4Gtj`s+0+U;FpCL_`QXo8&-$bu)Pl;aoCLMkB*IP$7Ze0;_c1Q`5!9} zdmL5ud2P8Pg=SCk^^Z6&Z&RCPIbFuiJH!!Zk=>f$w*eYSw*ZozmPkZc_H#sd*&%f= z5My-O$K72OB!Yz|6t=y#-&`5GRdwrMK5Y64QVb+i1q34)hTJMLZgAK-zL zxQ#1uF8{c)$>yk)vrT~=q@1)oqec4r zGO9v1l@`e&KJdKFav{y3dzAayvO1c|Fg}2_Q_1_}eh~z&HNpEX+9RsIaG9*^C%0x_ z=|PD{615W2P68*tWzDFEdEGbWlq@q6&)w=I5&7ida>h&Zth?3Z+v;g@EoU44Nrlw}9wN~uD#QB4-GyLQS z{7C!B>1nSa1OhSZ8ph-7N!G#MT`%^iJ*8~y$p-t=G3|ZF`mK=J^JTv6!)UnYlF$0i zSx+=<4@ockDq6H+Y+ZyiggVb?Bi|pl!Nh6x=qv&WYe-oDC$2UUO<|c;)z@D>wMw|e zf?QgI12&(IS&{Go2B#+%@8VG25Waa)8B@*oWD5I^-*&X>%rnd_+u`-2GqEm#EBjU* zKG5?k_V~-I=kT$!rpF@mnHyS{YqoHp|-e>kzWR)09;| z7LApc1++q|(YqOgz83+~`F3J^(ts2#4&6{;A-SeUo07OVfF7mc0Wq(c;Xbw2q^?9G z>=%`hz<;I1-?AmzBZr?My*;Kb+}m1wuR5w!DPARcNu$<>J5fpCGSWupBiHXsWY`!D z(MMhKj`3$p6y0tK40ob8(S1}FW;bF<$G;4JjrA%W<5Qq z<7OX8;M=zm;*r7Gy#mP>67C6E@g)hTUA>-PFi}28f`}(2h}Fu;=8HRxAvpFe>ca^L z-sHUoiUo&CRQ^L&aJw<@1|J7F>1<)^H!-kCc6vdMVh$~N^R>{h{tBtu<+i!CW#F~@ 
zv(pC$#)c`2f-&G*v7ohwsnI5$})OwDPuHz4Hh=3g4HYuF)$93DjzZh?k><$M< zlG3~}NCL;D;0Lu(S(R>-cF?v{0G?L_cpe94)?_vC-+8;;KO48%_o%yDP*^w(=5fn2 zUP;NF3?wBbUDZEJx|98hZxr>fXXpEZ=A+ASq_&Jm1ck=R8~=d92XoDWh)>Ph;e!XU zxy>=cDdwAo-DaEQ$l*I>oBWy^y3C8(>w+@S4lCo>?xZI*Y!ZgGkBb zY@DE$$kTQ3bp#0Sc5sO@$R!Q%YnX$GNLwMH?n?&;wM@Z*di^fU}r9=0Fj|;{B0e;yN|}+S}99)9h+z8CD)+ zV6DL2-gAXRcY|)h_G0N|Us(Z8#RE9i9|SJ>z(*2o=acOAG$^e2I$ho|nQjntm}~sw zAWNU5{bVhi-?Nw?D#B}tr-R=jqCi7ANi6QH`czwlhBBw#w<1hm`0ruDC>waZQ2uE9JLAT$698c?=KKbpC|G>M=zx2n zQ*SZVTj(yvKBIqqLkkGb{~%u|=w$BGqx9X(4wLP*uYAR!2G5WJs&O`ujd*E}O4ICy zT~j- zo`v?!cm9ZfGqacMCbU^T^m4D@FcKX4G-;O2AD+@pbmn4$04MfvdD$GCT~sNETpq;x#}8`0CujnK{N`!lwtgzh-cQ= z4J!cwi@*wMDd0*GJp@OdrQn@=yYQ1;CS05vaU4S!mnhX(_>mEcpZz&61X_Sd*eA;v zw&i{LMfSqYfrCg`TJGebr1u|KpXfk=w0aIg)vD8hZFhHpzuqr|?oXw-s!ARPv;zRR zaH)|;;N2Kpv~6^tUVU~Ifb-9BV>W<){RD9hL~dQ0e5Q?hZ?A(^%=R}H)DzYip?7{3 zMErqw9pUuyb`Ldg*;e*YS5f~&f>57T*a~IfNJ8C7XkT!#$j8>c?L4bmNt&u6k~`m zwqFkfgALHWc@v?aJn9#J)QY)~D4J{dtNo@g7x$!VAWwy-!GE0S6E{=k4mVwFSU=5Xm zBiz8Zaiq0^z$#*JLd?9s0Us&Krmc2~9_(W(I=RB%yF}-v+Wq%cDkA^(Q0i+~dRRGQ1&bT#&c$lx{-6GGMtD`{3G3z<`|sR8!x&w4ipy8I~WAwSZF&QN|Bl+leuT^CBk10w1HM ze^dekvIbZ9H8ltKod=yT^e3g859Jcf7Ap4Om_dXadHm$l;Tx|lqefZKC!1mJxJ>Le zK8nG%TN#>mk$tn(^C2m|+VJj{;F3q;xM&dn=W}U~C&z)jjiQdLg^HrT4GH!5;9wox zB=hImsXANaN{bdcFTh=Jn{-`2@W|f0aNB87k1e#`%2mtWa=*q%8KWVK#}*)qmCjDB z|2jMZMF=tIhPp21mI~U#tsUf{Ac1Hj1w-a} z0HB5OlkXY;tt5kPDCeTX_JSB>cL(O$vbZ?bA078mq?>=7D6`Q5(pGH;INqy%lwgUsxp)bnB zcaml&EB1Ct_d@q*F@wxdFs19vYEOdQ39sqyZL^J=K<4SWzHVF%>sQ2~uS0j}xc|5H z`e(VSKl<&EqVjsbIZ3Gkjjle`t0-(cm<6J-M;c5nvfYX1q^G%jMnBXtVaR&Xp#t-i zh_UHm9vc%xC3K%?Z#IE#ALtl4-C|xId0nlZte*V>x{v1kvfljMdaS=T%4mF)s(v>)$Cpuh&( zv!IpTt!NBQ2ZC=^)b2^^@in}~(aWX)5rMKvK=Z0d6@!dGf?xkeO;N++#YKKZ_0+{@ z@;;!=TsbYYzVBOw18SeL47(D=JHtVv5;Y5>3-eIzip30S(NsO*b9Oh4zsV*0neX+t z9iS11L0lbgjHLh`sM7eYoeC_#pv)*rsyq4&{`@|}2i?fe+_YI%r;Xqa``omJ!WVm3 zPEZ%lCdTMwC0si8*nIvY43St>FTol_Wj{Z3es4DjJ=*G!|&BmXAj8nvTtgnIZFV~<1>Xm{vG=t)7`IWAu>VLS z9Gz_FZ*TS|gfYTIktrG%uE-W7rxPl-Y7U=8%W|Y{>G5 
z;MI#Ua;Q}oEmB7RDwfqjK&k0npgC>X1Ng#b9=_oLLmD+*VWQQE4)cf)oe-*q$~Xbf z>TwIRHB;kD11bWZ-V1=JN$$41$V3SCg0t*jep`@m9wA*MZd1q2L36&Svf;(iKXC9N z3=j`4k&Pe#WKBSz^$A|^;`nJVi`#>8c;^f_b&cI5w63i(FKQPU?tS92RX}_|JrVT+^{JLh8LQ%I|JPT%h?A#N)hXar*Yx{vX8JouR$oC0vf@bj8%RN?nh&IKre-yQhRGwQFv*-+cn6%~iNZ}f>$?!Nvs>K-m=G4x#o_ zu9`DMr=ox$7C`1DJ+PH!>;pQd_E_T_9K2&6(HjcL{Ix9L_mJuCso*}O{pE+W6$7Bm z&8}20XS8=Xp1Zb>59J&vB?%Iw2)n=H&#lXILGUkz+6%d?4PV)6xn+v7)A!=6|6W!v zAoTr_({W6%0^wHX)?&yn=07z#k?v@Q@}?HGK770ujd@UktWcZkR=u zZy2w~YV*g>A4;*^qu#bT`57b%^u;z;!E3WP?vIiVG&~eZ3U^Al*~>`Ya?$mak z@NoWoZz&oB3bSbVX_czdF*3Tt0pD4D{H9{ydD0gOixT#w;qtJ+5sn7m&oX5C2koz= zAq1$-DEhrh93gX;Ecbki+}s|Nb*23b;nzqJ55^Acn|(j)SAah;0$f@s8kc$SGlL;L z#7QF1jBn4zfuowI-vF-Ojb)?vy0T_^Qv`JBqz{Fx5QHto?!Q?{t|%E6ZEB9cw_FS=&PaO7U)+8yiGf?LY|xN3I*kCU*4Q zGD<&xNGuTLL?O?Z`zEPvWmB?>519{Cu5AI3k{ixUKpmD(ma!p`Kk|xQ_H&Og>G_h7 zA>$Gjz;nh;7}2-9)}!sOuCsb|k-+<%39sO^$LqLqUSW30wx{R8i~L+`UilV10TRKp zNs{4OxBiA;2$RGIR3hxcCmmltfF>eZ;|=Sy*W68wU)bGz>c(w!cf|)QKzbeV&P8~c zGRj!Fw6A`30%(a+4J*O+Y}Y9c!`1d4NOzLgaf>L3N8*KrJx?>Fxl#RZ$@M>;HPA41 zT^FSn5VBt5?+VVuw9K6EzW)wI5MihPW=8<(rk3C1;Q<5EQc)vMVz^Tmjtuc0i4bDR zit))u&Sktnk?yzI=2mA9nUM{Z@q5&Q6cres;%^Sw^K&yqbbV@3-nhSF3`7~CA7D(MNr99( z2goU)3_lJsZy7AWx#s}y6JL8aq%LQRBN?vL;a}4@%T&;1M8CHnC z%fmmDR>QMck`floYaMaJ4*EMV3qjt9Lm`An|l5lyfh^VjC^9}F%z~z1( zICinoy451dMnHJemyWKVc3*`tHGk+c#ePTtR{Bf``HHAAgCs_a#kMnBrx6D~ZRUCR zASz!M{Wef-aN5W7gF6eQ>j0Pmd(9%-i0~X#&<*+MxFt%4pNqsU3Sv zQr}Rgh~{|2X7d6H%wB5kKc zVtB?__D+t0E7JOb)b<1*WbGM;je>tm2*oR?`{5o^1zAeVz!iFb(D_LjTNY0P1!dk< z{B8c4b)PN{OF}Ikj1Q-K)XVtMI?+Av0eAB5z?(h4-d19VqxQS?pQ!EysJLu$~XRO9F^Eoo(-k^zjErDrw0P0Tr4-6G><0fNF^h zyYWT5*>Cf^7`cY;lKR8iIUg1Kpv_s7oFNM|>C}i0G?r*T$>eC0NY|}SM;DaM01`BV z5b#mx(4PTD18K63*i?7>(c0076p zRDu3!%m%Eamz{j;wr<4G4R`%2VEoUnm}9E08S!)CGLPDdZ9`+HCh73UNs8A{+0L88vvUE7Q*IC(9JsFE z;7k0CdS9ZQK!0`Ikr-*dIy?a=$=@Q;pHWF6r15sF&*P(-Dv6IXkfK404c|O2U1L~& z`egOtf>Y_*hT2J&(7I#QwfV|5;jI~}5%C%3%`URIt6n=xB*|W|RTq(FofVpCLeE&g zY@aZWp+Ql}AjgjhQEAm;HDZ5tG`pYL>k;Mi976DCS)_b=C(gjhvukQJaPjO*H~WT+ 
zN_k0LT^Q0{%pmG^C$V(hZ2>fY2(Rz+^qCoZy(f1`6J7UEz!pboABW2_QuddQ*unO? zKht(ruOxM~14FrE`}osx_ldY5U{`XZG6n|SKfDdI6=Pv*^Hhtu5Hr57$ZKHn@w%*i zEA8>A-uH1fD1a+-pm2q(>51`-OkDea47 zP`*q*5Cmn8Zr4~z#0fii$H0$zptep3-RPx)c0n4!e0uq#e1 zrvZ^;JuShbpus46&`L3NY%@di5s#tUy#Nnq(XL;2L0yuaF~(2_I6YZ62dEqXdL_Ot zDWLZ2do!Xg{6u>Tx&1F`g`N_}m|%B<~1gCO2KtFMNbXm34X1 zDtAlfmD)A_9_|mXbudo=!hUenE(_z*TPN>_?c>ko1VG+)UK`F8Ltv_$-T|Mf5R_%T z+SRaq{BW${@<8lza;*NaLGTV9*s@%qB^It68=D;MGf6Ii_!6EOi{32j%b-~Pfe1R= zAAFNvH)v>*dEr6f;T*Gyd5Le@`R}_AL z{O(FHB?IcxOO!9P;D|7cq3z5}29u5YmJF88_~aP-KoLSch>!E4$*7nrv(|IlmU6fe znGb9n=v1Xb#kygjxM{x#yMl@Q1#;f&z(K7%%qe+khuvlUp<&CYa`uR~^`-|*APoM~S;{#uaU5^m3ZnHVxjty)p zwKxIs(SGKv{_An2C={pyVl=X!`$9dC06zJV2ojo;vy>{iYRj~q3QJFS^Yx8km69~L z-MQ|RfW!9d|6ZjK{sVWyMm%J-o#Qe?cY?)Abs*7JhaD|zH3l_{S{Ks>J3*@aj`!$w z+8>WGEEVZKK$J7{#CXIcm1d<7-=;>tP_W6#^5U~!H zA>_-|WdO;+&sf86@nXz-XR2)&bXN|<(w8I++e?pa`5TKi+5 zs2}x1J#fcmZiF)aEo=X&CjpnV4BOcpr1N+HmnvVwvYg18N=|b;L)aS}{H}_|8hduU?n5Z{Cb8|-JCQ>yU+h4D6P-ojQbX4-x-fyqWm{iY_pF(<^2aWmbw?W5q{Spv%9;I-n{%pw3J@fzvj2>6 zDL%BMK5DmH<-v3m@R0y$U0hbCM!dSl3flBvM}T)1W5TkIpJ(IJsh37`+{tn5c{Ou- zdg?_4T{pZs9lJVYHaxjEL0&8`3`)IC0#_A|m>#Fzneo3SupoLME;@du)wbw-PkeaI z+mFF1Yu_yUXiq)|*4dvd^EI5jm33HeCW8Jh)hM6@FtW41ea&5_B^yuDPF=}i!edB0 z5u!;ZqkfX7jNm-L@1-&;T1JyudRg>EU8|MMFEKar#HDE=srF+=s%8K&UjDI-0S~>2 zH*3bKMd~4+(sw47JX5BcJ-zm_;Mwy*%pi4xKuBNCzP7;D+TNE-qZL#CegNEA(rNlS z3GTIQr?|668<2Qh6}*`?;@6{2H`wb)`q@!-^SnN`Pjaf=VtWnWa%f{NMe9Q7Ehf14 z1oXMs0Ku7$*lDH}oc^8Ro)EHsiMCl!=H7MleP9Z#XzFpPj;@Z>cOQH}R%TXI_xf~t zpNSA@aEj~zdebVxDa#oCam2R8fbsYfJdr@@gNPkf$%FI#lZ0Z+&XVw3a9qK}+fNsN zkW&phV#l=D8Gq9W;9S;z2kq=+Y?WP#o+X8vKIx<((mV?7c3uo*P6=m<9(tWqvRn_0 z(|g)P2Zx&k+QF@0g9Jmxaq%DFp2lN{*rF7dprUPXp=6f)(#0m@FFyMNS=R@_(Q#L< z6>l#}n2XZuXWg&jWp|IpMn?@?f$#_qtv&$7FS_<|E?~wWFK+iC>L4O177hHJGQCDK zT^8<~7&y0M^b2)=d9&Nvu@`N$-x#skYTgUKS|faG3zXbEp49eQlczdjYI87bJM-}w zY&&T|e@LG$+Ta*^|Cl>suy0MtyXd(lMP=d3s#D_W;v!)S5g#TveNvBRv06=bI>{a^ zJgik9V9AV>y!%bs(pOGtk&C^XSt930;M88gxNk_ZTno&D;ClUaExDxZ9lqBx--xjP z1!AjFJ2aUkCVn!x{$yR0xA 
zRia3mD(5=dt?KjZUf7cAG?SH}_Y(Q_TQg@LZG-AB2txt zqI;b8Rfj<4EcY6F5v~PRrBDXXim9pJHx(ho3u9-#iV<(#h>~7bjCj(=dsyUCap{@^ zg4xF7T*hOdM;Q`HRw}AnwwDDiNHfoIm_x_EWsglcfQ+{9B`ghbt4&(5rBeNq@y3S`f>``8k%fL~m71no??J)vh(`Pd$QQwqSfJgUW3on_IeHJ=~xMRAft>P`hvWnu)3%Rk%qCtF|NpidS5(dDl#>Gvv7R z81heuq+121#auX)PV1GznazgF3cs}Pp|s4HiiI6@SSu#qvvfXVX)OElUoXHS_9J<4 z_hL8C)1C)S4LtcM$ybN12YK*s5lV5cB-ZPB^f>qa;~IS3!u|)D`D4HsHk$j|g4r-rKjaX!`^u5Q`dcp5+ ztcT!=>tkOB132mf1KPewO8$JAAr0Oc$=5_NFDFE!+eSYg!bptISdUnL&-(Kw z!vDtm-iy;KWhuN2pGs9s@!4!c_MBTVSr~d87`Gprvt_#AQum@HXBqd}_b_9(5NNev zy(w>ef#?%_z)P~H(A1-W<}_b85GT_Ee_|Qf;1PG9`m3?&lA01R_SZd*BFBOEj_Xjk z&t!X45ns%0UoZJb8Tk}QPiEQs!aiq*F+J=%*g6svw4ok*Ld+XKKre-UAX{KJXg9lo zf+6<>Kw6;tQn!yQYg5g{WUcyv=BrEN;r>Pb`WT37vS^CaB^d0??2o$*@HdHzT@P?K z{?(g{M)T3F|HHAlu5ptNcCJo?yE9?sO}*Td;ZjT$A{D;I+^{aO326=+Z=S9pceWuv z1sbi*{w|BW5z7`TyB-^M+f_bl9YNhBxY_I7<)wkn;Xqb*#-t*OoU5i*Xw+qo`;u^B zbDV~8w=`5=J5DX%S;sVw_-R<8O_EKj&}Zy=58}OLn>FyYeInke*!!f2l>&dfB|XQUA(QU%Evr+UkmQGw2WFyXt^Z6b z#n!0Tdx-C#_PROcUTRcHn7#g+P5~}HZ(wL;_ zW>eCZb9O`R$#PA=XrD-LXIKy~=1r5` z(m4rCPcb_sW5+W0$f>xgnrOdbnpskmUZQ$3Zwb{c3>rN-7FyPh=EB$r-?Ch`UyJE> zy2@d&7YOsHaEu<+jvo5zVMxkQw&l?fzU8q=*>UE98tQR&di)7&8gUGurl7!|eP{QI zl7AA4X&PeuzwrFuzQ;!#7@&MzH<)8_)~m%xBzVcG`)KIyuR3F$xxPp|wy<4?a;)_(sB}=AMTzS=etjImTTxkAdB{kSA;N${-lOR55B`1i zOR*iZFSZ-;q!zOx=3jf_e;2-;z%tRc3~4_e&{5tYAmk>AjSsA3xJ;OnTL9%XW1fo-B!b8+n|Tv$~1d)Gev_DT*?eGN3Uz6=uAjbpqtAU!Ln>4z&mtNDU76?RsmWRc zWdvPN1ggbYyA(nRQAoQV&JEqhwvj zdA`LVC)yB9K}SLZvVIO0jJZ;(%;HQy>_=^QWlDf(s7RC?SC zN9kPvWFjaJxeNNmaKl~m}uO`vz*&1vyPk73x9X0{_+5Q#@V1~?s$heB@leNyQv)p zVDLrcCsgS=FE?^R>Ur$tEMWss+E^IJ1#>Y$Mi31o{Uam%pLE5mx zp4e}=AK3B>+ZTK8^M{6Fx+VMJJv(M2D?R1zFH6iu6Lm(}c5k^)tguCu>si~MTU4RX ziY*Wu$fX%jGn zGX|`hKv`hw_~$iF+8OC^eN=3ZR(^5L87di5oepZZ)x$ccp&hz-B3HmWd1dDAh`Hv_eL;w-< zkxj?bwM)i}kKcSwTrBxCcwK}s-Ax}(=47wuW<-`ve9!UUd#mKBZBgfE|5ls*=io*< z1YrC07U?|jjFZrN&SSd-Wxn0OPq=)V7IYWlbrTx{j9~i*dU-2#4K#ldMMgI%y2Nk= zS&<{xG>y5)tztjR+b1RB$L29&sq50Jm?@5cl-{=FRjO 
z?p3H=i1=M6HBs_Ol&yiZ`zwDW!{MirdjDsUg1^cuHm+ujWvHP#!@BQ@kG|>*CPylG zWt>hq3n3WwEh@;7)&?#KF%&Nz#yK+u#FTzEqb+l09|~RpeiaN?rL09go15~622fXF zSnCB%ksTHyXKx~N(YPhMRf6C_y z>X?bw3uv=T8o{TrOGx#u)wFT3Anns!HHKKXS}u8A!tSYP=5?1S@%-0p8cRc*H+MY< zPu2_*@cQ}{MNlAuYrUr>9o}aj#UtbWo$8}$(V6PDTHZk5Q9^NvySShtJSJ;A%Zoz3 z#mQaD8~6J6Mz8-mh6V5UD|%3!%9qsUsyXw03s^NZ&*ox<)D)-6Smc@WpMJ=Ea}sK6 ztvwm{^Q8?*1T5TR6t(47rlTFtH=b3~OcryJUfV^=Lave1q{X}CeN!Yxbc4yUWluhN zy}gu~Za9S{HN!?Md-?NbXJ45-Lq1(Bd3Ddm8pFl&GH?FFb*@6sgfg8*i|V=LsXyL0 zThQ%zeZenw5!NBy`re9J8ssSgq30R5fYh!6KI*S?p^N-yfeV(TF6w44|38~v6xbza z44l56&mKx=whNODIV^K+RK25_t={_VWMO{#xO#Oby`oX4Ed?Xq8NHKQUe`U}%iK2! zskc<5r^$)$mUKQHPNT(r9G{_hT6zw@2{ zTpN|5hS$Edi3HipV1!3uLb_0nEe*?XXsutP&1#LE$-~TnW}mjJahZ$k^qyAr%?H(r zSq#LSQJHEJceq>LXXgfwt{=E6rFQHlWg&tlQ%R8Xs}7bkzN$!0XHIAn^>&_9T7`=P^2VQMx zVwwU*`^$jmPFEl05?-3AsuiobS84W-$->KUJ5Smf;GU-nTV2Bxv@D=cT>MMta}@l} z`($(X0*I_%=BWnO7fS>+nB@A;n0dArjW^Md|8rMXU9bCcrFm{dy20ZYkjciyQ$)FA zpkp}i*5jw@o5!4S?pSJ7qDaOY#6KLv$+Ujj1lT-i?8zPw)Gm}+-Tkusm@6zWa94dx z*EO)rK3rqGH10*wPiOBvQf7TlWkz7)qy&zp`GI$K8(>O5iz&tQ&t)WSG+w1QxBj0&6(k|BVwfJy!1?ZFraI6)M z`%;77!}tCDMxV#J4!jpKl8GLAXucZt_;m?dnExUcfXRRBISlc_f^r>I?f1f9b|Cki zsBPzI#_v0W#-yd(VyiCyibiYnz`!(X0@V!EqO5zV9v#(8ma4(dcdMKA>cmqQZ=;2` z7v>IC?C}K9s~Nd$>jy*NaZ7mA11k2MI9wVxeF;$%^$(1wn-1dXe4jh15#WRF5@GU< zkpS@({dNo6hj~O8{{nTqQKkh~b2S{hyhnODq#i1$@7yNM%44j!+3SKDLnGf3UC5^` zP21wIv|@bO98*Mz_^JA^eb|M3Fiw-foJ{BCF&FbY!fhLmIG)tkHM=~#M2BeFMWZPX z*d`sc+*X{L*>?G`Y|H0ARwtD!RrDOOpQ~A>hI*`_hMj3#O35<~f0GI2VlP7P@)WIn zy{)J=*4d(bj|O?&Kkoj#BN?ppZV7N56L}A^_C(S!W<8w$pYY<4=4NHOfjpRRkw%wk z!7nkogLA$H&C;nw4s^grZJoV+M+$Rt7q(wIl!e+FP<-IKi=uUb-k%~H-GpPY%QrBvK z)+!5C0G@n6c18HAM@ZkV7x49atOxr#M^suvrg}wK36*SdYUY4C`dd25d3IlzcinDl$dBhk zs^F0*`Se6@gicnQlVl#h8X7?4N(Iy2%O#1Hx1oWKVf$i9d;`(fx1aZp9CrFlMzTf0 zEfhOc&#e3BWR+oLGbyy!Ti`2$f>LBg#8-ORNBeAx*vg*;NB@cW#v@QUBPoyrDjnZP zhf{kwrBx83EyXef3guq>F~%&lPZL!Y_V=7XWvYfX3Hj_oN|(PiI43l;z4e@Z-u?EP zsf&&))w9aCW{_dExFMTzCW{G)gheNsdp0DeFg2>O;H;OpD=GHEv5HAJUdZ8t%cWDY 
zLZ{KBM@cp^#KawElJVdP)UYQvXEHrKJ^yi(h*6wyTFN`>f^EMuW1l9z@LJP0*8;%q z;f|m=@F{00fc@ab~9(f#{uivQrqMmhZNp&pD-nJGa80j@Mx zh1Z-WCD%?$i7CAZ0iS-!^A~~TQ8~LVg(x8~0B*3D5#(|L-1GyEvnq zOP7HI&uhezF!FVdDiN{uK8v2Od)jjNb`Y;87FAuBX>fu)>O)A)ygzyDInMZCcn5jI z}`I>(4qf>4;$Guda?`tbKsOWZb)M+QWQn zAMHa_KUCuvw_M98a(Hy7m}p$l!P$QvR)bQYk&YD>IEx3IYwA7D!L^V1S5Ff zX?v(n7x&tabB)l08sk^Bs?-28Zm2;SUP&8@-Iy}%ZwNPX!%VcGAz zN_2fDnEEH`#wwhOUp^H+fMHUVkLUh)e)7|DUMWc}{iDvJQ&@1;z`)WEgOjM;TQO%D zs{r{($kdfknzP(M@kJ$3Q0)l7rSXxz`!d4#QKQjt3mqUNRD#+u%spB8uNJ=l-41(= zH;qP7!Q=#YTlmPxKo^rVNz)I(!0<9c^^wb&rp(A9l`nt04oDn` zA;%NKbn}Wb#n=~#i(!GH7s~n%CvS{;Uj@I(6;PboPT4AlZKw&WrO%8f?R!cKey|Vw zK5YDdkDt(+AO5d9TjgcnrfTus?9X6wlDCw}V%{%y{L&S9Xyj!TE-Xy&2 zCy~szUE--%x1JYuYcUwt;l4B~TUgU;1argYjP6(x^#{vGElq*$n8)I>D?U6vdpH{W zh}KumA0TigE)F>M5uBGZ;T?>DJkuf}fJ-qwu4SnDJI!nZMD&Ka_3)nS;2se;YlIHb zL;w?V9^~?0y|ZZGweZUCXf~wC1N!5u84Qpe&SVmkJD}V~0K{A246lYBW2_(*dksE< zvwx8i8iP?0J|hPm|BtV?49n_m)`y9k1|_6H1f;vWySux)LqI}W;RcbL?(XhR>FyFl zI+X5w7tgczyZ3(l9e=-Y$QRaH*IYB_%$#!$-b9Gv3{*h~OXC|Fhl?|fX?MFoThXmk zvfu89Oh$LyVrkT)-pS%UUu){v_u_c~j$=*Rwi~L+aAG2S8T~i>SS`U34OzDVvQ?Q% zuXeyAn`F_=SDMnD4KGl@<+(PMQ)BhF`55#}S)bK3Ig>E~-9}}@_e^spUvqQUJ}WU5 z1a?`2QX;C5NKe>Kr>aaf5dh2b_!OFr@*;c~D)<(J@Rt=;ZM|F<3WHp&Aq z;~pAm1le^LHG%+{79jraNJ4eSt&5PD&h|2SjQ&|OHTw(gL8Bv50weKq6nxA2Q?V7| z`<`jFZ`yS$bKqKsPj(Kn5`=eXy+YS<0SY1dRoRKH8PTba*+8@Iz$wZ%2W-(u1_v|4 z+XFxsm1B%aeBS5VK$Vo|^bFX$zprLPU9`}^Tm8^W8vm8aEQ-G&3p$8WqKoR4v7tA%8KPGo;F;4^iZ8y{lgWUIsrR?S^(pgC@QItr9wl0z@tds%S3suv`RN#y z9y(y+q7^OQG|ZyL-L+3E=9L}pG<}l3p5V!PfLjpWUlUThs&(fb5awy7 z@-LATa6%A>gxqCOGflqX-K-}#%R8E^hr-?MzIO_xGI7;%qR|AJM>Nq$B7gwR=ohG` z+&yR#vOJ*cT&Nf6@DH#v3l@4#MM=RHr>1V)ss5EIilCy+h^u+_XC7=0zc8l497{^= zJHAmG()w3rZ_G=c)nYB5kVydnBf9|Q-m+T_P0s=T$jBVEai@|!+cX0kZFf08Om7=3 zhF{ZE=&9b#iT&m3cKtwC_XEtVHx@ArLoh|+OFU>n?9X~N!4MYI)dcbt`njFvL?IJY zQn3ls{~!rNdKc6Ig__r-94QOfxP$=y_x~U;0*k$ZN9UD$IOoiuB@$)AuqojW z`E;tq6OD3R-e$09ED#vX+Hh29%x`%onx7bFoYjAq?z=aZC{L8Q?Bk?SR}F!J2cr6`7WDK{-vr8BNIgDHDvBG< 
zu8?B)l#IWi{6LYEK;x}KZ8Z60%r5`ehHmwhiRSA&ZPS25tbqqRY@>QXp>YH!0Zy4G zZM$kDb{o;t;J6TRHRzWj_KU0iQwI&&AH{~;glCsX#ArNgO)3p<#z z{MD^xGEL`f6Hm3mJAv)7;F6{CO)SAEHU|Kr4S(VQUdt=T z^7Ay8lHsa_dD1F|%8jJ`8v~Oj{(ync*2w0v^9`YFhfmr;F6m^v2T_-(W~_+dmlRT^ z$YeYZ@#Z3Ql$ti5-~JdDNE-&t(q*s^Cx_1%PUUbRlwvbhq5sVV_dmg@$bBI-Hz$#94*XxL^?d9PG(+q^mA_Dwe~Hu^Z8VJ`Ji=Q7=m{3a(j@{yQaGcyh80}+FV3gmgarGx=qiW$Dh|>{G5NH z-QtcmXHC>Y-$sd3F1l1m3)Mm8*>tf*#jCd~w?gF&1^kW!$Q}?*r(Xg}a!TQnT6+V=J zq=27LOHN741OM{+W22+OxqQ{t#!GJOG7j?%h`01_0e}0MO~=ScD6L@Ma>hl(UB7Jv z{P}M~68=Qy{8U;Y)ZauuN|1t%Kc75C1)2&nV03SSV2UnF^NB{ZEtz1>;6;{9%R=h~~weUdSk#NO!#B z#Wr=>Qu}n5vhZyA!IO7UhmC4?$uy0NJ{mforY~ye?f`X53BI>%{&Jk;eETYiy)-vA z&{u?u!)~U!hLl612ClHBlsLzF#Lsa=R)DTw#dGp)icf~_Snl)K#?!Yjm42^^0iTOy zB*9-0mILfU?SOh`H6OdvA4SD)$wptzS4 zFPF6QYRu;O+O3ArSEB27{6Z|&Obe^n#i&6J9KbxvqlaTeshL>c!F#svE__A*92_}? z^CXF-+72jr+{+*qOTbVj{(fTjKopm1?+VB66$X~Irht8JZjV4BW;C%trKAb3^u_mR z@W^ZdY`Xt=o+|1W%`3H(Pz*ch-dGZLQfZVmEyNm3M)}^vshSG?pOuhic-~9d&rhaw zR6Z7RnZHJyoaxqvJcx6J3(BoDT2&?Am#>=&fRmK=%r&+rjLGv~SRPTjybCJcb2ikh zmwl5P>=#0Rc9M3t*6M(Ms&KvjwNuK(wKhQZGxR#v>HGLbYz9_J(63j)n_EbdW}r|>MCWx4=RxO{(T>IA zUoITtHvyTf^4oEi#OnotlRMp*UG}~KFZ$dJwOh6-u?!`qiu2h^vt}>i>!u&;e!$>} z7Xz-P&HoI*!Tjq0;tHOf<9dN>+%*D@`A65H>^^H2s#1AJ8)Xm{vyZWJM$<4;#@^5j zlgRKXf$)yj#wP@4;e#dN714I}auH(Vc&$v*x%xCat(Wa->0PpgmFPkemo3G@QP8S6 zIKIy0>rv7!w4K3gHK>ZS4S7f3vyw<<9!wIIcg$um9hv7jxohCprX-HQXoVpE1-1e( z)`Bgdo%c7sVa5fB<3HaY5a{;5?}4(!JF+Js(9u@T$Ia=)V+NFHi?Np0gZRd8&uT7e zTiCM5oq!bs#Ej{@WHgL4tv*qc#4_pWF{mqlF+$ldD7}Ive2iKK=>^h7l$}?WC+{xi-$?O(d zW$fzPQ7oSWbb|)3hv+DswAWWjOn@8T86jx}Dr)qcFE0JuhV4M&-}u_7LicxT)ZRaQ z_jgpl;Ni^_CuvdjW?E&eB}I6Qq#%WPJ+i(aX|7=>@92?eUNJ0&WKBzMx~EABZ8JRb zH%kjuHh(GusVtAUoe1%#=L$yiT?-UhO6gsL)|>-8AQea-C7uMjvy$(aNnp?scYBFj zS>bcvI(C=5#2pPqOz*$*ywOzQIRz%RhGybfbvOx)0_B-t_uDXfsA5fZSM2iISvXv~ zCCW19oS%rAZDPCFIgt2sLS#MH+^)P*gj4%~fp>+dF;VFkKZcX@N7dW~-O}RC2gFpl z>1N5aVoHpd&RB0t*~>nP;+@aJyPGH;j90EbranPdSBnbwLm2KV%c(GNE956BONwn8$GC)z+Pz9hUQkvGdI-HKZFR>a){lUH2g>ovK-&5Oq5McKQD3#+|jyyAc} 
z?Q||fs*4gW84@xwR8zG1rH!G?Tacc({BuhKv8>|At~2A59#9!6ZKGOpre*rWUlT>< zKHEzLc#%+GHN(JFhI?(UEk062_1L!b)ff<1|D@&`-iohW)F4B-*T#1uOQZjVZ;j<) zaP>Qyhw%8yXb3}>O@}eZQ9P(Pq$*T-;qr~tc_UX&D+tOjs>sj-!g**t7EUyxQ(R5H z$Gi%}Vi=X#Gm$7LP%J1~uEYux@#_eSV&o>U<}QQHiqeh>6Uh~(%!kB>^;QtW42fn# zngh%PqtgpMupY>GYwxGtaS8?#l?%uDmBze_?)m zofd5^Xr`x8Jz3rU4Vxp~;v6~Ca-fygU<@%qsTM9xHfUc}WhPE#8TF&1gqGR|c90kq zzwYO<+S))pnDe53{(SQL(7}ebGdzKK($$`lE1u+CEu{fu&=>x0;-&9K89tK9rN8qEv-L=6p!M7X@HW6oYphj zCf7Og)br@0)E{5%!-%>4MIUnm<5*Vn^7Y5m?(?5RMFZwG{SNaoW4m%bQ8hFBO@#tK zqa=kp4Rwtdj_@vuQ8w5^S*}I`crFir3Gni%aQbk4rObgOV|J=caj>f?LVckU! zTxuT8yC$A@ruLfI#8Tx@RmOlEu2O(E_s(jiXP7!=5SH0sRjp%)Y6B=TTLz}aKQakx3jRqEh>#K z=m3=)Rc%!)z`v0^;@%}dH^?j&hoD4)Y%jBxdHnl-|}UmV#6sA8~@y zp_g!tdi`}nbvvH;PN+vWr^lU@@GdP83H1O+Lr)C-AT`O$3TV0=y&^aM3mX7j)%QT6 z>J}=bu;vWb6@=BT)&QEWL^cmb*wzNWW>~Z4j#nyEd1zh zPZ6y`9;rAf(cr&C{I;g(+5BWRkSo32cG;=w&7r~XaVfEip<{@mdy*kO&}$EicX>hz z2Xjt7X`Tf17U%SLKd{ravw?l+{Mznmc>oME=Ryu>ss0D21t2RigMAXv^R`IFPV)OO za@43vZ9R#1t<&X^SS<}9huno=x46I}=HJPz@fM?{9_-7`O{2uG%OkS3>y_`aUE1Yh zD3t(W#h~kDU`jf))raSTi=x;Oc*4t$2N)fE&0aLm7O0fR=3JKv=UAUnuiKgo#k5)f zyzI${8J6!p;Ey(SFHkX5-pbp=stp(H!v-Jy2e9RY2mt!dyVT=f{uf)F>y@0x7{=cM z)Ls6R09b)oNVUg?ScI@$fWBgku4I?bmZyXYEhEm)XDfJQ92%RCsMSS%Y!Fs;b`G75 z_{#9(Hp+h4^OA$G{mcem>r;#_rLf^2l~DEGL7*s>9+XdjyZJV$&180EA`?&=T6<}F zTqFIScm#fvx!Vz;6OIR{x#}?sR&MN*Q~mu_WiXo7&lGeBo$Z_ZSSN4L9P=F^w+^U6 z@fPB>7ff@`mDT$;+_*W!!UaN8YdEFupQxd84^5YwxxRz&9(Cr$2hDoEJ0ZSe}p*r;5Mb30>Pb zJQnT#j&ZbFBM;nH9Yti9mulII<`2##cl6gke$Tpf_*5Ms=*FC1QzHcap=@R>4il>K zm6+jzPcO+WP3xAixypi8q3=jHREC_WkUW!e(VKTPdEXkw&asxzz`1Ob-7c)84KW&0 zh4gf^%ouQj@+0ruAZcg=n!yjSH=k)4{M1m)b9JqZQriktEGqxP_=Llc+8$+!1l zhZcWpeMZ%1lznvTJ`Ivwzqi{wyy;`U-a5BlfH={0??ygtNW$)v>vP$u=Rl*%u~B=e z8Fi^wT!Z{q43{txD7}{v^kA^%*~r8K?}3hfRLJFC>3LeL3sW7$aD<00*X4W{V-6#x zvJ<7#O{f=z@31_pdn|?f3Lx%lQ^MT5&hpR&-gV|uqU`So_HI&f{Xxsnjefh0dUzAd zmw;B$hi`v>L<0o@+0&6qU}4cxy{zscO?QwzB>gnXp#t3oxt|vG{*vrzif_O8HfnbZ zqb345V=P`fJvv;IVnP4x98vS=(mB+mB-YMjoMx9Wv;82a{Awiv1NiF@N~mwn)8f^J 
zhyctLr8XI5eSwwuibyPfi+$6x29g6r#H6i zWTz}Tf9Yf{MJrfQ(sP<_TcOjWa&gKk3qbLVUA4rHFCHElpyjY-NcK#Xmb=Xek)-Xa zlGq6zr?@q?P*e8YW%1wZOf-IU{{G0yL|^|CUg)BAQW|%iRf!yA?L3ne!2uY%X6TDY zlHtUEx%o1&APp()c$6Uie~7!rOGs~s|1BWmBTk`UL?zHDT(FW|*2soiecn7Gz%kyL^up=#~81OT-MQiHW?1H}Rl^ z$~CkyPx_C#!N30dZ!M*AcSU0H;YG83)Rghq5n{n!!&R3G6aVto+44pCcnaLI&zK3i zR04KE`P0^-5gN?Z3RG(F+`3eDY&+OV^y#*F(Qe39x??Wl356R(McorDRr?Pzj}uWw z+xg0L?GRRdY~I91w(RW0@%1MggrAn9bQc0OAqT#KI!*k5y4cf0b%$xkjyK#&z>oQ#dOnO+Y;KPjT$Zzi(`3dTd| zUEx`M)%m$=E~Mg=e z$4)a)fxJ;HiLwjGmMu>^9yE&XE`M`nZ%_=HUi7ZOA(FzBEC^-&f5Z749TMv!)*%a(cdJU6{rTy>v9c4WX*eBPmZ`AUonG*K@YNqfb5;fbO zSDCXuFPVSnO)O@+K{b-?x0$Co_%QHiQqmo!jBM=r=joqG`1e}<(;#)l+}3Em+YzVl@FA2TH4=gr6mO_-l# zVx8geJZ_z%Ge`i4+js7J;UJd+BrAbKqSWQ8BfmqG+%?-DAI<-vNc9!i{x^IWEmOgJ zpdy<*eM$sZPPAs|T_bGEcQdS|7lPZ#QmGsNhI8~)pg4`HlcDamnwk^~oLXj#4We5J z`evS;hO;|JOs>h*gP!f#+04*;3W-!hsN%q75cT6PY=ipnqP{9zmxj}!Qby5*3criy zV0B!ehNJRY=-435O}Abe!29)BtDU#N*hIj@+vLBAE?}h>afW=sxpJ}8VzneVP~lv5 z`MeBKp-y#xb2w+?v<1u0DWe`B1l99`Y!#b*4B|hzIN$mjes}UOZYx8Q6l=_xB;Q{z zZEiZGwJMk)Rh)r5c-!ylLb5tY&<+N(%TgaFPtZxm?h?WvqSXk zLIe;cbJwZ`Qg^0&uqqjeY`sdE>RiCX)>5lhxf(eMOeRfgFs!HgdupZ4(Dxj8*Ee^^ z-KEUSX!9qEnp-1go>1Q+BuRxK>4=p8=Bka=F+85h!6N2c=lX%&FCC!V9|W7+scPUiOwxdnV3;w zkS^0Bd=e?zmW8pO0Q=s&YXl?uv$Hw~(-A)EQ{tw3F3iyTuBjxDqVk&?VPzF)*pSIF zh7AgCafu+Y0$Z)I5OeWI`N0^OYeS=BU=MWrRfR$0QK7%kp!;0Q(?V;B)j{#D-*#0M zV!y6Mm%NomRJ>?_Os0BNkOWNfJG{a%yj5*puhnhlgSoceJ$RVlmvDpbv!|B`xMz={ z?4NlrQXJEYpu@lU+ewr#_pg4m5MibqsLf#t90(F*cbfGzC&l;0n90p1;dP<(Fs^er=~tBz-Q!8gvu~U>LO8`Tnpq3bF&@_BEI4XU+bDdlK4wD{HQl*t2=Fcm zx<88m0jJ?*by!?51153sJE%$qAB-x7zcEj_-1-LB&o;a;^J?dC6Kh|ckDRu<7^0(N z=PaU_lw+>ld0;1A;|r3n=p(KdUl(l)s_RnnQbGKCzs+ zKzolw=*;qa2&R`%8yX z(#5oj-e z8^9q4{D`~|bYZ8s{4?k32dLZ(KDxU9QMLbHxz^%a5Tnl{!kcYY3{Ce(`Ehg|dUQG; zBf%Wjit@8;NK8t`S6Y66qv9F8-%$t3Xc4AQZ_|)g&}Dgzec3Re(9K8{512Q^hgI)` z0rP>wYVE2cAHYp-P3OM)n~^*DK2YwpZwf{HB5D{*>@XvaKu*OkI;Ezhk+(I!MkrWv zMuiOn+P{A$XZDr3$M!@=(Vz$u8Oo&8dL`#Y88#q(GDIcoKl|Vg-xm!bpnVR$#xt6dMX4CkljXL;C#tvNf}vie{y{e~@8Jia>u 
z4q-1iejexPU*-bwi791vKSsjFHWn;T)K+9f9GuZxHK*wu8YL(EJF(<&*R0uiUec_z z+AdkZ#3{c5?>n3fh;w<1L}33zNe>{yC3d~;_yt>?le_m-l~TcMXGZ^+<>mndn~^7{ z3-*?TDaKM>z`!O|d!uX-%KTRIkjWe|p7n=iBzlN+uQNcw3jo+JUgUVSP9I?Ycz%p$ zDbUc&kE`%#HTc}SA|M=A6#yJE6t-CO3-B}Rj+Xg5<7t2j^h0nou#7v}JrnoqeY-#- zsgCcR1glI?#1vGa zfqm4xthHHzM$8}}dsdjmh&g$hAN8$|g7R-pD^Mv&dI#eoAs{kXjla^QSc=TmJWUSH zN!;2L%}d#5Nl&MNu@$tR&bVfMUja616kV%w;-))k4dsB=o@`1%~Rz89f5Gbw|nkKqN1nO&jbr&v#leicC{n?62&G#WMw$1xaLYb}RP_X&h` zt)xL=5_=Wq1GN6v?SPFma6&(9sEs$ivuJV^_-`%%A9YB7xgTy7-#@%36f_F7H<@`tEMg>r_h$xH@`1=llgSzq&G_{aS zB9D+j7TtmOPQQVlA`<(V&qV}qbed^us9w##D!Y;r_sCN|^;lpvi%c_L*B0Mvcx1Zk zD_mFqrh_ZKO-8*R#-+x;QZ*y`>D2(W_M-sOH&KfHV80u;=a5o8V=4?@kvU&(YND(v zdmCU8Cbr#mZpKOYPRfO@bj;->Y$UD2hzb9a9l=hlzn%ZnA;C53^|0?JRo|7U@ZrZD zMPoQuPN#>TKp;ejnXS{sQ}ne1obc1#P_k4Dh(h1N+4PPIZ`n zy?EdIeD%l_7AA;8S^TbQZ%V?{oXR^zw2z_5aulmtEM=~meL>uMOuhLrNAnSj*SWt&96^u-NP5=Tx~6*eSRK=xRv!M=sHC_6 zi)jb5Md~tW=&(dY@rPDKCYA5(ne;Fcb$-!Ki|Scjj+>50O>X-VYH48zr zn`3LCeXM0@wiF!mpbd_pvEUvV^~P}PJAyrELfo0xJ)dGwoc~m z^RyB34vh^$6!tQ8e)Ts6AN7*S1jec$7N)U*r14S|UYQ<=eDq8kqtg}-yKF~exoy(1 zDk|8WTph5(Qp*K6>dV7>0VedZa5j9p+dJsmOM=R&%-*Z>i=Hhu~VH8N5&LM0Cf@C{1bPgZjeG`DBJj1n(6MFT$Enl%Pm`UAX|M1h;-!~L! 
zUnL$ZB#-KLCrH%$`7ovZZE@0e5yV^|Cb|zHOv`VJvv@~4>dM*%j3MU)4R9zVNEx4+N z&MnK;c#6BAC^c#zjlJI66C?MPR`zTTy{7hrSsQ8J<#ala&ebmSkjlL9MjU&;bOtgl z#UOBd^;#&lK=YlI6p8-uU&^9ncB{xL)CUQ`7FT0Z4JZ#7Q}HNZL(=a$EZB%z8ss!( zuOF(h7D!2$!kl7&$c)zW{bi+_UNn^dK9ZE~MJO6zR{Foq#Qwn%K*2=+%M(z*Nsi+pKBL^p2XPg@%(Bd1r4*L-t|@6K5~ZQ6oUAK;I=6JZ^SM6VD} zbnXzR#ZSMJ&!Z~kO;I|gk)N{mO{;i(_P6oYv+HhI4bC~w$hAz1u}nZ@HoKq7D|NS> zsn|;$us04YV*o5Q%aKZp_{0+w@{W4s;1@?gSXP|mH4Z?NWqzqnAR&BWvIp}Jun7Ex z)PaHGE^*AybO#i}Uk!PbMiEoX*JytXJvK91;Ot$1D}u%s||B>WZj@g}q%H?sDZ+%!36I@tn`c{jP1TdYtLSXjL>E}=s0 z`gt&{(%evlN(IcZM`4XZX($Y&hS73 z)wsNc+Ijo3gDor!2bK^p^YE%gS^!(D&{BQz`v}duafM?OvLh_>_3*M=rWv|JP&Mec!1|~gI@qO>^SA~1>`UNH-sv~zE$_ZJn-guh#W;8LGl_7`qZh5R$C6-AF6qeP~1a7}O zEd-Bny<^BEDJ2NVZTp73Q=+NhA3)r|euzH7Tz45Up>m>5_P1@V|GtmyHP|jzDEc0#qwV@d zNM=xdo@u{ZlyafwhCCYka&c9Q5vwG^$E^LvE@W!LoG#1C0QS_rY#?w5;pKLDXEe_4 zyBjMLgBb5@fkP8hwtpFJcZi347yLVpOG>P7z&6tRjOx{EOho z(l#i!d4uSu5Xl%l_cF+=7M%u^TE#>IiobLV*26Y4B*T-tiTA2Gmcu}$2q!k$AKY;A ziSjAFthe~&7DQT3}kPK1r&6|z=0s(WJrk`ULs4wu#73m-%p-P_U}L(*LX&& zr4}-Q9`KpfH%6o&3)4*U(ox} z$3hrCuRs(jey>C$4NImZ{vyP>=D=L``9aD;=v7!dYdU>oTv}>fn`y9iyKM9;Bz3tf zj*@$An+)B#3nDaI2}#~R>me+1w3_p*$T$R025A00?IW=2<|e;Hjb}^WK-4DjfQ_%* z@98lBGNnCWa9ER@fQak{I@aHnR(a;Y!L&mB^BMwA0NS$c(1^yFCtkAhd24&!lbt|6Bp3jWy>szEUIj>Govo8fIa>D z6RU(|0;CC(UcyUdM-9@c%1_0ZcOWS*fVH=`@zu(mLw~2-Vit?eq(Zzk->Mxu{)+NO zfJjd6#j8o425*>S_XI zW}f3-#S)dEM>k{dS=F#8aZ{Q3tIXXL5|qc?Or?XDM0R#*`U-0>Ryu|^oCuD5>TP{0 zrIlwnr-vF9S%2Vfg#Mf)kmTJABJY=|qp)SB`-GK=MVIaV8~NK>Qg> zPR$Oq3l|eWCn!qXo)0~X2_K+R0rJRyAeAcg&uBg}^Ak-GhB+unpiCd21m~QuAQmwn2<40dVBx(Ll*u27+}XIp%2eO>0%Hl#o9yef{AFWldrM}uRyC*a~35U z8EWpmy1e*CG{+SO-*f^6Ac~eJayBiwlRwpwl;Z`yKgFWvA89GD(`s>V!E?%Oi;dmH z4Kd1usjcE#OxxrKZ?;4+9=e(p=JCn)==~7lGS+Px>~BNnby$t$`0XEGx*zI2CqK1g z%gt=D&6%6dQ_&nef7POoq^2w96FUu)_X%^rodYVx-O&ZU_wc|EWe8K?SZO}YFApz( z9kkwxhh{Yj^ro28fh(dmK1P{^AOfX1gTJZ$cYRbC4(1yV6YCDk@Kq#o%<5h|YKB!? 
z9H{Ag7X@y_3>=>u<=Ct+04;FRoxwdyf z3eww;Kv~1fyLj*WQHN{t_YY2@B(c!5@75_s4707;iLtLJFAy8gYmi4NfqKk4%SJb6 zRFh+%oB>T6MuY;5o13xgEtjrXcyY{Qwgg29_m>Gl1!7<|ZIvj=h$clUexq06XfmI` zAvCj(U#f9fP`IGRucilNg`Ntxwb%(ixeI053fB>?4AYLwNIA+2>#i>L`+dA}Bbe2; zS=AEh6RX%kN`r%9JmpgE6MJh--eH5LsMVc^`*YS`U4bbw7$!JfWMTf;JKd%1E^TOu zCV1FGaHQpY-j-iyUZ*(6I3h{~aI2_4U0%cGt~Dt^oxjv*S)r3A;%2Im_RtBac)-=1 z|9J_Z0!5I%!?)wNWQN+S-~^dVlxM8>O2tJP`_6fiCz&qRA_}@ZbJyn4(!VD@K>p_2 zvH=cR?f}{A?x=_){Svhjgt*6P25J;R6?sUr*$)S0nZ^JUOlJ=%6{@Wp*$zfmZnRir zr53Jp*YT1eNQ7Kzv?KFu;U)YI$@(*a4e}BKy|^%7QSwor;0~(XfOEh)>@sL1>i}lM z>nR||xalClPey?t3Ja~!7DoL8`eGMZk1bWp&DavBTNwH+$xKmgT4B_5@TPdYKW^9w z$Tv`7y^zFSOiieD&3O6dB`&1|StPvy`M1>v>wY1aq{?JUq!< z=RxOYm1a-wiP!Ik?OX@npoPS8wCpM+hIy6X`Gf#(MM!Ed50HA9JznfT8rVkmCGr=w zWbSh)RwFAtB8W0XxOLSZhIDHxZaOzzkqYn3FWI4SWr89kO$_n&wd}r-2XA`k)C{iz zA`$)no`>!F($!(7m58s_@re$vo4q$NPbwcKwIYNfR@5;|O)zHS5=F=OO{nzsD8w?b zbo7LRhCl+JgCb(6$02^V#AhIIN^3Q2I`{1z=lv44LV%gs{#O>b7gM828E^pLrL{8l9{Z5`gL zdKmu$o_r+2dT5Q+K~*GXAt?(XPo`@BWIG*`e zYtA|py?BR474XV9Rs6r%6*rPcTqqEe&bW2kv{AiH`LxK>Luv;1sNTGqT|*qb9@J)0 z*26W(-R&vjiu?SwH|yW^9H0@BL;$=ejTCSbz05uwueBHY->>}iXCd7}KqB2`>t~eK zawj3plS1Sf#cTEO^t_r0e@)T>yOY&F5jj3K_J`l+Vh=XzD;S!P0w`ddR57FGww}3) z4njh^kMrZ6^cz`$e++Q)8n@>DzJ;D;D*ZoKeGt)H z;{uU9GMT*O2VVS@CczH&0Q=8a3296=rvg#)WEQ3joezo>N_wDH`SnQ;aS6m3)Rt)@)X@?kqc}e%R-uoTbQ;vd?YFt>`xFWXQ=%7QeJ{ zd~Qy)aLLLyHKRItMYrk*FXek`KZcEA`^+DU<4c}G1y*BL;a^hc+pRh_#^MRhM;V_L zzXLL0bs{XB{g)AA1ZnGk;xN#>z>q^$a{B=N1mU?_)yFdGYGa*ko?A2T)suLgp*&a& zOt*t4v496s?&Y0DQ<|*X?(l=yhoA|a5L&;R8{O$<^Ods?w6nX* zbi#j(BRaSr5;MWlzb3@Sc8-v)xVvJ^}eCH4g z6k<>(;^fo3Kdc$)NOzRgjAr-26fr<7Ia5VGqbpJ}$WtZwL&g=@$as<6^hx5P9CB19 zn<~k=wfC|PM*CeD`Z?_=we%ozLk9bsmPf9#zJupctt9!yr>PsqIc?2HPigu>OWhC6fE-&ecN9HyZH8-6IT%hxY!y|1E_;6!t(8-8mH}DJPjWXxS+uK_} z*2_|`AMd07*wRl-P+SE=wUjJjieE8?{hRHB!YHT9h=M^ zyWu_^T2SU;`-u+ycbA5K@i-P2@ z%7&7(7`bKpd2fYd<09z0cDYev= z-WMkw10$cR=B|*=QT{@jIycJ8b4zRl$IwU#{xeKgI&IMmudZ3 zB`8mQ;cv}|vlwCkSqeId-b*i1Z~J}q>;Hb2|9L4DbZM{;dvOy5UO|~;jeaM(rqvwH 
zXy%07m$&DZ#r}nk`bAY*41xLlosld`DZ0#`hXmhCD9O%dX}@L@Nb{(Xy(7sSavU8| zoXdMgx&$>(QaK5oZtvgIPTP2p6sibe`3CD!NHH+wn$pi=h0^Ypg{I3q6ZZYFb&igp zY`@{WW>SLdo5EWZ0B=KiNE1*tm~l&m7^d=ok1SxaG3}z~PAn0)7ot4oV<0 zL?Q7P0B{oM^8ry}0F=9Cz1c8l`-q-@kz1j0VP9$ne$=g8_&8&&)mabhaeY1vi4i!c z=wapuws?>%>$avJt=6sl$utAEm)hfg_4-b(_%y85;hIxsuyk0ic_>AbPr_(7fcH_e za^c=whi_p415%MU`RaehEq);VZuRs?&V$Kjbk;#N!VaZ32*B)*_toSGEwgVe`Nqr| zyM$`gfb5{VBLg8}#%ARZP+m=3s-n_NicG9PEc4~72r^sF4T_N zRp)Un;{{5Y#VAeC|ku=I`!|YVB<3!bv;*2h{2E?#`V4BFOk z90O42Wfk=z2JSl_fNGU^9?78FSN>Atn90O;?N1gw?xZX7JI%r)CExNHh?98uI z3NcxWO8{{A1^_I0I%^1G`IYKWugjDb_ta@Ia>0K*1q1@2k>&XIqx&a8(Lt%_%r}C&gWx4Gf^r0lHnP68xahsSCzLE~I#!Yz6R+b4bQHl6naYNZYZjQi$ zv7}v9(n)=5@L|d+D>5x~%GI_iX_b7T!p__)c^$bI*C;bZY#!z@(?inf${0FJMAG8aXA{{dDrZ6rz;t|tMQq*-BP z@!9_K=RVd z@f#D`1XJwkX(sGP1)vJois`TLf%1n&mr;Ex4`v^Xn&qkGYF#zc^Ata-rG4Hsuhc?6 zL5AyhZ=2N1DL*_3ed3i06RZb{*;<6eyZ{L>*AUHmkM$dna-|NMb>~oi29m6fIF>6( z07&O|O0apJi@g@O=u7@GB{468{bF5V2l2-m_P`J_PLbe)2w;6_UlNJ6n-07s3BZ{X$48JlV-QA@JeEo z;yB~5D5GapUUu(;-5SQ#D?8fy(nhcI)d$r-09m;T(N}dEfB*3{G0^~m>9$;mj zB-U2PaVG@*Om)Q`Tn+3cY;Om)g11gm4#PJO;I z_skrqb_7GIZq+P+j|yL{0}XWDTMw%_?-Or*^2 zuzFWyKef)lOD7yo;N zi}L^j#1tXo54&s@5M5F8FJ9XRv-S)AIK1?4o05s|t_m}m?pypnC$5|hj9)`6Kq9-& z?2<#SaT$11O*OLEyBFU-GxFjqT~J%x$tRZbY`ixJq>_*G;55$T_n9{v3g6@L$bjfW3H_BfPJyM zKJBe=YE>7e*~y%{U+oF-LJ~?8`+O1^Fl9Bpg}#aD(u~)$X0Us6Mw3>gxpQ4+X#4Nr z$|V^riTQSahsDKB>5IyJYu$|C3&Zn@i|%X1)cI4+S+lx;R2_*R5L?40@YsNWD$fM% zODf63m#V^Kivxo8;y^$A<9Uw&WF1wgbD7D>x5qCT=nUr^zE#@k2}DWo_A3%;BEhn_ z#&=IF(Ro7cg9d`0>&MRbA^cLS?D-c%tB%(s4cK6sMSES1W44eOH8iWMdWa(L{zqW7 z<%N-g14VfWu7>7pi7soue}JXsP9ksmcmOAd5&J9C8krP4l>hcM#;us>3O+tNLl;;+ zK$KcN!FG!itRne-5}y{4p)#W}D0^QTCbo<0kX)gmGAJ7?!`4I-6do;uT_X=)P=}d* zL_SJ4J9I|sG;s+mf3wDU9C{#Nlo+hVe>D()8&|nTl;|xXDW@NypLY!{VmwDHgZzr! 
zzpP|Z^TpxfJufTT1Wx}=eA2?Yh|Jfv`Fq&p=f1Svtf5$P@k z={__E=@3wm29cKj-ZOD$?stEG%$lWZ#wDE1yPy5U-uWm-#cE_duIx$!j>NOk{anqg zI~8BfDCs((I7|YSs_&n1{@Itb1_}%Um^K=yk>WSCiucQ%FFi$fckNzwqj<0prKG1n zv;ecb!M*2M5Y!GGsyV1bkf4@q=)M1{Fdi+Gr;?aU_V&CG+74KRV zlc}2@Sgx0Md}LmW8knjm!!>YWz(M;!Z<0DXlwRFoBH9@48(ufyYV-6V!-hHfie`>V z5(QiSBlRN!LoWGa;Zk1ie8iZ0u{!IoW!Y!0OzodbxAq86ZC~*0{HEN>r*Lx`(7tCB zaORR}qI*;GO{ZgyC5872Y{jHT$7h*t^N(q6tqNv|oj_pbx5~>J}f4!Jx*Qj08!K6R5ndiaD z9@{&;DX_z7<|6v#cOwxJn=SOK8Jz97Vf0d_37inGhZU8Pk*_*{wI2a1UvlMWU-PZ}#J^Pgm%f!yc2E{l_a~$AaR+)Z%uU#;Y38 zs)WIJ?jsPKQ9#^7)Iix$LV5DzGPTiaArf}gj^}rTk+`B2RG_segp;E24H+M~qyB70+VT);+ zm(em`!RH>lCAMO8>iE->vE3v8w=F_4fY`SbF(noTu)e`ry0o7KOt3$)3A9CMPyo~` zP8dUo4P9T-VMzWMt#BbK%i-g5`!7`@Fo*E-k2!;^CsX~&0utfav<)Nl*!V6Ha38q1 zO|du&<;l=FjIXU++Q3q)W|yNFr|^g@gv#5~N4Eb-Hos++l9zQWiyq?HRJjhu5S8|8 zS@_o!s|x&$kCk45KCc_hjbv6u%}}7N0gH+=$@n}zt1?Xf_azYd2PW#5vnPTRr4!xX zY(uIOgN2QZI7Df;VWiM810_;E6vp&;`pTfUENvaE;)b~1C0P!}1Se9dD>y(Uw8<6Rm$|rl zjNkmlovPQP78@J8P+fu6{pRYFR@7r>=IZTLK|$)?#U5Qq^H9~d9E*z#3K886T?{(r z-XQAue?EHxl%B++G36w!vG~{TeaI=|Qz!5)YprloPf@R*w`oZIR=p@4$QYg}fJO~s z`J|%Z8sfR}T;@w>>B%_@?1-9=C3KF*(ra}O&leopLMHu+*EQ;_vj^25Q@c7? 
z=aIwe{AwPHS@vJ1vS5Jy9i2iUCboo5g8jU3qtWeSNgf<5-7Y|IDE-$+TcrAJxm0qW zuyaFs7Q-Vmh;qdSI;QxVGyS$QqP~!FUf_=YxryHq?E;K>8$0c`GGAzT;sGZnCw7$m zl!8y1LmuU!Tds*eVgFA~bRjYcc`24_9NH;$1o53GaWJTv)>zeCtJme!eDWwv&+00j zcP#4;`H|^jOL8Tl0;zYaVoy=BWA9|&noSk|QnHV1hgjmR)eF-~NKu`pe?{!Cc*&pK z-(ir=W5|d(cmcvVWWgXiLxK7$Qbn|v;V;SLE{uwT?gYFIX3~9!lk}|lZC`giDvQNF z?p@RqO8e_>p61hGaFvNA&G=7O7{++bbkUGTUp!aXbj6@AjI1o2ypa?JiKVnXdER3(>hfC`*l$^2 ztP;&cd4J`$;5kVpG^dF2-sdDWdGrO(Rr)o!f@5aH=lBla`VDS}QxVnsOS8b+E~62y z7VzebLVWcp*MXHT;($7w980qoBbw-g$&A9!EJwP&n&Uhm(-=Hi^=;~Pm>ASfyX{Pj ze(1m;)s1_M(!PzBlY$0rV-ya+k~{P!};OUe0*@YwX;%@k1lcS-V}M z>EeKkN~)&X!>9DsHlI+F1{#lETih0!_)U2|?2=|;9Bkw(x1!*~S)P|XV%tJ%xu7rn zyO?ZIJ$sxhmO`{sLKv${7ypU_n7A9>yS`s308R zizU7UHN{PHx-)xNbbNe#BQWH&S7F$+<_KVA4v;?lLROHIVTpLX8 zuYoZsH8azXg#JefmLH51^>G4j?`5S)*Q3M3!?l!cKCT{Rgue zlM2YcxSCMuM*^EYaPgnP%ZIb<2omUvS`57dX07j)S+P*ds4?8IO?Gw+J!d?7@8Wku z4Cc5G5Az-z7JtASAi;Wl=gmd>i2NbuK&P^^tWJ;B3w$3DE~qiJ`1QeN2^?mVm$0lU zIz1sWE|sj1&_Qsw9>&YCz~I9G+^E~xq}qwZZU+WKv3$u zMo1y3w?D$eMIUgQC!p$=Pi3ns1e6(%sG>f0k`Jbc5F(l486fI?{$J=vO^L3uC^>*L zo5M(6PA*e=s{aRZ#y*d}>4=+QkP4h-;&rDD$y~@;z8@xM$oe2V0xC>3fJw^~3shF3 z3BtHIXI05JKUknVaX1?C0h>8n#c4V0wwWW96sN=zs$;YFzrJ&j$3-8`lQZf3@)c5& zk^TN~){T$tI!gNRNehvH>1d?)39MD&nU*}lQi}*NnN>8YdmDl8DL!75>&%_KSSNWb zov_0gBQxzWa6Ks@sxwA(qBslayIj@`es!Iab5mga7{QT1rjhV&zp)z71KSjvzn%Ks zuwc;B@(uk;qr>p89~mmH{z(Z?anE(a*dNn46mC~PpA7Bz-U_GhR~kgJyqP`X63DhCd`o zwnLhvuFh^t_tlR}$uU{S{*>!z{Hl{>`4^E@V<_G=*f~aau4El#c!?_P9Md~>>iA*4^B?S#*wx9IX zN&0n&q-g*hY|3~4N6|!@KMtawH3jN-8bh=%ch)WXLlOOT+QTsly{>F{NTG;B<&s9cqa_Q`<+)cPa*-q>a;NY5=d)ka)r&5 zf;)dl`j$#ss!z8iHdsLT#GV`ZbQVb!@&o=#+Mlv6I zE_ttKz9Ufb$?!Xw-Hncp)`o<#&|LotyNN7Tp6K9n$Wd3$ESvNj~EMD+#+m!MNk(Z_3{) zF>^)9cOMzvuKwM+<%n+zR}JRnx%+}23;yntw837KWOCOX)KKJNx*X%4_oSe(=mqKs z!TVS@W^Kx?{Kph%56VeFx(^Runq~xYQL^s6M}0rXVp7l)7aE@P-#%lUut#E{4*IC*U281N6cjS8=3fYqXszZ$(G#Yv$8XYjv`KrGoE)OZi~3JLk%REm z%4)Q~=R=`Q0)XxLSz3nf$XCAU*Krshw}VT$ex2Z-34c3>e*CDmmVcq9O!yGB6Yb*D 
zem*M6aYjYEIkI2ZAnukT`!#aa7xDE|*_PophH}8?a^pbrP7EiLi+nYTzomJz{)TUF;hSOyh?|0&G@q8^oj$6-cd8@lN3gfa3kmB>DAJk zk*R!;Mc%^*`j#Czmk+(wW?@qc{zN%yKy-XX2!!7SA|9J$WUIV*@RBdyy+B12p zp~P7Y3yw*m&(&|GvhCmN_lUYONnpyF^oZuf8CIyKNRHM+GirAgNmLRdocB?PRv#2& zu#iUZW7^gMAN*mBL(ZuV^RxY5XM#o}BNu1kBQX3T zt5jvf@^8BO7-bE1Z%Ad}wfaTG#)>o`@Zo_kpIR-zc^GS>d;)kCdZv zLY7xC+a{OeK7!6hb1?p(VxOeu5nqclCrdLzSba~s<@gPHnFz~fYHqOzOKOk6iXl3u zd5BjC{;@%tl0U;o$l=fylKjNQZ%ZNN_P z8`vq@+_H^oB3@$BbK?Yk(^P*L*|FX77{6%p;-ks@?n!=Bb&S}~IVoH>MuQ@C=|hQQ zUGKwSIxp`pUN297N*n_eDa&sJ2`vPki-u?5&|BBVRHHpA z!9(b2c=2g|w-#b{7klJdQejis1aeIc9zf9%O$A1oLi07PwU>(csBO!Qq0V(c>iYtU z)6om}Y{7*ZOg+g2(u?2cjI2GO_^6IWv%-zNL?rRIrmhRh*95gIbg zp`82gu!LMb^a~{Bv3|`DeE{SSWrDcZx6=tY$)akQPjxcoc9w zZDbalSh@0AzYp3Cx$;smAEqPZg{TT1$-?u#rvAF8jrXMZ3S*08rGfz9J(uyuh z_@#b&HFIL2@?=0*H|O9h1$}iOr%;UFej1se{;)lJkPkK&U&RM9@*{|zk~#D)V_Z7*p8E$(tDauH^dm!c&czc6a-Mv}6)cs!mk-~X2H`S1!%HQk=U4Xp@+xMBE7-=InQz5y# z9g?^1ZRIJJ`PLS;bSfo};T>8mVPG-#GVMRV~!}Cw1j7hMC?Hqyzd8LU`ME|LJY8g~W+r7&K7&kY;^y3E}ax z)&LIrA7LCrS(L{gB*7SA`Kg(ZHvb5CuY^ndt^9ts-mrwk-k8|5LoSx#aJcJR8i{kq zf@$#nC6k+x>7HMk1N(s4d^g%!V<)>;!T!mArt%h2)DBL1^RBvkOQ#6Du2CQ$gfHH0 z4jKsJHjNtg$s+rdY7sT;l_gAcC4A*-&XpHcy%BIqAT2q)eSF=2%=v(XVCCmK`kLu& zL-i^=1o%s{gj{kD`qF~OIH0k@B4c+%>gJOAbis3pgDqdZ2QG&%tuiN|>60)PuR+Sc zb(}yd?yRHk;5N`h zZzp3iOEBM^y>tE_`tVhI5$65LqYrJ_AQV}rlop4lT$IORPe(E(AfYCty&9Cws+K3c z_&z9eLdw7ZxP=@GFqtnCf9y4GSz}&AyW<7l@cJ$K!F--*5i#^cgefqI+k1ZzrNBwp zB(#Gos2eB7z_MO#*HC=kS#mtEMC;I9%hw)LJcVjuYK-Mh{*Ff^ZX?+6RxNnDY~Ptw z?l*~pg|?Gw20E-9LK|zH*~~a4&gwD{B?h*0luknMVh(+bzDtiDg|zRBK#+GUWS>$T zMg;-3G539x=>G;>3NqVk7mxP0G4d;a91sduSNWXnwxe=jjXUzZmm@hDL{yVB! 
z)~zosaand{uJX5vHzlRl5YHcsz7{cQTecqDUbhJ@yX)K_=O?)dtBj*DG_x~9gEw2Z z`Sonalx`5zWvkT`VRxfTJZP)3$%r6Rni%iqwOiT8NYYM+iA0Vh2@Cc39hoXi@W522 z+9F1*W-1FP#dbZxo5`Uyk$$iZHs2=D(k0;mbF3E94dVqgJeB58`sI_u5h0OdFhlZ} zI-m^CfDMW*k~WmPtqlXLZGmrid|@m*SZ;GmCZn9L7% zc6P3G@m!J(ZhP*|RIkW~-J?F|HV9ZcWgIs#mzb-3c8ALomz;v!+f_y@0K1@OPK z+s74-Hd)SabIq6SBLC{ zjhn$3!FJJgi=QTleWLE5Fo24~$IC@7H7I&!mZgsWqJF+KidTNAe{EsHZ>n*D%il3( znU3fGxS8>U$ZN{r#A@d=f&sjx=xB#ZqUTUWk8=>&_76Ry0+q<&8m9GFAO}^dTb5meP(Iy(PR= zeV9|F@&n68DLhjB%V#TrBHZ_g28z=*X5mbKxJm){TpjMVqEp+8d$v4C94h}yP0M{s+*pjE<+`1QFRLrUU zf(Kx=%#hgRG!Hlk%n^2 z&uLuP=1x$1Vju8MaagZAIBBBES$AL*SWWyezu)!#b_(di1XRCme)!3wi^hNF z5l*wl7Q#sD?9}Q|BQR3HbpZJy7J8_pRI9?_*=p$HVZ0(2XA%tyu1VZ51IBBLwk~DL zlUiFM{%*0Yzfi?LwJCvj|Rm=3)yK&tcUoK_P6LD zzP10-!2a;{rFfv&JZ<^W8*IC3D8Cyjr$vv0AQ_qiK%gO`wETZXRCe zxnv%x5$VlIo~lN_k-bv=L4Pnt_?CHlx*@OiY5#iBws}Ha;(UcaJ-e6kg@vnbfWv5Q zj~Pwx)5atxyeQ|{SVi?>djJd{yNX=iiZ}u2}heQ9Rma-GTc2;cpU!<~B>pxy4(qSwvcN*)@>$VZ-fM zwt!0FlkR(nVHnH+tBot)_FQ#w*CppUT@ObL@YkE zvX6yxRO9=o8@{jp+2!Y-A0X%?Ts1s9S!WZ^>8tcF`@=bUsRMAAmJ6bNfV4%LqZBm` zkAA7dmdaM6x#U=LoQUBQ^1j5Vi~h}bLX@X#Y+51G2?9v%gxT7(%o!=UePwxd_Kzd( z&)crx3}7JG4m>ColYG;@3h99)v!rNA39Fu`ACt|>fn?crJi?*8IWfADQ#U>tD_SVL z*`i2^d0V*?(jQ#v;9bHsBd?{E*lo1qeADYgCgEWap z4J&xnznOoZAiGwzN7%+h4joI+%)Gh-w71uF+1Nqf!aigJEsl=&foSk-p6$DaHiYTFs{jTYlKdmWJCQn`O$*yfoF5`W62{0h6<|ijqmhlW zlc@9H{gQ(gjE`M^bwx;=b%zrz36*)K$hN@fLSB$CvgKa@kk*@KDqN9bXLI#$FIuQF zT}eT&?tMOt7y|7KMN4ptH*Rmx+^!bW;~vTS$E%3>dM%s`5cVNdC9!?X^aKQ}0)IJ; zqARIZKG7WxbrIB7@xH{p7>Rov?ZYH3M)s8^63gJe?}*?MQ}dQNU|QM z;0ac96|3a_UKfzl*XXGHy!^bnTG@4D)hABR<5{$G+?P>_+vZ8_aiwURmo~2@t*%s! 
z0tBMuHs;U`U}wvM+(A=SB;1CLg){Hv^zzcVw-VPWMoaH|2rZNcNwQe)puroMk3^M| zD^05+g|a?XnizBPi-BY?ttZ%dtm!Wd?qwg`={^8OABr4s(72;~hI{A1Gji&kb^6rZ zTw#+2KJzpr_LQ~qYKacxLy6&8E|~2gW;=)pUcQw)@R^41S#?j-`u^&WLw-_A8W&(v zco-6^L+PR4n;97(tPS@S?f3%R|JHGf?-XUTn#+wG_?oE+)0CfdmAYGc*+fvhWYOxLjYGe4g1m zm0B6|(4XH~Xgbx>d^*sV2b?sFVb$DAr%R78ylcNpN&)w*e`B#|FG0L=IrYjvUg36D z41ne1RHqF2teLZe(@{}g_K3F7Wn6Y#c~ppyW%H^Muv0e}k@I*rgaiKV(X3nJx0I3l zVa65dDN)|5BzO(}W`qvx-PydZE2!GyR8wd)6Gc|#?^)o%-w{w8f0z3l_n)JQ$gO0t z;>?AJ4J5t5mTfV4{lkMAoX6hvImlfCi-&Tp-javOfkOFS-P89Es{4C0k3rZYBXw25 zsaZnZIrdFVlJy@kyB3)-gHI#;$tX{h__S@xG*_k7+j5GtSXDSTHJDQsrtzW+*`2DjLs*eI^HUwvYKpdd`J>-v@S|&Msq4 zMaI6Fk;%R!cW{$*=)=S;Yf5iJj|j`idcynKAoW3|T5YB6KBi1YFh^+S*5So{N_eL;myZt$~i+r19@)@s7o zb|S9}e;hDv+z)Bi^7RAW)EL0njs_GT*3IueBsYuE%q~o5t(f1Xd6D%>{6nZu1F9`f^IqjPa{%g&BNp)z$k-(tR;NS)8M}UmXUe8g5>I- z;mCLb^@Lg%YabOVF^~C0iTL%oi_n|z0 ze7`sST#`}e>yTd$I@&^AjBb@d?sd~UdDo4Oh8AYf&R%Aoigs!OeG|EL`}S8wHxyzYRlV`()mBQaCX$wFN@#C6_U)p=5U`2it?4g7(*t3;f1mzQ>rf~C|+8d}r-vOai3D|(pgSkQA_ikHu1k&cd zcQim`(Hg>S>*u*mq87|wT>Jzw?UY4SiTz(mtHC7yQ$lH#N-<{oA9%nT?SIwxj`51Z zudqTo5CbTACvjfZTKchTJ&m0CKAe$;!Kyq>9x}_@Q@Y z*@6hgcSW+b8mJSHKF95T^VRTSz5VK>tLnrlyt9v*iN2zy)xg8QvpO$;ky@aXE4?bS zo1r8rX0X~o$Ve(>v`{5Ce$28?Pg%7iZCNw-nUGGIeSLO$=tw*Uy~)IpVZ~9G%u6jd z3*yOzGR+h9QUlu7*!kWD2&k7rGlUY9(Bn{&qi;BlKS55rPEyQ z`lpy*KguQKw9fA?V+(dYj#2+V@Y0+19>?=;MF_p{YB**O;u!a0lt_X(+SE|?U|tW$ z4lQv#F7N>DS*c$-@%K#xEKjD7NJ z0ju#{qb(n>+5YN7q1B4x1+x-cB#!r&*U!l3acta=OOE1ne}-qo)_mwtZFR;f6oa=A z=Dt@9QpvsMG@LlXE$V-gmH$h(7j_HTzf?~J zQ=K=K1{P%inP>Y9uoX6E@G?UYj3Li~&PdtK=DQRVkCGEQrK9kRcT&1XYr7xRK66G>92GV=M41Ho}7+=Rx+G#jz@*dTxU*1bmy zFZvi6CGWDu+q%LB>uH@iFVlJ}K|x^^SPvTd{~9 z6*OiR#LG8-L@TH)xESMsW3o(dpzEDo3Ua@=eA!Y=?#L_rmZ@|W9bICUu7rI0$;!0W z7W5LGi46W}In(yxIdFaiRspDQ5B;sAU1G4 zZ&70a=-}D);7PEPEDE_<7Yi)IOv8J&bP7|!Xi%s21_*hG2PYtw_k?BHHeu``cw{^+ zZoVQ#a7K@xU%j*EU;=eO@j*7c!MPbFN1^!V6OOuYX7`1^wW!!u4vZkzlB`#I7qYoP zMOvB$W84&SjH?8G$Kg?Je&xof^|1l~saf-ny14AI=bBk?x|X(2F9P_#iD=HYEI4HM 
zKpxuY`{JXr5J+zoR8$`TD#kHd>0RiBmwNqtOF3KY%A5XzskhGm>U$|jF21q3IHhNj z(~VkzU=fSYx>1LZ$RmE6ckAw%eEL{~YaD@6;g2|afpoS^HH!N}a1w~W9a5a!WRotq zonhxh;XHWP!SKiHi_>r3fylC*y+MhE5Z9{u(pMP~zjj`o8BV)TyNh0>@) zy^b<780O|Ad3ho-LADxeAjK>JNg6QO5B(pfvC08GKR(QD60&%oa^@=$W12f3mw*ns)Qjyxvu`CX>ohU7Z(nHTUZ#4wdzBq)xh%Qi+`T9oGl!b=uxnhXi;h ziVrYIcu+zP1{^YsXIzTE3gP-BI@8MV)yRj+-ff9$3uL{^SW)@dZ)EaT%AO4LjbTIT zoVwNoN0YziFEN0>ou2730Pr+}1W)flI^#_D$nPdUB-WEcRx*tD1foz}j5Rk1l*IMd zWJX?Nz-P9oig89;khnYd$+j2#lIAhSndj=UK@go+!Cc|rz+9KT-bS0r z#L5{$D{G#XmX?c?n{KV&`KGn)`Gti&=XQ2>c*n0nyf7cc3mtv-TK8`5(LX7zdB*1F zQ$KHCB5kV0Dd1=V4 zh7+8G_uIXLmx);mgA-9??Zyeevnk21_j!Up#FE95r-q4f?FObzua<0h78G(viFv{| z#v>4VBEtr6{aRRro1$VysOz0@px~E=#=WY?`Elk1N)eFes5H}-pa=Cg;>XbsDXsCt zUCE;8EqktxV0J6sH?d|}YzmBWvyOeMR208w;yW^A@WHCBX=ME_<3l4Q0SQn@=~=aD zYl)z;h6}?!r6cL|&;QFo-3oL;nF3;CQS4oLG!e5-xoU>2Z_kK`wAU~N1oc=ZYws^S z@q`0&nX9SFB5ceLM|Zh9ZA=EAib-#%BCKcO!FDoknB8h!DNLkF9&b&2zkSGu%JChw z5^jrv6Hel1_xvUT`0-o;mCT^wbEm35G?!_TlEC=xoTwS=P{Cos@}M`J_PQ(v({TG!5Ef=U-Ny=k)wXT$#bu` zH-{VDn)t}b7r#oCZEfzR=qxS0U8V7G7dIj{7F0)x$O8Hyh&uA zz)*cMKH++<_aves58F*1kDj;CtxVyiXQtBpbbI2FX)q1@I$Qn7dv2i$op5p;>L?4B zTbNQ%o}bav7m?UJmzm)4mz>LNId|iErR5irfkZYu<|*h2xL;rXDztA?LS8 z)JQR90(cdPWPAJo!scho&*u5)BMWS<2Yxqr81z!(KZc^J_W&r-&zD>}4G6?P@aTH} z2UQ(!TNN!RB^6My?tNq9;|JVKZ42(xhBvT8_n9c4sA}4+W1nnS^3nwd7`W7C{NkeY zF@6e;FIN|7o#^RTS5-;p=jV~tWBTMyZ{P|vp_V3IqrWMq zt~jZuXNykfpwfLGjBI#U)Xls36XUeb^;Tw}Uhei}OIUAvOurISKvsmQ&7!%Qt%?(P zDPFz01Jn;Hx}5%YWG8~qqo?qIZ?W=Bpt0!x4*Ep$-{ORq4)UU%t`uFQpBP#fYdfou znY)>#?~KNCj9@1Ia_Pz{z5WoDu3is0w7VE13fw|};%Yi*e(m*qm{rJ&Ny6haaj;k( zpnGfcNS8^+vUQkTn8p#Kb!~-Mf^C!U_u_ld9F2^;Y?W$Boo<~=|5w}oEBkihRA3DiGO*bJB$t&CSLT2I`2%E+k}Co9K+NCu3`m$otp4lry< zxw^WR&(6-aJO2Q)kbh&|uHR#P@lbq1=v+Ka-Il+4t_hS*a~d>gB<1~S=X(Yn#!nY> z0^!;o4znF=;0i2})H!G6;W45{?buelTQu8#di9bMN>mB%f&?sqfX_|&+UR^f<*}cX zfV~G2N_B^OwL6H!I?*_?= zFe4|_Ot@F6HYwt0K%^{z7rN=lzoW-NLs8}}%_JfEZQ@V`%>XR0*o`N3ht+)J$ch(L zFJ!xZj=D#0I9g+JS8!{RU;v9_c9o2t>-VH@k_8}drj#V7csxc zE%l9^7WnZtp3(7c+55^1R&Aiyx$QHNtbev) 
z@Xg}hd8lfq`@Ze|@a}nbi25kP+_1p=N>x9ZnU!kZMjBw5dt~u>T{E>g5>5^t@bp3#lf5SEZE6#HzGASw2 z`7@o_%Fb!+cLcI6n3u}he)7O6YZXcZ{*v=yw_1Ogifl9FF0M@nYX{N2ifrqou7<`U z*A7f?U-=4viI^Kq#OnQ3wDSP}#sF$)Cmooc%Wuq$ch-GcjI{DqUxJVJiFBV-su6bA zuQ>mzy)k%+XZQDeR#upo8ItFE8S$h{TmQhy}`2qtU z1Hypqa>GpwRwnkg_zKi{7s!zV+l`xapD;i=Guy|li`_H9Q6?|ju)3UD0q`L=x6@n# zGIM%*_4W1^!1l==O@^8{llq;XO>3r1zkrzS-_N}4BjcsO$)~b9AMz?-yh{k#tlB@mLR6=|3$ZS{hXdfx(kDX7>zTgQ8{W)R6mhLV8&Ny!i`3 zM@0FI4#|2Gru-*-`S%y}$T$x32gxy()U+x`cQ_?j_`aT9Gk6X&9jb+C4 z!Bljdz3&Grb&$h2)4)e1vn&U|7jY=QXP9nTUs%G(19xB!x%91^ezb2}>zgMT1@8Fcy`Z4f%5#=8-*}xPwaY0;`1PWKe zjRpCqR=FLrKdn`2iPhb8>O;QU21`C{W;?<-lQ$nCg4+}@c(be)L`#nj{iOK(Y~WDD z;mA)nu}pTehK~zn-^iSGc{659n-+e$bD4eqNUo#QScYr=pW=LEO!WuBX_>I2x~BcQ z;Uk^)<5kgFL>^^pp7S`ai(wb%{TD7749!lDM>%&k#G*NUp8@dx?frIO2`~3Uw)j{! zZIlGuu*Q2I4m3^~Z*dSX4C1_mqrBK`(5}R1-GeP#$0{xGHx`7jLkNz4uiV{g=H4`p z8wh+2A9)^Imofc@+r>RMi=ypqr}3h_+T%|VuTrqQ?_dXarn#HAgMKQkFc5q<_BP?k z-(CO=oF{-X5B!x=%ZTi;k8@@xkSl*|-eO_@@6tU8f1uU%$#6|Mr|@S?Y(=S?DM@tU zjR0rvNP>QwG7i3-9!XIs?A@wnad}c$4VB4ceKW2NF?TaicCJT z3&4ABbEs-tes$$5;vLpyf08G;)b5j5g*Sagacc72mN=xgPVh)H{1u;ye{_VK?9*)i zx5<)h2eBt6Uzm=COYGinv{-~oe%E|9Cc3s{5UhfiyZ9dU3oUTpa{%oT;nWCnk+cWE zFc61-faCrHnQ4ux47l1pjX-2O!^!cMz`RFci4Vf|fDrN>GNn=G6y ze}tBw%YLo3LgY?OPfMBdc@s)Y0{!L2jl6d>6sW`(Ai&GEaq~yQW$pJJPaq7?1PAi) z>E1I}<9xFqj4!=t=XG`b82`D%uTTKwNPt8``!9*+(Kw*kEfY68Q*G=N`K@hGbUxs8 zcj`U1lzu=|&;cRY8)Qf!wj9e#!pT{7nr+u#Q=^wDLKQl*inlw8!!`a2VX+gR7FLqw z&I$OrG$_o8a3D_ljM9Ij&O$j~^Y}=?=Rn-yf-osTjBFTP$Qy?B*$tmwMPQ`=%o;9& zRyzA;gCZa!ipDCOw^HOf;aVBidArwDXp8y^EmHa?yB(?X5LeY_@<7{rGwoy0H$zSO zA&=IFO6UBi!IKJZ&a{g>@dR66Wr+V{xROO`&FW+D|Bg8QA@+I+ zn21G0WRqPwnEl<>C2`D%e~JxGUiw{itp7;!iyPxp+E{nK;DEu|DzfNqZHK1w*Msh+ z!G0^r+L-i3A6T+fX`Ij*Q>zV+%mZ&LiKU=)Y1^A6o8BLsg1zukBuj+hPrLV@4la1H z4hhGqkG{Vv2Bk?L8s8G+igeSKq0()|GhR3OZa}39E=PH5rdLV+gIbB6f)>%`41@&V zDA1CZ@)E8a#rX1%)|(a_w4PQTtN;Pf?+3`IjWC8>_JYiv4ALUnfCD0MBq6c&vvVUu zL%!4ByTaO@3rrymSL9{^cKnwF(lkJv37CN+Nhyn{mZ>8jt=SmflUBnfj|Jz8-Fio0 
zv&x_eHPK@FS%^uG@SHWfRAq!`O@UDC_T1v>RiCa`YjI0yU^O7RK2=wLCzAf@RNe`Q z*`*TbuQ5U2j>Bd@W3ir?@P&K;>oAL8=n0R6R7oVc}lfmvJgN7BtY;xZXEH^l^?b63CcN z#7RCm!LYQs9+O1>twQQ_Z&*IbIy-h*O^)r>a%J(Lu44xBTF{y~pPA=PkpGwGXcA|p z{T##G9^q=10_Uh07Iif^RW7sI`*aYvP}`1l$z8m& z2C^9cKY0;o0_;}l4o0{#b!)2G%x+@|$Bw?&`Z;|#g=O273zO5r3~#z$ zfSxOJx-5P(RKn!yt(YEPppV$|o2K0@GOek*$aQUm10R-Ac{clRvjkU|n%ins4kVfI z-qJs2XVQI6q-+DPy&e4$8f8}zDy`G1OnLedTja?~R(mvf7!&u7tCqTW{oMid{RdR~ ze<2@f{M#9=0;`ylG|_4u2csZ}X;$Jq$i;F&aME@x0wyb&7w#`qG6c+~j)c-e5%7n9 z50~`m)f~GQNy8ETdkf$Z&5;SaCjq&tbu%&fu5JSH7dG3G1L%@HWn~=HLCXtV5CkQA zndNy%cfqx7&{gDlhyDlO{O71@;L;KPIrnMXwtmU~LPos_Shj$t3tJrVl9c|3Tn&5R zp9Jsgkl5oIcc5Y_lAFD#eL~iN)@Bw8TCV)E^Lq~m1-QB1r!W%==W1i z1=@H1IA%K(JxD7>S7bJ02n6v?g8~|GW`mFVWjo1Hc$@LORVpdP%_7v}pE-hu9l1Ea zaR_69n=;oeP_=S2sXP{2ub)4nJ?Wyd1kwt)6E}7)6lxgR4J9Ki-wuCpDx30)TbN*nBU40+{k8t(im#dRCm@wYQp@9&`^wBwa(EqJg54|u?Ad7d2n@gvCWPYlBrid~MO-A1K2 z`SUbbexfC^_#6o%YMR)cU*Mznh}bSeJ{inh|Kf}i8U0oC2x|*w8qeo&uL-eQJp*IRyyK{ zOh-mrEXb{uakR87w3^|HHB!rYnk3o|{yUp)#mFRRgyKBI@*~GL16_hD5ApIZEWTK1 zu66BA#2^6h4Zj)!7}Or|(^Wu&)0skB=7S-f~Ip2m+D4?!CAhp|`^g zxE=Md>KCPxjnNwC#9?*Nv>jv%yE|D2{T_%(j!S>Bjg~`Z6zEEY}wC zm2Ov;7n3f9GqCsQ*q6W2g+ZTuWH_(H6KF6k75`XQXS!|R+oneFfZ;wnT?A_#=~EdO zD}+2ow~gR4x(2J0o1grM7OTBf30E4=5m8(JbH>w*mT9Byd0Rw-XX{~kVrf|#e$-Qs zJ|Vsf%=^3SOtkls2p;9otabC?KgwC;QG*MHM&H6Z4viM62*aR5d2@^5)qnkqCjukG z_nO22<<)S=6BGgZH*f0u1=|T>7}zeMC}R7(equ0roTnjbeO8zcb{M!d7FnxH9~`mr zsD5{X=b^+`>xIV*c;F<6ZHDi6e=}buq$-`1$9w97SvK1|;KGe{{p?-grlK>=|6}W` z$WL5A+`?vQpUr9(iZTLlRbq`QVj=?-b66hyjP8bOinkQzGg!+Y+z_niB? 
z{)6a8ocYdv_FjAKwb!G&@rT*&Ysl$SOqI=|?WC%w%A zahZbbkP^7QkBj0yFc~!rWykycnb_HyETs{Zl)RRNESP`W0m^BilH4WHp=*?d4#@ty zaTn$83|Y2o5@2`rR~}V}8-2iqoU^)(w8759;{xYy3v|*ceZFMPAo2K4v`?Ypjp+GR z3moZI8cUEX2SKsRV31Z^oDpvH@WBsmtSKLR{hxi{pB<@8=|vf|_*^eup6<;Xk7*v@ z#E-o{s#fE1rs_Gzg`61#M@i?$zooO?oUD-KF@i3LDQ)KAbYse1teQ-wmu;~`E26_t zhu0Ml~V*3;AejlSB~9yxL37oSZ6<|7mhBM!>j&W zBG`oKkWpQTK;x^ zYAyXL$SUEwt?FYi2rOpXmL&EUgj-xhhjqKOAoS*FN9AY!N0{9O-Eu(d({*2O20A-i z;Y`YT#sjnbD-$!8&vueO29Z}`Jy^&5OWOy|R}zxUNJ~E4!Zn9OLA_p!mDz;z5^VJX z*ik1Efst13-ScWj#W`^H=64jWnSP#6LQAN#;(~PX_ zc=*NP`)VSZi&yoV%U58b6U|RI<2)#~t0r7ZL-JJ&6E=e~0Cm;Z^>oqC0`YwP;?sc__sM@r! z)v9(AOp{Q|dP#&)i{L(ayzjMyuXmNzv}#2|QiDm#BWfb38b0PsXJnX&;FW_3qTa6O z2^wlSO{asGkKY^mFImFOYnRobFD=XNGTHMtHWq=8bBry%-J{w@@kgD*Zm}ChD;FjD z*Gc)`2j<`BrV+LPm-I8WZKg50Ff>}}J4~QM20yZhq&H?&$HtWKi@tpzer&ZF5OkpY zGAP!14C%XMFOO6{Yk5bMFv7kgk<2sLw^rV=f22fDUAu0`9s5Vrm`sO(+&^Y;*DB7D zN0~7tngGdfLOS`3*IG>N1P8?_3ZdG47^KFHI0kj~l;ST_-oL zDvtend1Qg4mGaR|s*+7(_p4SmSs6&Mhb)9F37mHSIC=0*_-9!VcZcq-u0Q|26GPS$ zLFp=hjcZeV`8z;hXISa)q52i}3)h(rAsd$=+1f&=Q!X@Q=xySdbii@2=J}k~U-5l{iGS@Kc^)BlhX3ZK9PCue9{-mwMX@!|#WcK#_Dr***OL zyDP%0m;z-12|OV|3yyr~=G~NQA6o(OlQxTV;T%}X9Xx%uNjj|;%HK`dbM6%{(vGhq z#$ImRbTMKzkl2LaJLxYHUv7@9x`x0WNa1q28W@^JI+@Xvz+FYDgJ3uFfkCl>SCUd8 z!w8=>n`NL#j`5mbryu;C+nwLv_xLZ>Z~6bom#ygj+C23h574SpVsH>b`2i)iz;?JV zfG5sT`Ph-4kHj+(t4+(CTb+y8HFLlPRRvQ~l8>Z33l>7&&KITU)0#Cz!n>>X{iO_vUpGK&w0AB$_Im~a3{T1{5v3Lz3DweGKgj+Za zZ+hHu2)Ck;yTUBOgj+&3JR=`(==|VKH)e>$|62(DBfPsQ{8^g++M*tOu*eC=a<~K)v*+$=dbO_8Y{^ zNQ#IjqtfmPTsSAtf(A>KJ0X_qk$piOi0a|04-VQie_om=>Irx1Eu3#1Iv*VJ;p#KZ z?M$!CJFh$kueReFE7)VXGJ+KCt`=kYE47R~>_-&*;=Hw)8Y^8}kJP_&IL5|91VC$f}Phez#Ni{4SkoAf|dp zx3h^>HMo?b``3V)L{3pS|Ej$utcxPl&nl6Rq`i_cU4_3x?MZlY?7;db>zx_`+%~sRpul9z`RW zAa3<`ElP?_kI#jlG$<#042;Bl!|)#{DI2~JA5lJ6T{MUvmS3MBSW4h}jaS=+)qaH6 zsL~o7x4!)0CkeVI+l!03_H)#@;w@2IX{alZLCKgAW$#KU8vuf2F7( zHD8gw>exou@r|)?s&2F%&Vg)#J;)dHPaVu6U$7>xaP#XDfA%^0 zJ;qf!pFPo;OoB#irRvJ`n;DPB*`=dgAp0Ei0 
z5%zzT7TSo`ER(M;9D$}5V^4XI)-_70?rnPXkr8i{;B}4MYHLJR0d!!{DN4c>OGmFw z_dIit7s2u3=@ViCdVZENg8j&IB5%V~jE=gGB9rji^8NB`RP%CKQ#F=CV$)5;pqq5P zM@QoVuOhb1$fcbJeu?kSCBbdeOU_kXz}nQX`r{qP*> zij&mYb+H6qs|9RC8{ss5uBOB=+uVT<`{6{a*f;|RanARHJs7@OF>LZ; z9V@(@8QH`~*C+~Q3;6GL0D9-o6%K#~2}-N->YgN6rv_Lg{Bc=?aA>ICo0(Efx_K18 zniQQ=lvXBfZSB3;@6LrH5)O_`a;BnpkuX(@Z%QFQ$4YNpeSB8!CDUC~4?@B@JiwbQ zfPMA$FrZ08JM0N7hY6?PU@lo|4)h!)ktPJmrVF0-&Nnt53&CiIdDcT6 z{FYB_gQ?U_oLN8b(;4h~Q9w_hU9#6JW-P4Zdk^`z@4|s065=z>CX5^G5jea7_6RP)fUJT5E_`BELj@KweQSY%g2_o6 zajFcu!aZ<{Ld{S$xI~k(meaNiY{N)05MYai|JOsxmb*&|#BLlltz7hVZ!^X$&5RCL zB(FlV_xoX%zwRHARgB$pU~g9(byw5V6gc&ma4#=%HVVlm2vYeqXtm5N003PZdnb%|AB7-(KYp1r_~8 z<9lU@&^D<`D7(g%8Ww!YPMIfnmjFmR?%+p<4A~T^Fe>W=h44}x^?%~Ra)YM8rQkzcFU@I)N9R zltzSg+-=3e_Yi=0Qgqa_E@F%l@aJ+Rt8dC*)NC~mB=OwZzh6e*hudI`B>m}JdBWI8#CibQ0`N$mdym8n$a1bp(e|;a>taTyL`qv z|M`or>aV@^SM*Q=Ro;ENv?NYqv{QbkwpF*KCq-psjO?gPrW0%-< z{DE*N;zj@JTCUPrv%_5QhhS271 z80%Sk7iIoL5id$^1c+L1Mbp>ETa%$G>gqJ$s*s%V@$pa|FjV=*X=?_0-EyGB7-_Bu zyN8D&*NToJk;crrnEX`Hw7>Rm3qw%j7xXZ|e8*(It5lHmnQ|GniZn9B@3FZvxM`TZ znJnswxM4u0^#3^2;>F-sBlK8SZ(55j5`kq=5|$H|tsLx`tHH7I>(h`6S*HBLEgTM? 
zuH<%*6AXTYJgf+kR@vz$f-_m=wB8~(LX0ils<(N(%JC}3GT#3KQs}<%$0Zf}^}jR3 zz5^o*F>eu*4@?qNp=kc8uO2JO&*k%U9q2Bs2O-Cm8@)S4el<%i2^YaLBM0M_E85l( z7H{X_IdjQ&j3NHuL7+WuJtLX%FPqcI1J7)?d%E^|3M{vwwJs9r&fSMNGyrD)_HVTK z?wL`ryC@+ZnD*GsNCBgOeCFgm4J8_JVt;eke+GyOFKVHo<8h+A)Ej4b8PU6RG%7Q_ z5hrfCZpJyL;(az~t{j3gE4Q?=cz5t;OSH3oyINQ2%CZt$d-?o5RAIiq?PQVCSvG9Wef>mq)y1sZuUcHCYQA`+QuOu{x^kH1cY$ot@5c4P&~ zfRx_PW*o*!83J9wqm5qnB7_X_Le{-UZBp7aD94_mQ}#R}f^duMJM{RY-B-Jt2pb1h ziK64Z6ht-3p=K~pMYo%kd3#JOaHY&o>)kq!o5?9y%5%hu%E^vCu78NUc1)}vdC5vk zxEVk>%=`X2xR65n(uQD4wSqV+_X)2yHTd-7JKfEBlo;-QPbc4}_irtLVR{Cce@1Mj zOR#guS+`BKQlrvs$wJz9&6mwQv79+v&=-CYhYIm+dPbVMf9AC$(7o1s4`w2VJ@OpR zHYl?-a6kSFRZD_1q-`1qZ(5`Q#o?doU;~2>q5Aa@x;au=?0_p(PPLSLCB4+0LA2_=o_}|is52bW5UFH#F2me$Favy!NC|$*)y6eWemj6T|e`fwR zI@J0yo|Ua@vHog%H5SLQ{FO+PefeLhG(?aa4xK>`Z+S!b*T3X&H6bg6isFSkkPvc0hGq~;*ZLrEfhM5cQ(`(?O%+$`yLEQ~T?clw`fJcyWTxgYN{aq63++mG3Vxce6}Z(ZPEGvq$}7WuVlz1ncHwFme#562fDacj%1s zn-~`v0-moi+~>4Zj<&*-)?;^zP^%dg&9Sn=_B%4c;O@I-eqbpbYl>C%GodxkO5C)s^8{2Djctsh%Ux-knqY+vS~_=$>XoA*4Ck_Ir7GYdMkBu z8c=C==qHIq(iyf5iKITBSt%weEUP&tjFIovw%^-!L-4j>){5AEl7trc*-Q#pBn(2ioDV5u)JC3pMxG zX4YcH%x4X*-{NYr0Mk&E5fO400dESi>r8I&hd1%fFJR&TphQrAK8Knc3RN~^J;?j3>1;TL>vj5FQvJVh@85fyilz+>~w z3f_m4*H4qwC-<3pd>^#UM+|NYiA>7hYrl*Vo zRU}Z+6o^rA$t-xRA~UqBf46YDrd>6(c?NO(H@A*W?yt?=_`>Z&Im6B&ypp}WQDwD% zC(uXU@jN`0j>%@sU5#guft;S7iP$;FV#T1YTu@q;H5rOzED-pbUhK%;N{jb#(FfY! 
zHvOeD5(=!G2+-rny0qKhKWVRjRz>`q=!A`@jlyzOyU+i*8B>>d*%ieRW|>2amPnIP zMQjQ;U5s3SIv$qnYIM2@)<@t_eOw4GBAoLO9_~qI5f3~3OA9Se$;`Z& zyTivW0nSFJ+L2Z36Di=bnMq_L$JSuiBTy+dA|aVStZjM$7mwRTKl0-@!;N<+Sf(sF z41YiJg>$>=j%Ux)2Zp3*a;E)Gl*TFx7$ria7B;|-st+=yI3Cv8iYK~xRbh{DkK!E8 zW}91pR^O=B=Iq+U%Y!Y;G6*KbAH+Pc%rZaR;9nD-mk?^@$Yz(A7E*HaR&rKv{(hfs zJz*9Bh+;08={0``a|>dPy_ouabLM4@Ugp21UN=fXcKRrPd4uh$zI{incR7<`!$oH- zS<$vGbCODcVqwlGL|+pVFWU`X=f773DO5beb*zohd6FB{2#Jpokr{>dK$FX$gg`xh+=R{GpM*^`Kc~uVR zeO5E_H!%t}38k=9ja+2GQ!M=NiS^w!y*56v<9Z`zBW%8 z;d=q4B5NA$n)eCyn;Bgk&=AY&M7M~sRs^c1ykvQ(ho#~i85NaH6;6Jq>)3_kjQFiM5Bs zBjZQW9pamk%6Lv-RhIyGP?YyQMq`KqK}oqz0NJ<0pua3AEc^@L>(#wcIJyFnKg+vd zk1~#lfo-1&cw$kGQ3yBeoQ0`kJm0(jfeiE4lo08Gq~Nj_(^|(#Y8y_`H?e0d+&h=M z@`V)MG4A|w9tFWTdE=ubW=V{__zyGLaCKT~tZ?Dx;tc5m>`*qOPD)3<)XTBI7BXHs zjY>JOpQ9H(qe|-qRaEUezC0IHjJ_3ieNHinD-t{p@MYeQP7P2VviG2**8wx9Iu!Pm zDn&N>KMT&!-LLpHI!^IsWziL&D8^hnf)F=SNL^5@e)qK9Gxr)LQrv?08}$TCQR4sN zAF@-R>2KR64T#gaTrecyt9>hvJqrjji%peaV?hDMjWEtG1}5~$fQm*(HSMR`EN5eQ z#AIz|8&N89x*MO8GH8N{b(w!W*flTEh!&nP;~~l}#_;TK4`&K!5|pSh<=IRobx=)m zq_AgcYUKSYaeL$xe3P`6RsO+NAjWP<=h)uELIpcjmz;|-iSo}G&7v;=mHQt@3{N#) zj{RvOV;Kgqq1J$?>Vz&_6F8&bLGhk{f4geNp<8w)3Xy1HW}f=vx!0uLupBM!qi5Cp zX2$om{!c_?l`JM5O3RrqCf$9M1?q0ny(b?Vzz{kQC>Tw2ezsSEJYpIR$y9YN3lSP~ zN*5rOpP>2p)0W4i`ReTN%(q+D=RON!^r*FOq$6R6-bCDqR+8pm`|Vz*9Y5a7nim|L z1F84@7>J;D1@X-;{BdnXJt+TbSCp~dmIS$LnRVEY6j!Q#s*+pOWgX;5ra>!Wt0Xz} z#Cm1iIkW0wLhnoDxI=?)1?WsK_Xjd0yxe3s~S36lg zN{|;eIPM^5D@eu7Yoz3|pzh$K8%}wXSnQ-z`~AKT8U7S+JR69U{@*Ob2ratW_k|KJ z>?hN&IKFEIs2zKxEO78dhJTiGRSwd8WM<4SnL*Rch_yYYGf`oKP&8}aFxH#S_sg?s z)05)XCnf#Nj;G2N#oD6-|71P5y5A2?j1OqD6MhN(Syrjhsr-9==f>)6npgeFH&ol} zD*|i5z0p@I- zUWsp4h&c4hz1CDkW;^9wy}wiTEH2$Q#@4zJi2V#Wh}ve(0EN5L#gFPH%pv;|iQePH zjp8RjblO1z#HVqYBr<33)=wJ$Rx1*x*H=dALeQ=?=7Lf1t=Y+pIC)iFC;T(W5=9xFm|Rj8mq% z67)u4-P>#E#7!{(-gGc$68R8h1TZfG81@QOBLl5PI}km*il%oBzM8taUP(H;ehau~ zK#=jp55Atb|Lj&HQBfQ7s2S{>bM-VO>V^-D`-hy#JmBunV0A0>|NeSpUxQC-gz?8mlh zHi7;mlVRw72T_p~@7RtWs=5h# 
za!3jrmCC9QChDvqCHYcKQnY8#8wT5!%HrZpKOZ3y6v9-&_YU2C0$|)QZg0-Xhw66Q zX_)*DIC;KsnZNwAE+uUlDOp7GoVWW1$fER4I*7k5*6+sXpu)SS$>&espW0G=4Frg7 z64v@fAE&}65)_Vkl!g3iE_m;+%=K#n>7OzblP~~YpLb7lmFQ2O*N`VJi+OZ};0%EE zlFKh$_Q|%Zq+}NBAf(EjqNtY1ebF-=Z&nTb+q%7|loEJ(n9Rp|ZVM#z+X-DBtDZ{> z4Ht4xkPt=z2M>A|}&Imr{5nuFuS#^Il(A;Ice2E9bWC98Zj_1Jl|MkL`7#RQiVae0R zJ-}{k$ZzObyh{InSl^fEnQs?GWDmJz8^b!&pG;$RKHo8H|JWB$jDqKyAT}gVHG>6B zqQql^$UbnlA*nH+WDGfim`cexq)9g{9s$!$Ew8&M@Ok#X;h0C@w z*nBE)nxsD*-&!jFyj{5ZVX%jH-#n&30Q@87%svwqNw9}9x&*hl2KxTZBMrCFGbpX; z%n7(Jn}^-WNG=>sXm=#f^VYL@~-#oA1qqkSzqJ|p!BvE<<(Ew~0vY;7x z020DW=DTvT^8o8OHKlBWh@LkZT6Qi-4ZY#G72Sz)mVi5SydYNzmj1lMl z-f~gHPKRrdzzs8)m(|-uRc@8yV~R^U0ec`Z7V-mB+buWRxx`y3$Z71W3eE(~Dx>oA zr)4vCC>l9ne zWE)LZ{=>Sre6F+fBl<^{N%d~Tnc&RLDRpk9!5pB{uF&-#3JU> z)=8L0$-{tWk$FXyY$D(AKgCL_8jK|r%$}ZRC^b<4A% zpC&C#?7!A;D|pqWGAT8=*=bXs>3Ro4RtlJcK-2H^M<)^!q>PJT$OjuNMu6)oj^N^V zz$*X@e1?Hb@IweSdDQ%UNDBm##kWQPuZ!Iy$xyT5(3KK;OxW{(BL_my`x`28{#Wk} zBk`|PrnPWHnW%A1D>Ktbr&B(vh_R-CdIcksu(Mquj*-9mA3tEyp3eJhu=^gBbEq}cQDhoShbvozxp8`|=E2xw`VFN1S9 z*~~lSrE&9*36sQk;~ccywmQ5~?*$I!4D@yCO&FzLLtEgv+zka6Hy@X@n@JBapzlnZ_5MB zBh>Tajg4UhPL%6E*hisR;}zu|(aUF$kq_6|@!V*@0jr(TTOPd|6R_wV^QtqocTY~) zI0bg7G>m*^G#j$Fg8o@lK)^YR7J5<&w~qXL;GN}*i^?OH9pSeuI)C3Uw9k>ZJ9Fjo zMg+zzXG(|%4fe3J&HRa(g($3G=~N=MKCCnHDW`7X0uz5P|5Hs!5)V4rk6xmH38b06 zw1CP-`tk`BBK7X)m75_E!>ryz!}h*^6=*r}B%{rxEMuRX{(5#6uTtJuXo)hQ$D6P) zaUdy%@ywAr1lNik=R)Sf*g2qX`0S%~x*p~hC&kz0|Kej4yfQofbhnBmT$0~sk9 ziY~f1v-$3!-F)-)90E!j2-vv)qjH1bY0&SlM3n!2Li&qckDRkig3RTmw#Y;{v}FhB zrnCM%_u(6O*FdjER**Y0s_>UOYz#ZZWwog~Bc=9>FIadf_Qg*st!rN>SBBrPNUu6Z zqX!=P$jhvq(G+*&$c&VTp{%*3#q;W7DY&>P1;y@LBwX1ZdIwZ~Pn*9yr0b15~aYLOooWe{Sy-n{2 zmsANdyBj>4Wk=d#Q`|8p6i$W}YXlulUZX{B*JTed!fH=Pg$>fSzm8X;(ZkX;= z7$xm-+J*Qrtny7CRO?wHk?uWNAUR3j^P2_Gr0A9-yUPg<8D86eKkhP1BG4^GTQNu~ zK5%-i9I`iRSuy>(hT29yaqHKQsQ9r^bzw2mrbrM%75DwVam0R))w(zhBc?Xf-SUIKGzVU?=n}Y1x?Wgn6%0^xucZLQPEGLCZlCdWqnJPvmT10x* z_XV81Pjknknp*}z{1DUPweXkp#Uc%soJ;oE5T6&ZKDS#5NyH~s^baA<_=FB9Sgh+K 
zUzrWS+bo!URfFRYzOt=)2E5uGN%`n+Z!{1h(a9Q#{OWBq?0e`1b(=eg)8ek1ROgj5 z2MXsB3+F)=S{UJ$YAdrv7|1yq6exKn?`#cdMiHQm9*vUy1}0PgA%FQNY)H}mck5&N zrM%tR#azt`#tp3@besGunpAGCG>yRBRYmsb)nggqG z!~<_be*gMui^oi+%6G{3)$0|nsWE{Xo^`fZ z20QifWFFXk|ClM79qG$QvV$1DCTQpmXp*>1ijw}Nd~bg?0w>tRb7*z;h@MhWkm9QZ z0={9%X=z{?&*Z}`-xJ$1iq-WAUF8-QexuV)lS2GSklXH&Wiz|f7v{4Qc@5GIa-^kq z=M-xe`@XZwk`=F_S{?DDCcGwZqVG|Fr2g{^`=5GsH_RZ^$knKDI1di_&wTQac=j%6 z*y&+d$GvNuW+YhETI`%__s)Dfl+xwOUKSH6V5%?43RKSq-PdqQS2oy2bQ(6V8aKm> zHd>~gxR>?XDcHh_HTn{9r194_m-+)}%Cku56?_tOp;2ob9B}KaF_`e9Hsk)}5FeLQ zzJWrvFqJp?J-pnDAD}H0TtuU}$vk+@-yiy?c=HdmOp8C_-i;}-vgisF7+j2eQ-AKw zQ!&vOb#5a+Y3xY|y!;t)_Mm+ze02RE-taM9sq|QR;i=oLWU(#kiP}@>zyDQWW-nPT zba6M$I68GCqz95n)*bwd=R_du=*VOD#Gs)@FE?UC<_g1Bv6MEoiF5?9I%?lsMgdW& z2DS%SJ)UYZT@zK?Ep3KW#%OuS@wnowsm;`0KN+ab93mCJ2l)w+;YbA*-=L?zll{!;Mezp|P2VdC8i;+59XbWsqcHGU4e}Q%(X&wpd)JP3)C7AO#*Cpa7jsY7A3yP$EFJ{TK@IIb%Da z4EFf=9>#0_KN6%2dTM}7bDnYL0v4tqk){RhnNns_DU`0slw)`=40j_Tpl$aB+afF5 zC|sSM9?m8gj~@E#;O)9Z#zLjIT0WXK7E%R0lzvge$dNSkGyWO^-pT7F8OOe6hiZB6 znDExH{z%!8b>*Y}4N=!*ymQfk1qH9Lp3$fGYl-q)Yiojw=qB$ER5fFOV+;OO}EG4CQaQ=~m<(F~WPDK&|}xy2y5_+yyv$O1Oh-N65eu z_4ksd9k(db4K=0B662k%HR|_&oCfylx67R&3qq;UfhEOLc<>vr@-lVWQD}AvWFYyF zeTl9c$-dKT2Ox9Xe`sbHSq)rnSR$)JExUIufDNSnPxV0BGfcV_?WS;nOaDb$yE3DO z6CZF+4h(7UL)2p(SzlE1%re>hOYe<*=9w=#U`TqVFpO*#W(d!+(;QwZa_tFKyT=y5+zy;ck+D^JAz^_S>efsaK zjmkaEsxKh>C-a^7mm$Ht1@IDVM&SCbFZPAwkH{cDQChdU&bbDv&wLwl!KGpxz*2rn z3|wKSjIy9kBEKgxS%q-!ra&=b83Rxw4_MF@6COl~(38%J(mIYM4v5@??b|!@Q6=Vl z<464-mI*}BTFZj+W<6iI3kNy++Z&<3uzhu(KjGJ-<(*5xzGbmXkrm@gP|adMRknDZgWCEt^J??zQ(MlsXdjW%5b)YH5iXB z0}Zd1HGsQWilHlY2DNgqA)a)%$mH(w7GA9BaC2+!CvwufZyeZM(*i>}4OKDpg2VIo=@LR3$tZSwZgQyIsTg~I#4<12S)`0Ap{ zz+^U?!1vEio`%C8*RLrD*%}oomjTAdcLmR*1J@>o8Wqe{Q7r7fwIlywH$~`YYFiYb z?%{&B$Do*77JK7pV+(>_gWh5%FTDJ@!VLlZrVrUTbu!jx$L0m7(H9W6c*>QDi3u7o z_+~+CO@k~rGQc2pjt*adF)wwz%-~2&GY+`gkp%`fR)RiiC?5?D#;^e%sy-R#rMjQ% z_vk_?-||7y^(}5(fDUk&JP2Z$Z1b^amXH!C2*n1fr7$75rByT{;OewG*M_DRyYlV& 
z9{z5(DnEvSZiT%)G>H9aYp&y&J*18W8qkpEi=5iCj-+A?x*~@7$g!1O@$qD5X}ET@Pr~Fs%Yj#mi7E*0mX>Rm!ut}R{J?|+@%IL6(Q9tE)d_D)GQDnZk=F=}74_t_iZAb2t;EdW+xP%O3-aM3@* zlV><@{hL|S>(>a!HGxIO$Q%ZHMRf$cP?Ft?E4-E>i~ z$SEh}j?R04Nk^kqha^>EBeBEO13JW3@G%Dl&?aZ5rd|ZA?Jw8wMvz(5<6p+pcMD=c?=>iUflDi5-MwA1!7eAiUeT!q zsH9w~N?D{I9M%Bnh3@M#TkoPsDdFb$z#{U!{O7ll30fDE>;$<0x$FMOiC;pdvzw}o zJO`i!8}c1FOzIiEcbSz!%gQ^!Cu=&sR$+%>N-Zi3=bj`35v@n@oXhWlR=Tgxl6 zbBlSoe-AJ|M7wz+`X%*1$C0ht=Cz{!OJQ5l2*o|G1kO$>ul2{9-KOEnS!a*v=Jul* zX1@za3sfW09l3Za{ijIjHIDuy!>WHE8~cMDdtl7}z;k9o6tA{a9 z_M+yweU&49jV@yrtk$JNm_$n>$VAE1@-E=hhdTT=6LYn0fe%ctm->4+2pV#;&Qa5j z#i&mhZl7pr)}Y)k9}G^VZ?b>=ZFsO+&Pu(?jF0EMrpFU!!TIld~1K5^J6o#b9; zX)WAV*iJk5*SmDLTX>GKKI-x%Kfos)>7@7WV9LGzy6(!mZMRCip5w$jj*3td+$0vr2vcZX8*VV7AZWSbM^+~#MUE|ZT{kyS{q`P=&+ zt5hj~zdkY)U|bZ4?d8f0@G{^jGw0ayad!@mel5SYhwj&-t#n%$j$2>VA(rNKU9bR- zR$ovo$E82GBDgDay%>EQ;ojp^N4x*@`nkHr@|XCr7U7zf_*2&1yk*g0G(Tmco5SG> z9WY-Fgtd277EnP6v^7O0rjpIudVUNI-Dnn$FRp9XU0k0(>tdmt@dA{ndyo10K#QK+ zg#tLdABEH2*wfwIEm}FthA(cl*7`o&zd@xI%jD#lj#wLLG*_D@OkMZV zJ&JiL>2yt*Mdq2m+Sh)L9`JqpN05hsU*(UyFXvcu@_*Ra;z^6JBoHzen?X+GQldmk z>?cCoFLZkonaXj_7_1t_Jhj>xodxO$7i~B-O8p$bddowTy29nli?E7f*t146i2J@v zkv3@t#q&GatR5ME`W^dC5CrjounSWF@zXn0-e>gMCyuR%>1@m;bKt%$ zd<0V^YB295RLo`l`@8G(1!Wk%l~@bYl|Q2;2O)KweZEHAH%v1ftcN-o>w=wXN}rTE zLn_q++FlxcU(OJf#!*zST)Vti(D|JF^OMSg(q6feuRi87FMM0irz)3|>c<0RVkpf1 zCIx5kJwOV6xv#LCd(F+*PM`M3zc9M!vy2KcUdu0}*2kiOSsQe5v8Acsa?$7bx{G!@ zk6&k6U<55v=6`*=KDZ_yMlFPLHz6=1842~dTL`uv@Qp632H^ z(Y*J*A>^}3%gz4v<*6?s!lU0DaF}4#EaJRwvQ&>o3~bbjzfv39EE-JQ@{UrKIwDru zJaDn=cg`(;@{Kr5+!fNgUKbm~R~rg@n{K6#YZG~(<{!H4l?;;hGVuGnneOb#KWTGH z7vBmqP%#HLjwAzUFbmeSyXD#|_oiSxZ~6XU=@?JI_Ef~9JLJve*Y~x+5DQ>$CdeNa z{GZnKzgA#Paa#Ss-yT;y$6dBlkP=1>1+Q%epaa8|B1vRi3m6Q}szCL%>8qEG9>=i# zegI^QvH-zMHL)k(S}AP`sO<0@ONL)=hz=>E$V?_2Ds)dLVbUdZj{d46vq%EU6%yv> zofcSPSI3MR6-#K*RFZd6;855hcknvUiZ^>5(Zgf#{9*o42HZ1GlDitVFk=Zx6tLZ^ z+uPHtgqUaw7W#6Jqulvb6{~mOmcV^5CG*!(-cq5q)#c5FL60q|x##w%(VzVeKzJcJ 
zur?#zii-R`vEf>YsSSE<{*1+2N$H-c$!^n&P=0I5zfD0nIJj7_TuT4N>MH$JA$;-5 zYKk)7YiUMqtrv5O{*=F)CJdyc}2MhL)Tqh#`KTODf;)WvsRMb7fA$dD@jwCu=a<6~Uaz8;L z8$-&_RCoS1I_r5#eLgYg>g+=kHo_fwBF#xJEkS7%1*6ew`_R^jvKProBpTLKeOP42 z=QhMiUwxk@^ZV;8XJfne3!B>QuAMeD)}BSw46g(=9Cb-knXxbNz)DL36CI*RzZu7C zXu@3Dp@<0q`j;IaQz;>zo652^qARQ$jc%U>fg$HTPs;9LfE&6U)YC}u0J~Plv7uc7 zxRDAybf+6uFr*uDg2ZkY$wMD?AsZ`z3kEPDs=$MXcS^x~0az8dTh%^`h+cM_@rqjc z4fy>&B1HB{P_3#+vjhMPMae6R+E;$*OiQKO5_5zL&^9h z@XPXsp_`ZiQeqX|!VY*Uq^VFEzNS$L_MnWRg-R(w&m9J;Px!i{!F5jrkQ+l17P^)3 z%=HZG09oyB(P-xye~n{e$k{!J;hk)(2`swp-SdlnGgAmIKP(QnTJ295aH!ub2xN8= zi$>mqoGJd87KTWK0zUZ)h-Y|cgS_fYr89wV1B;3mE4e`A7qVIHR6jJ52Z!sdG3>1? zQFqFlDFcTl7ZV9!<(1&R(2oQCDZV)~xZ(mXXcJM#X#6RE3gtqpP7(KFefmX<6ZC>dHS1NNoKo#r? zsE>BG>P)AN)P^a#8!HmBu0k6){Xga_z6~-`i1|bdl##M=vAra`@xhkWvFuJ|moa~? z(iha5)ZJ0?Ex*aN>XIhOuxtjqn?cO+p8P#vj5BhM(9XM6v;ZG~v8)^*4ITr`U}JHrs#!{ADBM#Nq0|-Z$^dllkAZ*99d; z2q28g$jZuE7Y4=zq>%Kunqg9di%0}K_q%D*`hnjAs1xm-02A0$x<3(7_oKS=x zc2}2O4i69CnyjKf55NRgiHnI=-W8^sG@vs~1v)RT$>h9fvDd5M6}sC-!h*E(~pc@V8X&;a=D&oD$5Q)35_T-W>3r zYH|_WQyaeSF8!=&Pjo04vM)C285FB5F)UsReV?g5kjQmOFi>4^CJJao@>31>hjc4H zBKhjw9u|^WXkXoW4K2CdeCgtWRFYa;ZT6!;&iZnB|E4i}&INfAo`P8nl6YBczH|NM zAka9rw5(@(O3_4gfhXu2VZ_6uF}*FEHq$Rwab z5;k)*t+NPeE%;&byFYC^I7`pTF?@YFj#nLOLN};QR2#{m!hYUT*37DJ>_ts;sL+*h z_BG?^13Zr`mwxH|t%M$>G?`0|F_D>`d|O&FWK_MxtM4HvtdZ?+mvw|&O5~5n&{iyr z*CA)F9Yxcal~9u(=Ob8GuaW+7^9Y`hgVbGF;pj|cF>3QzQ<}t?*aU0juk{ql=w(rU zwoGh&FXFF1kP>;Qsum9kLhjr`2*M#g(AUDv8yy|hWvaj%>x?NIQ&Nyr_`mc5{~})k z;j}<}DV?MT<0MlktLCjJ{ey)m@_0UPLzLu@kv&(TB8(EiSuV1$;~}x8jM&gMGyV1*uVH-vl_3X2)462XcflnlqI(!d%0!vYs*_{S-mL^ zkq!&~M35%~Dcs)1h#&JZlji@I;MH3}x@g(bT*Rw(+~R&yIp*zPl#Ji|v4JuJ|2lSp zYO4bw{FOd8TOT!i7M~VV!xOJ{dyNoO@i&=Iw%4zen8VLdQy6-@tRniXmr3RP07#M? 
z4E&$dJqI?RcTb9VG{=5EQV{?9U!#68O>z=yC$lj;h@%ALD?LRTTRuW^%4Hs0y`Wg_ zE13V);A`PljrEVz`78j0^3CV&#&&u&EHNf4gCac51y>mjWivW<11_^*ZJteD4B|+a zf3z2IWfz9~9^zLrKzJ*DBh~OA3~GQC zSpsNkNgx3qB(L!y5=PWtDk~l5L1~#cA`X2s;{dpoz}J)(JLtg0Tg$owbQFL05G784 z#q@7*{d+ObPa%JIqz-G+{D2IKBJuBRL(+W9l4mAl;EpNU1Ja2eu`;Dy+$jyEEDb9f zkSKH~e>uE#kXM%}&q)5ET{JFS7Ckmjb6z?TJ3z z3-9HNuh?DBM*MY)1VhVve!g0R=SlL9cjtAgf-P9?gS8WN<{(2hMZ^3Ddn?(&anaSW@77Jk@E{(u4AMd_=qp;2exFj0^TNYBQE=d37ehRWY6a znQ=}N$h|njL1G;Kj_9p&znp{78N5=g4fpzAWW7~b6@Kh6_ zhQ1(}3-wpTp@5`eF@1Vls+2IkGc0hAW;Y;kO_FQVngG|3+kw`hnLUpgAyWw*U)O?LO!<`8nA z)vaz7tx;_)q)-+kH^Pr0fc-{c5XzAF3abQ}+?dNZ_isN6BSNh23!cvl!|wA@sY{oq zE!c*;p@02O)05NBXZ`FL*R_ zR$)O**rY&mP661=E*`QjEdZ_zPG)_N$P;;{1-W9N+qA*O{WlX>(zf;~C@84b^tt2{ zfHJ;5-PljiwpmF}{yJb!^l*uY%V$!C@HB~{4L2MbHx(y={^dd_JG&XM{V(n)P$@RX zDmUcIroEbTAljOP9ODHEnXE=pU$s4Fq0d@u0em$U;H!6%mvzYIVMTij1V{LXI){~` zl$?XYy&6K5Ju1KW60nZG1p{Y8g029K+f?%}oM%h&bl%Wp7|4G;;}e<#mFj~dBTG;N zDSqQNv)HmZL?WPqrn6qy<6s0vSAiM5mW_zGXXUbW4SS8=brfJf_3r5aV{PE%89rQS zyl2z5Xj%#ZZ^4{>+Q_y8A0aU51_8}M9A|IUjBE*EV(Yha7%@Dl?sL+J z2tARwz!S7*pJ4JmW9A}H<=8DN=yx~;Z0h1^-r+ABVue>{9A8V{V7|`l5WIHx43TVE zh6XBIXE0W#Zq#?g!(E*VXz>Yg`?m@kjJ?3kRO%lq??yUj6Mzcwb>tBe+9t&<(myC& zIgQ*YibN-3Q*bWZh6fsaeoid6HWU%|tAsMQxL3n`?agv=Xkfr9jN*}Pyc^Dz;->qr z{;JPSd6TX5*Kw^3Q0``>yWFBfcI5LS=x28D88&bL%_|wEWgzFk8w{ZYO&k6dB5k0l zh8ChJhFG7fd^1)Vnr;#!rA9_7iim_pBCvh5*sQG5(VCm|5EjStgxa|f-C+_ zB3tzWj<|30x8fPTh#tpNZc=dL#IrXloQBa`(332Ob?WHZeUum9#5Q@aBW(~L2(2D= zq_8DNg6HHZh&Y#q^Q;(S%Xl9kj6rK9F5E{i$nJWlBJfS(yU`%J|l zvD{V!vXFU8H7rx}TgY+12--VD$7(G-YC4=sr!;A6G#aA$gKQo}A^)f0*X7c9 zZea8NGH;Mgmg)OmfhFYkZlb*_V1H#Dm=d^2*La2>pZTFwg0@q1;9Ev;_DlM=xZPp5ExxFRy&I4C*Vl4y;YFT>oo1->EL6CO=x&Q z3QurETfb{kIbtG2tDZkTFbtrc5Y~%!+DOm~i0hV7Fc9-vr^lWP;JH7dbLUu}4bt+;cUci<;}z2Gzd-jYvF2F=k06l4Na^^(T@DpfH(((pO#*WHp`Xyd&KXa;M)0 zy#T1Tn1Wrn$d>&-6Z~12b-1}%J*qV|6I%F-h|jT4l8@g=KQJ{0@cF#|)fjjBrdQ+3 zmRPMVyr@LGqTFKfQb(`!H8#?r>K2z}kE&Y{5zRsb$cUvzM{zx*$=s87(V2?YxMu_l z9Ok}$*8QZR@;=SU?~wtmZvGFgZomQT51!Q~TEK+fEMBxb3z%k)pXO95ydS2VxmmkQ 
z`&x#-`~f?0|MAlLf={|@1ia`r)&F9l@RzVUg;K8!f_L40RrrUIh}W>uuMj5e!)^C~Os(kC(qB(}93*erVlND5P(8ic0*b_#BnLjmjPrtAU=&mue+X-Kd5@td(Nwk!2;Bs?OOr(Jy_%E?rF+yjPi!<%v+k+A}(3jDZ`r!T#5qbI1E8G}+BQ z2Y*E3rmxOo`8j&zBe65Re{qusU8J*1-Ri!ONt-KFWhm2XN+|Led8xIo;h=wW zlj=KcEw_Dsr*q-Ah1O)a)jt1#9)r%&Uc27{BAsl3-J9Ov=^j!yB6~t8kf)D33{H^8 zvtBNar_XqIv@LzyJVgAo{1Z%Q_k1<~ zZ)F%CT*m1I1L=sL&d?I6Do{75hT2`z#!xOqImSghvA-YQm8RZPQLN$Yx}*(s#j_>O z;g#1T96o4mlUAIsLxn@LRZo& z|n=0H%ad?6EuovwuY_HmyvIQ+?2nmHKaP(zbT4UWcs80UaX!jO zF7FbBu>Nws6e61*Lb4A<(f)j=(}eU@f5AEL8b3oMDPe;Y(TZ(l7$j$Y$p^}KWJSLL zhI1Bmm0r{Nl{crf?`2h5*98uP%K9^==}&-r_4~E^Ld9$JmeM~<{W2a+_9qztn&>GU zuNVWO9?vl_+zau)UI4h`Uy4=j5~L(y)dLkBDgkJp*LW;pmS-Tx??ij!!IH%UwJpo& z+iKmFz)9c|ngz|IT+qQ9#=T2?C^x9a&=Cr;u&XBS`V!Ix!O#SA5r}}kT6_Uvre$q4 ziFqy8LZsV;^RL`4{W~qBd@XnB<>ia-FYzP)w~EZ4nzAj(14dY*aB746WrwtaUDru2 z@v9ZwOq=I&sYl;Fu7r|zj9HKvyu5fI6LOUVr*lo z(HgY5VYYDBj^>O0Oi;@Dir&jat0sFrsYjecu_F6qM>A!^D(PU6lh0Jv(hd(bF$jiI zf&x{WV1{`z7-^9L@H5eaX2u0lIJ=y#wxd-K6r-T`P&2`~)+(O$LRK^c9U_F`!(h6< z*K(Bk)%(YUMY@4jLebd#N!%ape-ZV`K9OiI9ayj1EYD#%Sz-<%_V!DiOW$tCl%iZE zJR7V$EA}S-!7~#}pO(7xfQH)I5A-pa!)KQ=V@hR}iV8IA>y}{oSe#4aP?ttTOSa>w zBjn?RU0j%V3lc=*4Tu1qf(YP}_C&W4Le(6^&sRveMZS`8&lhO@NRSKyZ{KgB*R_(m zHO1b~)T%njMoi;kWg-LQbGxxGyXgs)TuYw1$5`XPS?Nz4Cg#88*dW2#2xM}#&5g_G zwPnw@KZmBR@f<DvQ?yD0}?9k_g9?qktvk)~~_& zr0MkPM2QvSxhM~gI<IwJpG*A^;RMy7XH%vao&i5 zJMYFb1QXohZOcy(9bOw)kC}FE%xCP=)?k)tv{J(>tib}3-7P-3@{NYWim-|Y6AZWl z3{kiFel#P<6YThU6Y4;!%2iW{Y&ip9Bomz9RD1+9 zTzB*K?a{uO4sXpM_5a27v0=jYQq6JY1KXDl=z|je+VBG*cD-k4l`PCDIY1Q{pbN=L z)BTdvi~0HGO41go1cflDe4<8mSQ3v338bJ znE_&1dYLd1j?zza>KE2X#ARHjZhH+@Y8=>NjWVBQ;2+TI@>B@dYoxy=VI~2)U3O+b zwdyk<934gy^B3+lFq?*Zlko%CKs$0)T2{sIKgKeqy(q<_IY=A-*S;y7xIZezH+v0- zKj>}1U*#J)V^W`SBr@ysUiLYcDMWybMXqi01qp)bsJ^I5&tkkP4#*O`g*PS(^X{$LX0D+Bn`gLx>>LX_lAB|x}q%i$q;kaz-ttjCz78CPC-LD(mkuzkitZ_@@ z7!2@vqZy|0e|%dW>36IAn3ZJr&UX+QX05LJPXDDLPvwbdvs-L#TmKcd3XG;vjRuUt zi--KJjxc;H*|i~>x=t++R^pQvK-j4WKS>D6=W78wT-eC9U!kuUOssDa1edkH$@qt& 
zZ+vcZ0wQ(C^(R+Htdb!Bk(@snUEuk_<)ct!I6RJI4*EEJGWl?>KJPT{e282VAb>1B zhoG~k|6+iPM2=A*rzQ$Tw1eaEzgB~|DI*b0ogmbK*H=Xsh8PzPUZkz~PVfaJk^yvP zO7HC1hy7nxW*f)fMoJJ~qagjSEZ=aSXxb=BtaQq!*3TU^UQ|QVp_)i6nB5zT34AMb zh>g*;z3-3cyQyv|)vn^MaYKOh*DBi}$_k3gzjdB+l*1x~y?uh_VH7l%wmcL^D>EGM z8L{MpJG+?zpWV!5ylf*Kra7Oyy$}l(h2sMqoDW-!45G_8VNk$087VNeeH+4qY5?y` zcPf8-)(S2Y@IKNo{(b7AAwzWOH)QY509zM@1}a-x|DmtERhh)|ZCt%V2!ybb3;^&7S!I}x@Q1=fD zOy8&nxHm|u!M~>j|NfXTekAS>)wNy{paJ3hd;x_)oEPVd0M9H=7VNjzPzLIza{V^J zSI5F&6F1PoaGOtMo|E>o9vo9zVc3D%y`O#sW1u&!%seRb>ZNeRBrcH*j_yX#8&sc`vv~p;F|J6SD_}N`T9S9X#!f#j2`y(}S7*SMA;J}r zy_)`Xj(nX*&{6;T$4#JZD~k5yO7Z&r;O3Xpw&YWrQ2S8F)Ru9}Cor-W8@%v0Jc}UF z|7zY%$K~Z3(2x8@$jV=%8^dG1YqbWZ^raBA?hNa>C91$QT5S+SMDHhANAJ#g(Aq#( zK_?NoIs%i2YIQ%JXAI~ND&<4(;U@7npK`AMxc?Dy#Yw(S1Nind?86i0-0q%q9w1f= zJ!>oGAkhb_s8c~70;pOT9VrCL-XYNR-b3s}{68Mq(&mxwZ8dyE`CoC18r{EC5q){r zX8kJ})kxTvR`^#h9bZqmf$k4xv1xO@%`g04U%IT=ePNMx^8>M6JX_8VR%UT*q@M8) zpQC%>qL0dHFh9dCuZVX@h!7_=UaXFlBQGbBj(8z7(=O9@T8@2b_d&S~OdG{)kidDr zMyP*LHa{KNO#*J^!6>7pU*J|=B9{n2e!7*-|DRhqa*YVUh?phHK!Gp}2cV|b4jjGF zuCbc12D6pfU4wkMB%;;TqS6Kql$VpV614xZ$l7o%bOb5@!ooE#T1x z!~{UAimA$Dh?0B|Cu3p_tP}V@GztXvy;SqNE9zaZQ*m|f+)u-fM!nevZ>rTLdK1Rb z^E(yfw+g5$;q)ZiA^8Hi*vwF2)la{YdIxuWJ8|JUDn4Pvq|&^ancQIE!liiw$`t1tDsM-u0VAt>mcgyx6-bNv@K#`jYv z*VGvnPyJbDs5of*Pv)6(2#LHkBP>-KKBHPeLO1N^$OuMY!zCHOYm~rj13ieOu0&E) z^jF*c@eGd_`KdO1ALlN312OVO`zuSNo@i%~9cJdDJ@y3!q|@ZJ^9}?4d~H8;8z;3i zHIK0!ZL{JWRtyhhKr#=PGdHfKCEM@^fD?L>boc1-&43HP|ATbSIPig0NJu~Lmv&6u z&JYlLeo&M~5WeN-Z)_RcIgX@hI{O_9vwVy!zgQ2XAAzH{-&4k9o|r=_Ao6?#qx=%i z-FHDJa7`oe{I~l}@!C5)kG^QX;gAv^Iz`82Z9qJq;t&OR-jAZaE)x6vw0TX4$D?In z|CgNv&1@;D2nNxZO|9Ak+Jb^kxfbHKI4HHX(Yi~8mTE0#bB74qA_^Q1=?{)Nv5~%_ zn~8%Z`igt_UX+pe5%<*ix7P~=>Qg)TCZo-I@BX8%>As(n1@|$X_NKdtwmu zgVS^cw&Nh zd2d&m4EX%i#M~9(e6`fxrIW^3!irnKf&Fjw86CwYQ(oNL)~v(Jt~WkRprFDPwk-xZ zAd%E%+R#@kv9olSj3xt&hb=k0IiO%&q0|Da=l|9RV^9JY%9J-HRb>aM$tqQv^-~14 z1PO=LL|48vJO2~Q;8pUzX=QlI@%yVUZlAC<5)-$P&&I`79!#{767eD-FMWr(kFoA$ 
z4-kOK-tFeq;GYu-GMp`#M+!ry3GBl2#!u#lUf39AL9lHEoyDNSg0IT+s?jJ$(3;4x==H|L$yjW!LBLa`BOWzY202cke1Rb~~bn895efI=t zlw7WKP%C-z+`6yI{av!vrT_GEsOyH9F)NcNvON-QIsy5 z*zfy#`_D)9;%LeDYO&e#h^@{i{&3Gdn86g5MA@Z<( zW{KBBj=*S!zSF&PAR-XY&LR4=ylF@wM zIo7ET2)BY;tKYM7bqWrTR&zebMwc4#h|Ya{yEtEX@u4Hf-j4&+x1msK1pjS*^42kN zFL)nx3kL^7Wr~HX1ZFE3Nw@Zk=!-Gz8dQDtGaTOgm`;UUQ^$CwMi$E;XCxuqqtAnx z66sgDwN+lGFC*zoX)`tv&)0=5j_Ka7(g#|7aX){%jx3C(U1vsbJkU*Us`xjO&L)4& zs7h-dpa}gyD~hGkH|tuBYZ-RZIB!^Q;}r64^;b&6{~6}qJ`HnkJ-@^}5ps|J0AQ6V zXZN1mLEN8#{*H0VcVG1$&nhY<7J>!xoMyi9TdKKC+I`w9%{FExm(^|9JCUfS^;H> zJi+8%vpWCZ+-~iQF;boXgCM#Q{9~7VHrD(riW@O;FdSDv^tSXe(s8wcwKPVNr2u^p z%dtcL9cpI6vCNfJGel`w=4CBK>*_VHzba3(2BO?*KM$?bvT$kwNm0C9o|k;qB+_%vj_Q*13^&vd~)UX=yoNk{!iIKA+mo9Sou2hZ&&TS z!e6|nE|0SP4ztQgd7WwWtPB7#!E{@Aoa69Y6pU#yh}Q|A`4B)YaNByjfHe)|+Lez9 zCJVm6;qoYZ5gJuteQOvPePr1!6r+42r=pz$jAm)K&)W21+map3`eX(4^W)x%;bvOT zX(N$09N8b6bA!!@@DTlDP-~UfVg8TuLe_NgZwKk1%NFh~Ib3)`8TeYbtAKZxV|N+Z@QVIAbq)8-L4S{%}XLD9?l@tuPh%!YG>q`bTx~F4-u~`zdUwio; z%vN6l1#1~@dj7WBQDgT1fHA?)3h+QnrB07VfKTU6U^Fw`;u1hx!tvSP!!9uw--w^P zx}+*waL*tdrpgxrvthXdNb&Texfdno4uSSZrE-CourN4!3>Eg+Uy_YaJc>W+_-pBFS&amRuzK9#E4eH>L%nAb~Y311w-jQpzq#WVT(Q%02Q>(_IL z{`HnitXE7P`xjHD#l9Dgv|9ko`u0E&kaP3m1L6p z^pWa)8Q~^$T}<5r5$lvj?zMG%fH=&#SrhP{Ze%9emmrWl4+D{sW~DKze1~{yqUKsX zn8sgFP(b<7WQvn6$o>$hD9B9AG!-kvqwem7#+i zvx8tGcHL=xW=`B&ZL?e;9xgeFHWz`dtoCjQmBB@;d2y$$pEra)$Q={UCOPj-5<4un z`D;9Dqol~%9PSm5bs_)KirPc}lWUx>s5JTKaESrfG?(o32+Vm7Kb9MBsKw*dPBI~4>IhX} zqO8UeVYcBV9XcuUyN>*jw>p`OHX$gXw0No@kK0+`*RRr!bN)Rf`%@(zE{DeRb>=X1 zRqPk_wZ$~#L(Sd~D(r&n&8p|BBYy|elRaTfvC%zZis$I%$a<@84N=bPy zWsa5zs%(s)4%p6&*Br>e802NIh>3>y%K5NNvl;8X01-KymZKxJ&rFQfLlU6? 
zbF_+u^)z3;!a9Pbt;k3D*s1m`9`SF4aA-nDv(>%j`IuBhZq=9*H~KvOrrxFm_X>$g zRaql+HwZUc>>{Qg&)WqkNwpI$fIEH<>*na z#MzuCjAnqxw)L(YtSax}$fuVm93o==qa>rse1Afp{){9D>XW}|Q0PXouv9oy_xuz2l zLPOi_R)(?qYk?@$2n&Wn7%l_?NcUgwziBBcY%oY8Ye^%M`!k;U=h#8k+}7olBTrx)>!R@yI-Y)@O{d&d;y#; zt_YI_mP&k+)snI-F+&!f`=PQA7ga2H>`Otl6@)mKrx>FC5x>TKqHn!ZFCG0>&(UGn zcF#xjP|sw03%Lgy`CX?b`wOLzn6?gQ@Vet2hfSW>udP`!yb61)^tr$B?VUIjLEI?M><+jwXa-qT94p}-nmMr`6LB!>n1sAkYV16TIE1!4v22%=Gr1%T3kieRk+hmL9(>w8=40WfE0$rk}{>ECo)* z2gkfA8M*k=n=qY;c1}O!v-iS+_mp$uTK0M~OM`0oeZqOo=oW=*kj>KU50<1~&!UTV zHX!zPV$n*rc9)T+@^B%-mq^h{3cr%VJr;z#(j!^;_->d%U)39u0=0x@`JMyyN9FH- zNdcxUy^cN=1NNaZZK%q83pFI|jFhcAQa*X(5t&?XYO467ql5V;=`6)*OCuvu- z+!J;;ghWq?r(*0IN$cc@>Nm@XCduUPhVN^OhMk+Wi*Hfd& z9S5@PpYFPNT?^`cmsFc$fUows-vIRFu=usPSNvW>>tS5~+uMEiBhb#Wm>*K5hFGCp zaAG~@16tSKab_&7R<_x;yDE>M9=L$gv zHOwhOJ0_4SeFh;awm^X}+v&PSh@@#*R(||V(WuFYJOmf{O0?J8#{Kzc#_vD67&Int zTO<)!_5$KH4)xS4gX#Em2sj+^12|-i;*~&R;<2|OCe{i#cPbFlsk+wHk10D(2 z{Jp-ah1IUGOdVS6hZ6Tg?|En4jq5%~v=vQrPpUccRcF9X{nTC?sN+E3)E&N7if?7O z(>xQ%%H3QLh{JAVMkIP@u`TN=$9#iP=@kF>E1TH(Hgs^KUBvjBV);10s@sHo!VSZQw`3uwkEu7(wla1YNQy9j5@*ebQGB(U9gf@dk+INp#nfQll8!8u0C{bX<&|h( zp;W6x1h`a2@NhWyqapa`d~oj;Zoh&*+&E}|RMw>cky2I&85V7KSs42sB_3FjevmbK^moRhn%rLbii>x5GX1-%}Cy@G1}_a-`+f z+#jXytMG5JS!PyLR1}O14&G(vFA<9(nn(Ug`SuCfk&X3$mkuffCYu_nRBl_J*VaC1 zj^_FC`LkEmIFN^5&I?Ng=dMI%gO~TruzpuNV4$|5Ator@OTPWmjlU(FYa==2ngPUH;sQtCuhi8P`HmWIu z9gYXVW~z21#$CgmRVl14Nf@F85O!|;kWhUK-r=?8r0HylFc|hk6s0*PV+$vzd2khrx~vf?%2IrzP%JA-?J4nhFtk59#&g zYIAHWBA$Q~*(+)^yA&8ZvcE-sl{UorH)-aL^5-LI89l0U%rwyfgrCDb0vcU890C4m zQ1A=)Am}z8-+NfubAp?z0b8QrC5Vz=9E{4ZT{g6W_!>-G0nVDTNvWQ#Bxxi~#CJ+c zN?QcK01SJCfl41C;4|28sJ|P9$*6`aMmAR>K&H67Ai{y zfLpU?Y2Nqo{uIA6SIgYk*qFI{+CGGKB&f83V&G;T%v^AgI=&4bLS}5_Bk%PXDW67g zPJ#nmy1;Q2^j;Dozg5uk~mfMdwtO>8LjOmDxRil>fTD-rLM;cVAu z2#hu>+|xMb*EF|^dhwty8e;1V6$Nk0g!1>F)0HKapLxCYiwJ{bj4r7Bw=yn&sG+Xh zEgz{V>FauTw{IN|DNH;gr<8jPE8!}AV`M}gK?xmi^bn`2I(C&71R$Eh`2 zZ_z^yLKl|t5`Iq8ODB_aB2jiOh)adDr2;L!o#hpNkFc3?+Q1E+r3Z=Jl;x;AzaD(L 
zLa_$g*N5cWHCv*KMrVmbCA#G{jCW%T>lE~T_T9QudF*T~{L)xdu;OI4u*TzK7*siW zi&AeM-DDCNR{(QcN^u$$gweAuWaLVj?^XC{NFGQ}Y2ZpTp0kaD%QRoFrrt>+7xZ~Y z3>2N0shLy_rU+J~%E!aH%rCQT*z8R4Z&#)Dx^ zm{=byx*NW=Qp6f|Xkfh}BEV-BBNqFV&c|w09{!H1?cKnxlQX{P-rhJ;q}(+N)6WUo z!U%Wrb_^}4_`kj13L5)nIgFjf@WS!>;brkq3CCsGqSO;{^`M!E0q4-K7V`s|-&rcl zjK>L=^<4!s`M@+5={1b@PO_P|@m#Fg4!o`B7k^f<;dr*DSi1EveNiC80GX&92%|U+ zEwOdGym)xy)wtz8GTBLQH=OsSY_yOOUcq5C%CY4CnR@bPG25SC9N3eWwvEa-J6#1?1*r(4y6m{sH#Y!Zz$ zOVHoMDl8yF&?#46c2Bdjr4hPQ53`xH^R|LorP80Al&c4jW18-yw6KzG69+dU;93F)%d~1asa(R4}!uGgW5azQ0GqRQM~PpdK)E zpUqnsldp;pPwZj3Iq3b1jtui-K-H#qtUR<%_dOg{0Hl z3mBUTd_w8J*;>#~`W@2oF8O{J8=WC9kxFj8ezp`jtU`!Rql;UVpGY~_jw&J9+&<5p z!U+c7W=dg6!55Zb_B_Psmio}(w1Wo1)VJ3ytuhsw^Nd4o8xC5guHicI`&OsV&JnQL zn?(@G+0(1N`@;Q#L|2}g<~D90NZ2X}A^YJWt%l(XUqWgD81|ea5&j+H_~73!M5x}g zw7ZGm?>;PK$v9j;SmH_z-A@j$Af;Hn1=X2i_9QNzcPBkkZw5~Pu!T+D#!KhXM1+S= zN4>q-ZV-6k=ihtGC@uhjWOU$Zx8co!rbKS7nToGC;|$oymee#fkJ0fGLev|53qqe|EEWUwbGtXA48Y~Hf@i(GzeWdNnLHo2s z)#0w{(3&MI(WdFw@YUq7;CcFY3KC9g`;GgfeRgZl!;rz;)lz*hlwi(iF}-{_VRWH9 zpN3N}itFPpwMRQsWh_z_iyOCz?(bL(9D@&~GL#{lsNONo&g$>cxqs*kbQtZVz3s>s z)oO%QA-0hnm9eq9_X~2#Ar_Hi*IP3gbN-0|3zeMSj7T)1Eg8-d^XF)a+M9eW$ZrVjY0s;l?h!2nNA48rk&xk zb=xS0mse{t&hAZ>CB|=uGiwPJX);E#ccUS@z7(acz5y?SAiY{E2?xoyU!Ozjy3d+$ zbA#yOrAL6-^3j<`wfS}uM6?9Ct!IBy$u!NRiAkU|NLd7GZCZ}jfbk$QB`K+H>6^7a z$4Xjj8f$`Q-uG_=ndAKt%z!PeR*pD+WW$4zgOXAQ)=5m%ksb}rEMu4fd$a^j>~4L& z-;23>j(5WX(CL=njD^GJw~YD13wR+shwd4_x6rDt!@r(wr45Asn2?rTL4mwoZa56m z;@t$6A0I*JzC=+{h#SZQ>km5$sFkRziR^iU=g(tt>3LC4KJRPS{P-Eyhr8>>iDG+~ z+i*oMicw}bG|#2lNjB8vbJ%8D6! 
zg6whr#S%4W7TKf1r+dpLVOXV@uw>z~FL#QmK5JJ>vN>y3{sSsFZ@Oep=S`Ju*LmgV zj#&SoPtFktLQcQM!yO zfD=^Zu_`F!RH&TM(iBL9zO&_qzw~EQWYPISPy-gCLrkdbGwqY>C5M~<4mA;khbqnF zB}DJA;m~zeWh0IBLtiOvS89M5X`QV^XRYgLmd&xq4&gJgmBY^=htX!eXHBH|H42vM zI+ezK!%ye73D_aBcmxG?lelg5L|i~uNXiWOp733@9CIX-H3!A?GjA1;pGTu!A`>11 zz>=WWxx!7EJ@{HX^ab_NSf<)w;uH#b%jAXlr^w3LRW$zn<8t~)3{i(;eqzd~2QX)Q zUbirJW%N?97c`VMGDfk|!h_%LzbBWa7E$0x%yQD`U0^Z(#d!So*AAVkRlO>WV;YIi zL%FQX%JxhUQo`1(vG0NPijOxnU7Jhed#ax)3kk4?rf?nz?cX?1o|$zf;dcI5F>m}z z^49Hw9zN-+V_=>y>tw>k>L+q~mEC?F*?g8_K4&*4nuU|$2Xf{C4+LSn@cDw%VCwn1&#_p`5$QrHl2Oejcv#iuKh7W_d?&XCjm?D- zgxvk!B)P@QJdq`2NvY-)%HBs-V5ROX8rZyXAt`Q7###C&fh}@eLVr%p+A6Kp;|^+%L&S$m<_poh@8Gmc#<`e!Kwr?PY3!_^Jd`E-lx*}r(RPpDAP=;S2k z5p>Uv7t1o#28}?WrOqVj$1l>)WcijDX-+wczL^2XXtrxG`ls(%75q_B(?{y3Bu)qi z!V*+Di^A5patVkS{#s@ zql$TJMR*ns-^h@08ds3!KBenw%}vb0LZ;h;$3Ocs%A6m<)8VYEwUA}YJbWDyUa_`{ zl?En`dT?pK&%p&>RaRCfNYpUu%NHaq6FHyhJMlhJOodTKmtqqwuP2&U%Yevj*`E@a z?dttlUE~vdDrz!kYOLpG@QZG=0P{lw=|v~VG^K5M3kNv@T1gSwrq+SQ)0}_&%Z&)) zmb%xFh=5PAfpFW8{vKydpE1zQmd2EC_!`8imgtR^$OGNf3oM6=eeLw}RZnjE6qBYr z=+OS>(2aV-O8I`2d!wDt_RDz4?p3pe5Y2gF;W2p-Z?q(r?nZqeDGNO<6>0#hA=zda zO5!HUW+L9Vn(%00TrG|AVbii@g*PvP&uQhD+J%)*$YS;Mq^4Dm_x8eLR|ut`ik3tL z9OSIjb_7Uvx){`!+;OMZqjd-r(jIl+Ke*Y}e3ZI+F;sAXnt#IC)#~O~Y`gu|TF zk`gO6gO!zarRI0-ZxgOfMb@QWWXoVDwj;$Qe!x-dm)oyNn!^*<6|2$<_Ej=|Kce}$ z7#>#th1Er?tK7q}+b3qT-My|d_C5y51K^ki=Y@oL#@oBaeSCz8jOWzVmi%ru@2l2R zfo)Ne#w>fWR+XL@0(pdf&DG+};6PIQ+Mm%3Jh;|co9u)mw7f!KLqb4EZTD7a$Wv*k zW&)>E&YP_}8oH&PdJ)7Eru{{wbo>-yu7^4?xv5T6~$;7T!!ait-6|(oV$<{zticarb6)heX=tq5=$@0@ZhJapj&cvkwrsxnUrb}(`o zxB%dz@GJd^5|0LWM;-OOa&I5yFI~5eOf1n}D?>@#EuBz$Hct(jv%06HxPjolduzc~ zIE-!b2e`9c12LGnKXO~j@ofG1m{_Q|b?ft8k@g){K1UH9v$$9BmV2fiz``~F3oF<0 z(prQSm#Gf#O(E5W2MJSv%a=`vHDDe*C` z9$NTp>rMv5t>}!ra4?0?Axo292I*Y;8bKIk*4%3#h0;Tw&Eec#*22GLZq}f~G|(Sh zua~E6-eRIPVi0@f5L?f(`b^nmJPzkf*pLSm|BPZ8E17GKYrop|&)iRCYO(jK zt;zAv{%{A-hwDCLMl~HQ#}_PYSJ0~KWg^d#$1fw{GTxV&*I-p*!&cWP9~q@h@*?Y; zOqTdK+&fjWN&=|D&E(NXEb<(^$D`lngVEt0zba#YMzVmH>$iJS4qPaZisO}e_s@Xv 
zr2oD6>UFFfJm^hT?iI~qne8vHF=+p!yE&5S%R=$N0iAJxzW0%wqIXJEXkf~i5aO48 zCq(he_}TU2x2U^{#ZodTu$zy9-kP#X3t5q_b`9wlXb?0T`?n-U;ohy;COQB)a!9Q^ zoO_^wY1(>fA4~hV3(&BQA3-hSBgb>h*M9d9iUi0qQg{Ug#SQ4|Lt?+b89A>4moz)& zWy_6%c&E|vY>3q}$b@4%UMg;rKH8$|)==uDPr!8_T<#GB;@*OlV%tAkqYio%OESSu zI~Wys_yfk_>ta0UpJfY+LPmrowK_e()pO@X%#>Z`&yVRjD6%5IsCIF z@1Um=!BwnmxWC&z&5UAIx4}o`{3j5$*|nf%=B*!pXBS|HbyeX*l`WU*zhd8@gDRdO zXIqZx**`=@@9XukUmvTpiFKX{j5PKl(sEM3ib;PnoGiYt3@@oP9ekz+KeFs< zNhCd1jxb^Kf~st?{Dif2{86+>6gvIQ_gCwyJyO&}Sq_50#0G=)0ciEZwc7YV_Q0SB zrS?5f5&?lWl^#c_Rbo}lo2JNVVod(ti8;y`42{I?$_WoStVGilOJiD9H9ORW@;8@R zGU!rlJI+Y^2A{z@YH&z6lmgz-MNG1sjL*)YNlX&aII*#@Yt3`{D}-MhnW|)4iU}D| z-nPZ6^?lF-)RC}w4rWX0Eyqjx`@o#@Z9H_27+Cg^$R83Wqrvz3-d|Fb z%nad~bJkUJe!r%fc6L-$v;ouBOh@4~aFs5=3^;1t#gP+{%K5c7G)XD}{Rn-^s){C2 zf*i$lW-L^6mG_dA)ZWU7CoTbaPC1wPmQ z4A02l8|u|yxawNXHO;l&A`;I;dA~-k4db~aoB8|vsnd-);zG*iAz9)>de6wPD*HXx&_k|h1}dpBl^(VLt21wuay4mer$(s7TC7MbAhs9= zDMx_3b6gdIS!NDr{XX?DQCSiJ#Ei>Y{%*3rsn%$JO2R`%5FqX(lK4=zlR}O}8wezM zT~&S0qg<15@x1LB`a3b(sj??wpaJL)-rh?~q!39^kMLQ;UESo*LhDtFuR^^}0=zQs z$b6}uKu$+2F){JNGtA7D+ZGT}@OCdNz+wTa$@hz&ad@ZvWJjROVbO;0f=>(K+&Z)E zE0xUE#lhSXB4jI1Tnn)8HUA%5UjY^M`b9gGk~)+OAW{NKcXxSd5r!`54hab<1tbOx zy1Pq|ZixW}32Bg)kaQ#^-#1?Gz5ln~TMO3`bg^dUcfRkOv(Mi9UE)e#7ExG<~?$COq)Sji) zYjKpsDr`oepgj4h%8JqU*ZV7WNWP`nsNOrt2fi{sSx5ky|4#{QDm^T%!M8a=AtWkR zY+S+2g7fNCV?}fII6IN5z9GgULED~6Zu~jqFYEhpZi#u_m+zpwtIw0^=TlQs5} zYiAU=^s=U#PJGG~&EFBelws%91*e7#*pTxP&pE4WRZ(ERGc%C;*5n}bIDF`*xJCx! 
zPLD_;@%ddTW z&BJw6A2ahRqB}&BK9{~s;jlrmH)J=Pdrjf0TI;IVZopwcPD@GSqgeS{lcE>uu81-f z9vRAcbH_stip+bBRHng78aiKcOJ(TpN+#o5MO)+$XI10tauB(E2r7`Z?cK>amXUbl z#HVF*>+w695BxRrKVHza**tb;er_^LDL-pe`Q1l(SBmm*1J6noj3tmoMUP*ik0-tK z=6oVBXw{DjxBJMZLqN}a;or~@0na&yW|0=BSzY+SU~H3!BKy(JQ0UO!k~cpYsO{ZDq>HsvOqCAK_>faF z90}}1ak>=vp*K}kRUI=Jy~BKyd6Pvfwuyq1o5u}~NoCI~;EcYRxC7*vU^!iC>9df1 z2imRcw6EDn3|aRXuAQp9_fe&D+=#UAGBNf>0NnGwuxI?irl=C1ug9gN!(@@}f`8~p z$!UUOp{+;%N853}R@SMOM6$#UdmF6$sIo*N7#c5hdZEp?PVRT$4%lTzt}Wu|KXPXKUg8Ct#kXa0U}X7{v9 zAYUMs_K_qK7E4LFSHr$V19DdClknME=(XvmTNeGbOsso$%*=c{_kHl`6=4>mb_C05 zS3(wL@zXn0@-0-vd-oFic}GpNA4TjguugxSuPjm21*7L1pflU8K~Zp-oPRT~@=jg4 z9*cTd%6e^iJlzqtetru%MNetrsLLTRyxV3PfMN0lX_*~mU}y;u5uauyL;F!FY!5tYN{cI|kP{st8~caB ztdmQB6Ovv#Jj7Q-pKx5?OdC5M1Lgnaa^$#NIohM(`RVN>R zRZkGUxi&rZb@w~`GrhqNqP-Vyb|;>@xMaMv{IW%~9!wpI@WraP;(rVVw!W%CZ38(N z>WZzqY4Sj~juoU)D4$6F*@QFX#IQvTV1}%aXV?&vFXmnxeD$I+?=gv2^&$v}rB;3B zlganT5!Sb(+%FZ7Qw{=qri>xpR&3AF11X>R7M}-vNUu02j77?n&BqHELC1{Cg7VM4 z9$z>C0R5E<_z#9``}(CK0~2S=iW2hTANE^ZfRia8oY;IavRdt0Z>~5~0r+ngZxaCU z*`F;2U(b7^;S(Jb6LZ=o^@bNImtaMc$(;6N&~y-{h3^J=*XKERoyhxz)9jVA{)CpL zq5Rj~v&$p%s(Z&N0N;DB%Ur^Q+0kq~+1gFlRu{1%d`!2D5yqpVOIWNM-*08twr`br z3qC(cB5Py~YaUTeq$eYGuY8o~>5Mu`+mB+47qYt5(8n}X0T%h~@dq?Wt)CNuozxyY z7>hU+TNNMORsX&5gK!TLq`wyF(n{D(5yZW#Bl06+Di@WPO>!VKI+N@AF`4$zcnNp@ z=KNpU%lM~kY#unzAImc%sJc>oGg%qU|KfHO#o8iGz#KR!rxoGR@{;2H&UC3meP!qz z>*BG#bv>(XDHu{f(rM4a`b1&yI_0zhL7x+Df#%}K1^&XM!DH?OO>Igp)M7YZHkAxd zz?Go+u!N|t`yv)Lf56K`0T(haPEm*@gpge?_ef6AE4crr?O*bF` zJ2fDHwj2}An`|?yYkX!e2JuW207<~K^h*&j>Mjt#@2+)&T`NP8G3Tw$F~m`7c=Ud_ zsbLJ5?r*>~Z(y&V@T_8I)cgWH3i0OTr793`A6)2zof(x1yiuwor}qu4AQe9N>QYgH z>sh;p=vj@wg@=2HY@veJUQXUM*{QrYO);|9NlgYTej8A-v3|Y-A9LukwX)wXsgx>C~=_aS2n}|2neEzk<95628Q)cZ zSTm=KOJB;zTKQ@EoEAAXE8x=4hq|rc+2if5@8s6Z8V1ctGC${^XC3^-#!9PWO1c*m zUg($cG}MhwL2mGu^@}pkr=V!GZ#24$wl9k-d6G#cyK!P)Sl+LW58-k*KAusZNPTJw zoRhGF_Lto-7O^U}Op?~UL3SsHaBBEmlbqadG5sdO+{1wR&*;imCChni%ebepXz78= zRB3J0iJdH_ynLW0o)_;6n@3V*BG`<{5?J#n!t{@i{0_rveR>Y#?EMC`)o#*?`-u?1 
z`L=fkkEd;kvN(7J6tM+DJG4y`s-V-Xe3oN<)8cP-@DtqVzi4+veI@!Fo30-|&?zKt zoif5BSwB2f@Z6(QI59NJ{9I>U>0^114<_toYoKgckffi-!M+Cw!l4?G58skEok-Pa)p{(*{@QYXE37gxNRj)y~Sd%CkYdqS7D4`QW;`tqvU_7=1_ z69dP04#6}?8auq>OTNm})RdIBc-5u}vMUa<;ku;dADCbNG%h;r1cbXQ^SR@P*ZdCo z{TNlTg?IXmUD;c|vO;q5N7M>6rsdaO$;hUBry$|oO1*5Oq;sXhy5q?V*BXmlHA&aZ$+xtTEut9>Rh%Ny}o<9^Q+B zWFbDQF@7kNdTT=(=7^g}Ibqlwj78>z|M?Jx$7r@9Ln6+iN*D*w=a(!!bf1GI_h|Cw zsJ0v5YP!5Teq~+B3`T*Xl=5`45WwlbdF+41LA9b&%L6$lCJuafkE|jIj94>aHv};T zf9b+3@ZRGxw9SK0kLYOA_6U8@gQppx&otH?d-lf4!JJ1Az>fTW?{nYmGjxa~E_C^t z2xq81j|rzGlQrsvr4Bls_>Dy})>xPGpRoa!atTeutY+FEu;KvcA81>6kx zA#_(HP0a;MexXp>L1kAD4-Wvl2GZ_LR(!m_bt2(tx|rr51-3Qd`-S4D4E~}=RN42m zoZvu)Kyf{JI7!pnh)X;K=YD3=gL}#dC+u}of6%RAHxU>NU4BL?2>2lVfX9BBWqU?I z-|enuUSr=S5=d&X{XWv`cTrh-*>s!;Tn)A?UP&Ac#a%vioUL=}S^Jzyt3;g(;C%oo zNU_e8Ra%nH@ok+8uRrBMW)yEc_2?n=bD!{@a~u*0e9H9V`vWqY1N=!(jUji(zrlKY zbIH3Qdf;X!i~1KiN}|DhV0v;<3%iy6jb9d1SF2AsA*YnU2qpj4Q|6ulPvv(bd**Ni z!9#&-yv}3mWS`Tf$#zLdjLO@qa}3-sKbxQkBR->EL|~_qP)k{#H|{LQ^7^j@-^^%@ z#Gtyk2=r1{sDCg{IlQ^DiSVl(9cfV|GO|N0^Q2xI@(r7`#_J(lVt-<$x;}2-svpwy zLv&`*fz>s|I$VI4H3jx9CD|)CpIwZMIakPgL-HnV(G1!4#zRO7^q&0H!3!&hv7kYI%JV&;rL1ukdAppM1CEMJU3Hk z9|ipjq@y^p2i?K^62S<*rG&2BHyDByqHgVIv)_7vVWxV6kC*q!K499`dAj|DKWBjn zfOskVP}z^i{uIQMiT$hXN8b4Sw~1k_poDinWR)+nHz2xDkEI)uzl!Qmy1Rzhx~3VY zv|>8unm>9qz_yuAu2L=1?;BtDJhj*azOFkNO-N)P?M#K0BG+U&u=3f%>Zn2;V=NL*dd7#91L7MQ!~_t$)p<5PzBaUa%me6! 
z)g~6&J89 ztfvKL{T%7A@0oAw@SGDU;Su@G(#rN7I=)~@4G>N;&p1tiGQ4^Go@PF6|0y?;gn3Mz zZGc0+Iz&T5qd7&yiH4F={}e!8MI|_jP3*=^9xnkWq!;cM0tM}ZYJ#WxU|=t7_Q42C zX_Zj_JEvMOAu%Q=Hqr4w7=@oGfECG|PKGPk>5*ysDCK-5JM(Cx_%e4cPh(x{G%yYS zrw@g>g3+PtXM?Z+0Wr(Yql6Fd3RS)JhP0=L{d2>ddg4loHu@{cJ;%wMiCXFT%xBQ}&UgH&7dL!ELjXQbUn^A<{Vk5y!nBluMISMa+rfC(K7uhvMa7 ztzh7^*uLv4>VMi&U)ZpIU}ia%Iyx&=6UCA{KkUrGfu6{q`Fnwn?1xcn;U~b2Nzw@x zmhxW;`h8Cu;f)dcvA+%BO?k)1ZzOl5BxEme5isJV<|AJ|$xjq)e?j-fbPdH{QA7bd zBso=eOmjSy<&Y_{W9aUyAbMfKL@^fphGS1Y7Ko*I6upXu;Oj!Or#2O@ZWzCF)8YI6 zagXVX0=5C@l1)_P(S-A05p@6(rC>>$e(k#&Tui%)rIeQ5)|0-cJ^4qaGXz)4TT)f=_eM1*aMVn8Rr~2b31aI#5 z3Va<<*0#*WJ!Sd-nRq!hVLtV_2nlVIf}+s$@X{>7{!s=(}@5_t_4oXgqrSi~Y>07_Y$r`rV; zs2d*{ux4sa3B33CK4C#631`-fq-HkPf$XRxj296)VY)+m|K8MX78QY35tlDfxNq)< z36+~oiw8&V+o#~YugW}hr)e8`55pmfw)D(REe5uSUq|pNsUBC12t5n%8xX1<^up;j zqmRCDshS_yF<)o-z`+#DzKB2Nm)6}Wa&%S|yZU9lHGMd@O&={@A>gnVy9ZD{p?hKq z`*$(rvkU>!_v;=5^Ff;qo@KKDZKTQ{yz&oYtrI(Af45W@9@muenCmX%#5byVo|`F^ z4uq}-T_*6%((ljSunk{-`{dEr9~6YRjy>s#>0f^G1w8vPV5=Zu(WeWcAG}(fqB-Ho}3U=BbuXbU- zx-mJM`hbS=z_-h1Ly;bw5U2Yb;UE~((DEg$xVDBj`%0zBMR5x=*g=d&ljZ&x0$r=Z zgZ)s{EQ-?(3HBYLspsR|Ae+uw6{dT!qJ4O}fM%drgo|uTK{lM4kDY#AI|Zpt-aA1+ zf3!{KZ0b2qc?fm;y5B;M`Us#1RrwaSOW(g0PRo}+w!mDUsr2Xl!R{y!MkeQ;f3q1% zAXep%{%F%8tE4u3=Fj%0LOw~{m3pY8jZ?+GH4SRXdN2{#*x2|0lvcd+w2;e=pku^Nn7L_>m?O<3u%#zyVSZ_zgwD@?lu(O$6IY&u;r#)kaWivF32! 
zkEGzrV!CwE_LGa`TsN*KL$zxJXJz?A0_y|rteK%8#D2Nv-77~^1_c=HdA!jf=Iy7* z#|#<^Qy}p*3JG6l&B#l`!!pu6U{$KD(8_}~8*EC1DayolM;VWnb~;2~i7>EV z=NqORs@{NK3qy{%PX-Z--jMiGN_AbV%a_r5x=iuIQ3g&}YhRWPZe&U^c&V?{e>1%3 z(jorY3SAFa3a~mFwwmzxh-qpPusxXhb=YsEA8t)XB@mkvfz?hJkP4jxzjEj@TZuQQ z3LyQ}yCaB0sD(woeRj$Z#=c$JFO`w@9 z>*vzQsdUsN99gOC*#ZWxw6E&RwVAZ3n!)cUhO||f^8rkZrOdEiUFRiV6W9nyPTZbl z`>X=Wpoued&$f(s{ag-Fb<8?(tO}f1=G0ZEU%CeXQ(0?;D?ozg8X!poy2t2`hZ|+% z^TO*=HVB#my}eSPZ)ELnX_`Y@_R=rwqujuZ3@jO%+dx@i?0$og==(y1s~;W zp{=&u)TTmK<15j>&@~LJy1}O|IG9-ckU~P&pXpq1x8sLNuWAgHoPz}oUhrs;$~W6u z!7t+}sTriBULWp402kHm^x5DKuR`m;8~9;1P}pyg+ffxJqLHa@Mj_BVk)tPCw(Mxg zQW*%6!3EuUk_B(lhWsQw^ zPLoHyGl_z8i9!=+IpWIx1Jx<5bX}zHjIuA~Vd%RT!`*a+{D>a@IFg% zN-clRY7INzlx#Qdsp8WZg|3%`_61wf-K7r*w!$8-m~8v+1oX_}NZo9?v~zWTIw7C3 zAAp2Wzr1j<(;umRpIvnPjucF|-tBRhzXBJSI^JvB0(!uDSAV5b-CmIcmJ%z;hPPYM zjMVK+Vss>uU6^o$oTRV`Sz2PnQ77hkhwoPE7Bgzn<%#-*s1WNFJyA;R~ zF@o60{0GTo&ovVCn(ET_UvVSnX#!~>qP&~abr+eC*G0m8ux2z#l-<4F>8wPVsV%np zYvTQ_3r>q|@1C<#*5mF?_WeySLtt}z6Vl5%IPU|L(_{Z{Q2i(j#spgW-k3B0g08_#|hCoPF87gQKdu8q$?TO zP&hk$PZmyOgUme ze^5)qpPYtHdn~WX!oa5<%mhr+ay-s zGPKFsd~h>I01pBJ=k<-(s*}2&*TT5cFKtXT_XcXowKOf-qNnr`epQqTLXFY#vBYNS z8*m@(1g#B!yDfq7(zi^+Thz2&iP_a(9hrXe3y`HyF~;v~>TRyy$4cBFN?PYDvg?-9 zQ$x0hk33>m0tJ@Gjo=Pon=&x7}XO}R&afqpJGK}^%@gT)XaIe(Mb zbk8T_Q%5gwGva|5C3=?WomB#0%?#+9#4Pk$LKD>0`*51&4POe0Fy@A~I z1FTtfWc{;j_>fgZzC&s0%r(eMuv*=)*&M?ZV-Zs8 z9oL6UdLRz^^6Q=XrLi0=p(63u%rkGL~!M!tg7swp#AatIkCx>_FyLD*1A$bxb&85cIeb|^mfs$Yp-E21PK!}z;$y+$B_t_!Nr3+v>a^s3?b2BjlWUP-oHWXQ}s(c zBs>$ay>-z%y-%XLqX7EL1VsO+`pkbMYbnU@$6lg@v&*X0C%>+v05Srq%QmeKTaxtMZTiAb2CeeN^iiCr8jri5`ev|vcAzm$+t=Vm z4u)UH3}Q5%bZUNdcl{i&RYQ`Y#O|M0at9}GW3NX@y|u`f z`V+5fcR)pT2Y9jhk8`!Y4REc`MT&@|Ao>|b?iE4PxuW%m`ou1F0_i3tt)JF(vnjt6|;5Eye z4Wb8})?1$K(i>6KI}2<8E{*dw5yxc6SoWDue6P5c*8ZHwE1F>**eXaCMD~i9cEbER zJlN%1zEqcif+7A|6Xj^YPTfix7}tn#&M$xlAB}$0QP(e=79)HiS!L#hrAIpV$d1su z4XGAEpu@TRiP7U5Z~YLoRxKx@3TMrs(Ca`cJbMJd2t9?&n*{Wh`Ry@)jeG@i&7GTh 
zr7&Ajx^<(Jt2{jJC$$!1D{1W!JV*LHQUNnxO1|}uzp5D-gSG%jCp0kE3u2`n`K`nM!7OTfMD zxkS`Y{MbqEMCoLEEWYpUWoi*|aj`{lv|+TafMUkl8PCcch8G1lE0s84+{Kv!w?oG3 zh#7M_U&pbJQSj;U@C$B!F1PpfRObk3V#l-rMGg>we%8|wqw&lvoDaHnyAdf$I6-|u z@$lSKzBco{t5TE=6^o|kmpDU~tsN56Tz3-x8%g+c@Aif^_&)2O&hhtk0qpkmQ`RH&Bg7Tl)-@kvadaPV0+9LKB8+gwD zg$qm@Eq{fToKk#gMRV~N#Eu$SPMMRSOYKBCv1TD6$@CB1u>ws(vOwXGh^fb`C zPj#;JActTqu2yUX6wXO_-Cp+qME;yWv9+c|h8%m&dL34qJ1JH%0BY3$bcPxpLcfFc%TL2)ncp2-&1kl)DIm_ppHV5sgWVSzEvE)J#}RU-Q{zI0No+$!HY2pm{?K)oW?lHviKu9_ql4dN3r2BV)8=y>ZWAVI3CenJ-8U>?;@v|?bV z_CEmw-fosD?ilOl(obs{vGfs674r^P%#U8te)RsHbqB1Zx#qPwXj1N~_^5bz2REQo z&U5cDmxrR07?k^x#8Bwk-g=VdSbyNtFD2@rvz|hCKQ@%{?w7{MDHKW)@&g!?EtEv{ zh%5}iW{pvo$>0#{ezuP11Q!JR$;`=Dpt1r(N|VG;4XJuKnmnvfr;5} zs-V*hi8F`@T#N0w$lBk^FXFQmL=_*rjJ@A?8EE1c)P!|d1p@LP6{E&e>V$x^fGtTO zoO<8nxArx)n+N+g=RQV#l;BhP{4u zgOWR(YO7?lOvI%mtdUHnz`BS)AROjp?nZtsUH%I4<{1qgObWr_`b8l+Oy*;2AOCaD zTK;o|b=nF-+>W9YlNYVo+_vW7kG5_0^P60Y1$~)kacpYyHOt)I@vrOKbeFzLs+Trw z?=?qJcWn^b>o?@V$p%PRti7Yek*baPxL=~z)xMdH1#nAfa2zk#ww69#$YdksKkSW7 zwH~QNpY%#GzGwRkx${4E?w5!2fNXUCceFz>{=FXm+$7d7Ad=olgRnB|F%>m#+dMA* z#NlB9-=y$&hQ7)v0gSP<;Jy-!;fz+o4$A;R7Zwm@7Yw4x<1bJTQ3Zj9?T6+4CN8~q zW{JP{*ImZtFTr+*Id5 zQzHuz=9<3+nFmM=g{pbcOrC3>H~_ut{aNGZ0}8=%mO7_q%koFxg$E_Vy`{zPFa*r) zx5UmGyJuVouipe5(|hIt>bR&i@!q#pfH4;wE~OT6hQ>jE-bq~k52bl8x7bJEV$=_H zxC&4@aWtUZ-*k_`*LwWI&=T%uk*@foeucq~w>*td^ z61o)XLqVYUglS!*x_SwU9BZl(>7({XoC`)ymiWnK)n;+_$6zAlLW+@(1zx;S)3W2` zF=+{4kd=3I8={`%zBew;a(BmNI?ZZ8^ngfPM5bl!HCDg*W5 z#XAE2mo1=(;Bv_!16}I>6C9u_;iezv-V~rp-Bc;q!IbFv>#358(Nr*~RUhZYj`gHJ z92)j5t-K}`t2y*7Ns}5x*9^Ko7R=J50DoUY6Y1>gQh~;z@ur6&zX;fpb3Lwqb)kl; zQJa+mJauQ}R7FbNF&_QVcn>ftB!IZupI_+2pJ<~J{uz$}TPK;NcAtR1A(bTqRQ9DK{b*lM_GC3{b=@ETasLbv9nFyjQiIkf{kYF@ zTlO|C!t1m}->w;y3pd+KS~ob5U&O<2j3R-2RRhRZxk>~Jj=dWy zon{j;JCxxqAPc7pp}8K-TZ6f#fHT}tUFN4loWwZsS zyE0Q2Bm0mKeT{f>W%WZ0Az?V`FHkke`Xu_HZo0|YNLRIo`n+|DgSH44Z(7yKQUry6 zMG@av*s&txIrCs_Hu2b^_=mQ3?zRPBK2h*%T?HKMUqAC~ebGI|C=UV+?ID_#9sZ++ 
z{QCyMF!l=Db%nAPJB^5bOj=5LR7;h}XGtDm=NnnoW?2-gz5X)0)f7UNMG%4oa-S)rrOX$&;4xLQH(RaH0 z-~W;QBiw%OQNd~9x4KNG?&)!P+!8t(yQWG-3@{Df{yt|qi+bZHZ%5XtPZp<3VT-@s zzfYaJBha>B_uP~sT89H?(1iN9+O;^ndnR~*AxSZ7z9iTj}SfB#mx?TEbP!R+)yBn`e_K zHD-i|A7A=^@w)%M@lXZkd0b^&tG1RvSXq77WI$}hGQ?bQxA$T1Y9O#AP^t0 zyg*-5uhly?q8#0c87L2%>H$J;iUVQ!t|;Aote+sz2KGQ-#g11ea2hvtfvO?{!DIJK zi^_bt$^c%ol6J*vg2NT=1|*MMX#ijGqY7T+f|*(&;003&HBGw2mSJ2Pr%s-vcl*QHm#qIVT5hT;aYq1Rhlh#g} zUL^AN!f|D#cD7i@Yt)Ercq-AXeJViY1Q|?nY+c)+%g?=5X6ssMA@@r4ok7Y;{G2M} z@EJi>&~Aa)$@Si2r|-7XUu*P6Ewccov=mU1Sy8qt%rdM$lik0q#})E=sQfDs$;T*k z?`>h;oZV-58btQ`&-L9AL@aDSPSo$;k)p}9)a+pFR#Er0fv4oMq_cx;S?dtvK9QWv zRfFa|Xc5U}fgV$bwO%c8iHNg5uV${6`$Vl2M)i2Tv2V2|b6qy*OQ=2q?mHzv`7<>( z6#5(4yiOP_Bk|EY$4N5^sd{~EmFVCRkJb`u0k@S+!l3WR7A0<+yzOy^%xe^2^0fZ3 zcl_NIhvM;Bki@QClLTFs3OhCg4DH@x)Li%D)(rzSrqT5G##(h=&}dy0jFbHTa?l0^ zW&_H}fwFq_qq5rv!rBNTxxJ@la<$oocReQ9edE*oR44n*?>Rb}z++&X?O#$n-I2eRKA?9X8(0{A2KBj?2bmkKr{(s zRNtjjBysyyl!hfBJBfj1Z|W6QVDsT`ZvDg&J$-klci>>7C2Q%RLIwZImp{-)66Pue z*sx{5nK%hrrVISgPM+{_{M*eb&*i(ps%L-pRdLC#;|m+C4o)C*i&rB?x#AE^>*x?@ zXV^fBR4K0LfLnF%RgWo6Z{3apn#FGVqHi}8+PYzS(-YYeUd1`U|AMD-M$H1-et_ZVrM-KRXQ{DfS9aO9?!`l_DL@GqcsED zuElP3^d&sa`Dn5UkH9YHIA9C|1#aewDjz+joC5f*a<%C-Fc3ve(&*~9NOa^4FZ;SX zK9Tagm+0`zQ0Sf-9^e{^0e^W_yZ7-tUepYiJ=J9wyxJPgejlFWT>ov^m;Z0c~r`vwu0Q z9hf^TadMuOf}4kf$L&wcTo`3BDVGV%kQ5ci;frY+rSp&=w96$&KOhffS|M8R&tHdB z0AHO0_A1NSTVV|v`1JD`0@inSv{>DF<&8Fd=COPbxsln;FJTry{@T)LxbnydywSf? zJTe`5^=7i4C*_c- z(5AelcSi-!BsTe!sqh?{^v^v5dv+4rn)2Ojyi30KG;U%G%dKdw-1e}|dIXP&`U*6? 
zvZG#ASdIq76+Rt|h1I`a+!q@4il$~MdPv!h0)>lhB2t%a%$kRFVs`+E)!W@Lxw?;i zHYb~DH8@zAlMRap&!;LHs|pv8Xu1VQ?((7842ouBY5)Jqwl5UoEG}W7sZqQia)3xF z8oAxfLVi3mtmT`pixI-F^-CJs8;GOBR$HsCGa`bRC zU9i3D@MBul+2u1h=gZ{hpyC5&IIZ-v4-L?=_D0eyu6ogei~+R#StPb!~v$k59pRPL}8}maGez~%O_8sXc-s{{^CGX$vwG~iqSz_vGs|a z*Wdp;;y+tJ$_ya5nFOX^<^+$Z4*$Tgrq6o1PaaKrR*uT(1E)uIV6xUeq0Pr{jt8;P zE-Brq~g>@cD!XA zB}v|93UsR9&`|^IUTDC;`Y+v{cj+a8`s+{^uwfoOc`p%0+pf$~ zE_hsN{Teu=&Pz$(oFb81 znIEQ}mVMcF`au6JPgzd^Y>eT0l^Ubr1YyCOGi95%s)i2~S27YFBpf0`c3OTF=wO${ zv{D_lhIS9@pGfet%YmLq&#U@sQbw`$a)B}Bu@@+P`v(!epw#&12>b`U(P5^O+EN{w zi@vVn9Np@Ua+7&a;0t$<$S)u*tRq+xdF>0|V6Q2^o|4Hjb-b=<;IUqq&HxSmzyKlx z4ft_@kD`@DIxu*c8;FWrZsa=6UI$HVtsvne7ZTZL8Sb2)eXD(S&wZPr%dzD;A4CFi zEVfg0c2wuD2T2(ucn)-Ths>7v*CTwxPXH%9V9fq}I$+_qg|pQ&fQ;{Hb0*Rfv;z>G z1!K{vs;(*0mj$z9as|-7L(wLBb-O)L6jF_kL{PKwCCUA>i|!H7WeFfBoZjJu2hw-r zTCi)@4%Y1uU@W!>A_rzxCnA@29~!gF)=id7%*XtSb-#RqoT^{8pkD^brj>rH+sY5% zFEEQH)?_Ji+1uo5FUJP&S^NLfz}3a@)HN6MvV}mENi(N$$0GmdIBH@(C~tb)IOOUy z=R+azP9fTu9W{k7d{4m8#}gls;Zb|1pq@Hn6ez2cHTfm~EsRr}`;>LaK z1hr8EQ=hK_DemYYwSz+V{C{JI|7nLnW{}CU9_v~aSkb=$DB4w}jIFF?`n9FHb{>^K zTH-{tQ%{XLrzYk6E1C*!A2#y0mJnT#9-ZO%54BMX3H99eMvn|mIP!2n=nY$89#G&C znE|JtU*AGI_udal?%xSGS*qtoTH=VOqw;6uQkI~o7G=-s61+ZiCjaqLFTebZyD)~?06X%5&iluLlYPz1>-&0{frwIO+kAsf~j z{+PSsdR10n=S@978C@j}r>PexQm@g`}!-EEk zSV{%(>5B|39(WYHCjroGL!6KiAR$C66<<>RdDZ{d-Etrm9Hv<*IUe2?m*bbubd8Wp zEY^hl7VrBF-bf!-Pm1V(Sp7=;-tXNFiSK#7c+e1Elvbf?r3Rjx;GDhHth~x3K34e=Kc$NgMc$-E{CxH{SyQMKt){N#8&}5aui!Z%lAmTPgp^qT~P;=5|~g2 zh?2rjk#-4IJ0vx4$XBha%Yd;0PV;unsJadP@)pcOh45_t1MIAB?`$%m-%8(@ZPe;_ zE(6=T7?4wgh$*m8GHSo?{y`zDHDrAj6 zE?AZL0Agv3Qa%j>^6P2`@=l_`^xIi=`MG|IAnk*0Q$H}2EQ*K>IBa|s6SE5x)@7VN zdg*)&ZL|9=LpAQIfRhu$b5!-Pc3=;V@G8WmznoAP2P_;;N`KkO_9wJyU}h;a6YVlQ zw47$D{n%Ii)9s8cpLs4qb4}9!2~tZ2*KZ}%6Q>(o*Z&f#9)DhfFz9%TY;EAtuznCB zy!zbri!e5?jFQJ7&r*rqc@$Ojw}uX~ix)mRT}2b)dWv#x5$s%g>nB`X{_(|{T#tzc zYVagn(ySW@cHI3<3OX%sF&bNbE_iOC-2CI0V|PU*pNDf3^H^x375^8{7nN>3tzMy~ zJ-#+tCj!7VYbmkh`{6krKLQJ6KG@S*sY!W|`My%5z-`drF{RX|+P|mk|BBO^2atXj 
z{eGC?{Pvtxt8(%Cu=R`ru3C$}E%{32k=7c;9#>kp_0;W?g3V_y3IiW8d_3AU-V``T zV@T(Is%&Tm;&1x?55`;*o9hm*{StrAd<>Y0JmO1SH1OJ1PoE;EOjiRTKXm1d_5j2p zV`3vUAYs-z3r0`7diBn8bu}W<(a|Enjoq{X*r1x$mrx0#Y5ic6>nO2X7g)}b_wjvt zDcIMu53?AO?=Iv;UWW1r;z5qFI`)9spVewhH-*2`?B!EP11I;^xZ2ag64me>)sk6% zOStU9@l&wlTF4G5+2zwQH8q6+9D1q^mVLVwC`WbQ|ZanZWe~eXj=3(qcbaxs^0!v6#QqY`b}h@PqCXqD+b@DfLGIv znxiXr#ZtwWeIFy{^?fLhRihn!Y`E3FLJKg>Z#<)3*Q0Lnntu3VYJn za^HdBv`#T!UEZt%ZxLBvW(5{A^pHj&+uJB6N*P3L6XavF>k7z^JDVQf@l{rtW6E0~ zj58UBz)htAIbtFeDb#Ya;MC~b9i>fukP&~p#8qo|z$TRB12ic*|9S^8>0mTR{wa71 zquWpbG+k_bIS-c(Bz@;(sH1ENOn0uP(6h=5iP;ay#(ygl`M1W|)Ha#9qJvF|WsQ>Z zQ_O{@z%0SbA6Q39A;P&dN-JOojeAwScg|h`@L613nBT5I{-46HgN|6QZL?`LC}99b zS)QbOaerx;z@Ql6^!ZX#&P2Yr=uiXK7^7knWqFPXR*)ih8SPgcS<5acMY(IzfDp+iBT8bvS^l^A)*Fa)Q_|ouk5>JL zWSwok67R%du6wLK0(ayKDCN~=Lw9vY4;Al3ihdC7@>Ls({cvv9Zup&VK;|@mywDvu z$YweZK-Hr_FVs+6bJrR?P%!-w`5#M4fDqk_SKcQG?cAcYaFOlC>KwfuL;WV`liNXI7hQ3 zZAx?%4r;teWCgKnW@uH3Lh+o_@A?v+P;H2R{^i;C^wxpKmm=$5^wM3&92hT#5WwoJ zzWD(uI&dhkH&i&OK7l~&LFMxN&W+P}uD^w1Q+YNln9u+A(I-N_q0P87yaUG~Lgq04 z_e^dxzgs@vn#Al{)qG*Z*JakJH)#ult(g0&eSK%;JV1a+NwuH%Sfh9dTQ)L+hpT!g z+Y_S;;9IAg3HnoL4-~$Rj*Y3kmw(y3-g?;ZiJg!NVPkIDZ~4{X0e3X*HYe4z>O`sn zt{Pu;Xk$Kp+T&a+ctkXfE6vRp7w~y+%3qg(M}yd^F{Mq%>)NlFumydfE!>+;0tQnd zL>N`DmS__0(rG!m?GKLh^daL=YLyTsXx(+N88ZQfIc4@fyr7}U4Ln7O8K31#^_ zOPLIB1qTr#bH6val++iOw81?~o0@!+1N}3lR7Q_8{08?jO70wfzmiybUVN$Oe8nG4 z-Sj>BOwPgl`~`Yfl>IVI=tGy~dMJi1F6`NNhNMs~6FNU>2=<>ZGg;iRtwUhtb*RKJ zQ{b+D-?MSuMmoBQLIS;UtABu2$?ahJ(Wj_hNvJR`CL>)I71y*{-kNSAr}waB z#%c6NnOL8Y=T-#M%#9}FbAOFCSxtP%FYvksRg^6fdQMHNqK=Jz#wZ_@{Px{6Yz!#u z76$GiNgw#vvbYz7` zubS>=Fbzcj=M!_wfMCNK*V>Y&TMj;xPfATsD0cGl$q6equg&fYgyvi}!L?6DDRdoW zc~gp;sL%8c#`gFVu~g6?wt=>FFxlR;cKA zqD~?9jxEt12!sxzBrl`=j1T(laCda-PAKg|Fuj*$Nh(;?vUS5f{MiCGbHog8Y;5#d zIF7}hy+_92uFM3G zii3!=kB?8fm3}|SNcGzb?WQqfV(gN&*qeu^`UB_e*3PW?ZB<~g3)kLk{a&ILwW~sK zd9aP==eiv!i?yfLbghq+eS8(Bst9paCBTX#lj1nv2Z~vVCv8`66?0MVi?2V2$k1I6 z9*^-Axyj5=^2KxWK1U$oCa*zoT0sC=M5Wmr8;WkA_F*Od-h&bEL(k3eawSpR_wCL% 
zvQ1*BZ^=!P{dw*wpK!4WFzKKykn9X*mM?7$9|ru4Cg*~+F)@BWeU4zYRKdC2X-AU; zk58tHG<=z0Z@PklxG}M#X#+1;xmbT<_=f(YPni@|uSDn|n18-rCw80<_OI*fe<*%4 zQrs%~yaHNd!1IY?z@fa01Nx`$GlSF>BlwU_E? zgo=i1G>2X!rHjYSJZ;Fo`m8-jY$&XryAB;#esO+ciFbU8UvdqAjm{M8A&Sd=&}vGz zeoYwrS@*<7^i(z+r}3oV-U~_M+z@4_!o$h&AQ$ zexJM`aPr)w=|VfUx3_m*iL51EJyM>N;NQk278%vCnX0iHjreT6k0*a1&(K6moc=ll z>_+qB;N-k?C3$PppTNO#Tf+BXRsP_-CW*i5yfyA8ll)j(i15;iB@`^cK@FTYCe<&z zeUt$>eTc$Dhedrr)O>{_;QWH-> zSf++~Et7I(f4Mf9Dx3;iI!g{>>M@ds8RFX@g+8ff=D1@3g-Uz0=_!BEku*7}m*Ba! zyu9oZDbFV+=9dW8mM_ZBdSm?BA2|1#^lUfo=3EPc;QsmQpu_euI%#~ED|&$!P23t= zF#P83lmp(1@J1fR@w>dw$ z8;uM)p`L3!qb0?5Yqgsb=Y~-zm1B>jy0uVlh2rEC};eq+1#!g;5$Lh7OUGE&*YrVF)SdF6sK+JkN99bKduSe_ULb zBY)@&pZnf>?X}k4+oMR!gFNHyi^|r0eTi-jsA7(`>hCd0+(Hk7A7@Q>@XZ>w zJeZ?vlWx^$^@P68j=NwjNFUNPr1~PNY1PPjYqX?ysX&TyWaY_1xAryO79_~io!I|f z`ny((&XWd;A9v8`zT{$y=jToMVjYhC5%He`|MVmHJHp?}$cFH+<*?O9;F)Hhl3A9n zR@=||A~U5zeSy?!3Wwn;C@A2D;?ta|b23uk{p&8@LSrT#cbs2wB^mTCiX1F5T*blN zQg$8`RI5}SPM#%RUc(XCQuZ9IgXtxreAHQ-*O4gFI>F$$CS^|%-ifq7o^^B@@oXbt zxM!k9ES03OL!#uf;dt(<;NFJduc{LLm4uU2ngOZ>X!ZclGBc|XY0#`Bzw;Gtw=&j-_1oWJ8Znqfn}5!aWFMPhL0 zqLAJa!f58c?>Fi)(tdU8q*uI|D(ffqW9o1(hwo3s6E+k6w*Hy zfsS?;>=e5qH}`$zoxgK_Y{f@~qD*WdyytJ_t05G9@ES29pS=iYsFa`uHFL^nMQ z2ELP%V-N@g+>UR8?=&*PkmJA+;8Ice|}Tw zq8$ivWw2xf1dObc64kCz`Bq^TkA3(HtWUmJT(y>*;2|??TbAp+u7(xeLPr)kL-FMO&WtgJEIjegqc3A#tu`{QiAx67DXMU5NcwQ9h#*L>KOYuGM>U;kl5gMA zV>nkEP|h*XMw|UGB6b-5UT5?cKXar9 zphg5F|4yI(&Z8+U(fr<}>eH(`iR2s|e^6Fg!K6;c23>ZW=-o`l2K{BCB1W>@6g%h? zMYM;TN24DF54^{NGk##QEPdVDlCNJ&&}(q8=!FWI0Al)BS+1cD-gVJ? zzOf{8G+>_yUrpMh03yRH@O$gY=>#113yc?@vSF(^YQ8EkSJSNK(54t8Y z;b$^Ek(a%_y?LK2(x0nErc$BtN`|Qr4yrvRq5+wz{CH>fI-(#PfN}BLXj{>q)}qb~ zR!C^EFJB}GpWkx=ah*n17^e5))zRcxf8`>7plgj3E6!>bVrT{(;v z;>px6OC3HwIJyw;CLf)a##JmTS`E_KI_0@T#OXtG#DGC(ur*M#ZaM2+{YX4tGkidv z(WI9>Klq3P_GSHc%yRiNl}XC+W}bQ4Jgm(u64PU)I6rxe0rchroIc|HlARniwWg#a z$>ThY)n~V6CELE;Zz}MJ=H`ua*c|n8Z7GY(_Ih?D0q-^*qKeXU8oAN9el;$%Cu`aHD`#TNz!Gw_B*21wmC9eI! 
z!NJszL|ce3xDoy9m`@Hy1#^J^`?aIvULmUkh@`X7g7cgb1l+R@*d+?vXp3*7El7Vh zD6pz4k51hDU!uO*V7P-7{`xjzA-odC3x2(!GFq!615n&Eva5ge#$`^Kt3p&&5 zH3&#^$g6#_?v!*`n8+1=*n}Tsyvy;b>^(1EiBwWFZ~yC&BBZwQ;v~bZ%$dnaduaf6 zV>50Ac~Y4Zt}m%DA^$T{{{1>X{Bk%L$(Iyy%SCMGRnAPv#%;=)vrF5?Xu)rcqVU!HggkBvDjW9Q-a$&PRPH@=)_U&o-hQUmT;JwwN^1Ep z=<^m2=VxM1KS5rzAqc4=`1iw}aPY0nQ+v{xEvt1N5KsRp^pMh)DiY7H33=v)Ckd)U zyl1yL;mY7p4t;2wqq=#3&D;mse3@e6P>-z~`lPh9#!JxIs3h&I7q^4FU_^|Z!g_kc z@}FDrknL$=5rr8)q{t|OPXftU<^6caO^}r9c{+Q%GiJ$m>jmkB)+!A(MQsfuye@Dgz z#m}Sa{hB4ij9zBpXSE7e8sFv}ooqHBH+KVdp)nY~o@(Rtqy!A)O=@Fs}IGz;#zZP5dJhyQ1J} zh3RZ1x7C9#LY(+UJe(b8XzQ~$!rZnt;o#W;8od+r;chj$;$K^KitHuzSD-F!$bB@! zXP3)Q8RVIp!`&AvE>rH10Dp4O*x>-dyW>9>@kK&(&39+~JJJF&dPT$OooN)+LFsmL zNzd_Ho8{@iGqy9|_98G^ep9+}WkhU(b^bWW9fwFi`SfjoA6}DP|7R<{#MMAl#%;Qzjc@6HUf@QmwcqJ?s|4#@>h76@24s@ zXaT=Rx|_t9Km$J7u$&P(QGKzFYlAl>dHbxWPj}--s zzJIVP5abDEa~KxD>wFsU~le@PGyQOYPirnJ> z<#!HYXTbl2%zwRjr34;bnSA}!`)c*ux@D#Dbjabe35e|>Nyg1 zIuZ47oP`EYuz0I(ou?6vV^9|(UtN&h)PZx&-hc>l=x{wkyy%(?yyuk-q) zQ8Cfb>wgFHDlhV>(H(R&kP>%D2CJ<+awB_7m8U8u!KxWn#7x=zIWuI*v}lntNI+Ve zC8q?G@_&WBA+BN!5cb5e_9Pk3bfZ)Z6JY5g%1v$Lg!yqT*0PPa@slG3gcm*4*Zq^E zgu5%KO|u3#cUmJd18J)+1jYD7YAfRch3{l260Gv|n~W6y|3tayh0U=vYid0G=W7l( zKx03<6Yqb)!@q;x!Z7gYcNKk30kF4Wb~A{%ow3$6KX#=N;9?VK&4@+@>g1Fp@$_gfkf5y7vd<^72;D{1baCR*fZ zJalWi-p#7bg!Q{VeTRy3nuA^A;n)H`t*EyGB%%cii|}RKV`td+=7p~XY9R8Vry0^^ zIsNudIz0uQE49#VF#nBcP&aPu^)f9DDCvDM2_9a!eDpQuAf#YHJM{F)Hq*YBrUJl- zTgxSnay^D&A=BpuC%5>l(++&+PBiY3qO5bWvOCld9ti$8?7tmHCfRs<&UpP53Ah}E zHRP@Sx3J*iHi>k!(_p7`I%hl17r6qj@MhQy7J3ln~>CocA>|{4j!T3s~Z;1&)Z{iUa>;m8Wcn-Iswk36EYYumfu(8J@aRJ`8dckaOpdr zNYTpjvfax0Rf!-2v>+;dh2}w|M$?3Djh%jTW#u0i20N>-5JOs!W=j~I z0k)*{Q5I$=qZ(AC?VzDd$*>~~3;ip0Wss-z!K)sBc&le-UF4cXJO@9?I!Ew^x?@VN z3%$P2Q)Kz)G=~R1d&*Lmyh148@3B))r7>6Lzd`3-end1_?^@<=&^b5Yh;};yF#hYZ z_|Fq3ie2w}e7ANh;wG*0^ozJ`&~GuZ#yvi>_=Bp+O6#0#6W#`k>ksp*8+}ELkYy+D z5^q_K+WS~MO`CHN7F=h>E~$&G|G-Ro^?O0kPi36tjYX5Rzh1FJZa@4;MF}^xAh_;R zCZ5-$Jy2$m&J``0>$F6E=ZPfIoubZeG>pgnA+E~j${;DJZ1}WhtiBNl^hAT$j?Z9F 
zZ?5dcApAP~QR}K%`m8{Zcr<@3cg;Kq@@${TvhvhGZ5)D8LzCI`E>ZA(iFW!#JC}du zjqlI`8sr0Ewque(as)hM&>QO+4tY7Cs8a#V7iRD*>ikfn05KGLi#dM_EArjD{q&5% zmt~r@VIBz;1|K~fjMD|-)(MV^LIY7R8d+NsR5NTo(8x5h+VZRTaLQg*1f>S6mZ67n z^LzvkC8g)h3snFHn2B33+o%#H!#Wn4lRW;apMKp*{F}1;Uw6u15`B4gcOnhC7rb>Q z*KAa#e?3U5JQ>hvIn4qfTwM-HmJ2$FS0h<8hk}PG1`@1HCmS5YL>Ivu8p&uHy*tr! z4#h#9jWm4sx07({O}r=lf@3-ATaxmOsG9xwyxY%>bhx8_jbdXUn8D9uH9ni+xIbQLjp*-A~AA1w~etD+)0 z9rbu}0$q4~MIl`HmK}@US6;g7dx=E8CjM+RDQK!wf)=34uGIAPeZk5`kvBUtjS7fT zzRu3LbjWHx|9Gy_@J-S=_z3eWpx=K6F{49o!$70s4gq;hG(PECe__%M0iWs`FsnsJ zlVTYlUat~dKlDA{xS)^u!j=dNaj=K2J9z}z${$wZY^W=Gg^iSM;~z90RpL{tiX~>) zA2Qcirfa3Wfc`9dq=#xQ_MDr8yq_GM!!tG(1aX=_?fiW{;1_!s0hb)Y9OSvgOu-H! z-oL5Czh2`dv}mvRRjd0CxTJ1eHL%%6xDUM?I&_xQUsB7MD;t*QLXa3vZ5JYr>e2!k zOvMb}bmtP`gyaMkogwn4?eI$6lU(&LOIKd|Qff_~6sC0)a#*r&tnbMQN_?$Ql@|X< ze=EXHb!0d_E%K09Xoi$S>u%4Fz8@KlKKS(-pWh{Hpbx%3A%hIaxF_Q5c~bsrZZ_N1 zZmNu|@dHU}?}>Uo9wTVN zPXc*};S-5-&AKA&{vJkHkI<(kt&zFNH&Xg@XJ)&L4?>1z^xe$!mQ(KA`!n?>qGcMKDjGWqf+Cim<~-~%I8`Cw0*Tzy z#X1a0aQN~c+~hx{fhwo}=>b=o=4*^R`Rd+BKEZ{%xPu0^jo1&!#$I$CZ^C4?P8+A-`$W+)P)KDC-BK8p+T1mm~%eZ&Z@n*75a+WnT=jWk*&2U>}N*%Q$> zile8oiwf%f>o=m3n}^4#6w;fO_qzM|lHWc3cunH@T_01hK#VCXA=EaF5piLm1{`aC zEV~N4SGQq3HcDG??moe*lLdP}Bv02bP@uZ-0x(u(8vWcn*TxOlN%%_kRoV9l~K_0>r!tesz{IrKl^H8?K5gCRgf~?d}v#w(q>b^~E?^ z+PL5W+fKgb_we|66W9>;r)dt($Es}g3kLmZIq_^t-;$y?awRmm6Y~g@TT!oK`3vy_ zO9m7o`S&a3e&mg~D|k)<@GMVf{3slRsqQG&8|2LVrdXZXpFQ&-?~eA`WK}ja;|m$q zD|xzAJqztIL!!anx25%lki$pjS6|Ex>~5pNS<>A@fu+4UFw7uzIL7GYW0lU<50BRCz*Q&#f;mSsF4Mq48;m|5FD&^)dCIv%#n6fO{EDMs{$v-Vdn0=~o1 zV{-qLFs7o%!bO(bM@e`VppNp|+S8-iq!S~C4f`HkC0>ghF8?YR?B)aEd@-pBnWA-(J zUJz2)qd`Tv1d1!CUzWL0j>r#n9?}F+)ClFA{ZbX&4pxCF&6!ZyFV}KjMS2<$ShTff zK(~H-?fe*h!3B}hH4)Ju#}^8uAEAC`Jf86iAY-NVzE*p}TmFS0ZspP60}yO2)a6ET z``_O^bcD{M2R|mK$@__C@(a6eEo$-$QJPV5eY=rRMLi|eH}doC>`EWr z$QmaW=28oW`XyXM*>h>(mfu=_Ii?ajw^*+yDnM%BYrF5YI98WW4vB3I*bx`m3|`!37fhm>ZK}A!|#M}w?3@`HeS4|SY4wnX_>#NIe+vhKrqy)wbGe&_;HEgwnYR>Q@ z>~7XzmgYd~A6dax1Z+J9P?$!a=9MjetOf}8{>uxHvC%0H2|W@iSC;)g_4Q)7u@vr^ 
z4j6onC*)4zMw6gy5S5hfkLm*KqmI13&_eDvgEF!1uf_F1*f+mg>)DBzJrd?3yn8;} zNxrA}@b;33JwHT8q?qsJ?pPGZno+m#A-qMS*cUF_{p`nKp~Lh5Uo@HLou<~zsN*Xc z2t(tHohIFw)IFR1bn4w&s+-?zJ7p2TK#H9HnLf983*?r;m(MJL^8A2-0Fg7yjm$Uj z|A3?xXA*%8F3}2I3>K(ssc5S~u|t&(CpKv@L)6Y67MdeeW)?=->sA9VJ?Ry4)?MjM zYaFpl$#xCXCVkE3jps=iG%gE8?=$kp7e3Lu^K#-nN~N1n=wJ0^!uj!*vO3LVO@ce+F1bPIV`(q+w1JgEg+5E6qLb7kEy-`O zZd*igSuom36ESklm_?%4X%YD?Da=Oq=%dZ^ z0H#5)<=y#q1;>p2%?f#8z+|XGecE3a|DZ6xnQ+RV6uA{mi6D@~iV0iyxOhrg1Q(n= zXy8wYqzUDby1$Mw@1p?q$hAt%MuwKgWV?fHDF(9A_JfWv=$ogc(p@WniKak|h_Q@6 zC0++PcCG2BIj?(vn4Xzf%-VLtnaA%S3*2pI@icEpbI?N<|Q2B=Y^X_?) zjyuQBUI?0n2fHyO=_aj2|2wVxQ)+eww}9pWWdYAWtD+2ob`+vK2gjh9smy?6_W8_<#&hhF3}4@Pa($Yc}!IxzJ!Yvc+=8}CIL?)fygB9ED}&WQ%^RG__HmIdo8 zmWmy-`$AiqUy2j~VZQS{-Od7bBqto!#1oOR?E*eP*eX#S3JxRK7v4!`ua^uH-z9T4 ziZc8rov6LhVMs1%%b2A|)P{_Pq{x@ez(=~JNn^o)W_>nD zH$(3{6tVK5?yAx?9dBNe#s*8(_$})VE+(A{qXP#fL=F^|6nX#`C%)mn_ixSqA6(4+ z3e8`uY;AZutSMKMTxZJ?<=d!Yg@O!s2-O|pRT}QthojC<3#U~RMV$N{xPHAX%R}EN zo_gaz#vMlZ$+II1nJ{~M&>rGOYis|q#qp%f%kyrAYaTsXg?0h=;kKn~HQ91ZC{ zd8ukVu+mQf+jXT?;$a4qHKw4IJL!~B{wV1*3D9|MMwilqa$|ZwH-IEbF8v1#!#A9@ zPzQ_Eb4uqP>t`thK=b!8teypq9}(}_>j2pLwxB!P zbl$Ohy2T$n+A4xv$;Nk~JLj~f%Nw9`BMUy61@~ki5_zkXPA@{e9UuJYIX@BeNI#Bz@{$tE_F4yen*0P6#6j4tU{sZzZj~m@ z%lXwZE=?HI=I-5kSH;V9!4HduFq#qhYB`(F;Cs*oI|p5wwm+n>RXs9uRm_bL;o*|LsD!<-f+@ zuzZ@Z&GenzmDI)OYmNI#-Tl_|qtD&S^H%EDB&5Gep52ciPtL-~SJzjy^|jDy<}N#y z+#o19Rct3&zc(abmM8ZB7(|w_KQrTCcul3N8sRmkuW6p-3|oqll5H;$ef{VYU{&gV z@8rcpGc#Pq(B0G^IeTznp8Pi?mbkTUp}u{0tS~?vl*ut>+Ez53RL}xhAdUF^%J|G1 z4qGQhdi#cyC1nLwa4s!{BK+<4 z5$3iQIn{!s9Yccr=30!k`w0$f2Jp9r2JA?vxpDjOdmV4<7n;n z{OGN?QhqKU-ND31Z&TVLbr!XGmXg5Yq63Cy8QhSbilzeiqOY+J(flcSn=OHm%BsZv z30PW8m-(5$TWOD_4Q&^^T-I{mf*&t2A}`*eT<&}*z2Ym5rk|=dY(vt@y_bg^@_}h$ zX>n?d{Xbrc6$KMse;V5QGtZ_kb^TPOuUJ_2Y6ogjl)#L9t^z%MM&bo_%M`)Sync=k z)jO_T6reVAZK@`x15n!#6S9VTkeg42eA)tCDAA3TW=TJO}{V#0UN(JKfjQAKvPl|Epx z&&8>BJ|~W5d~_G7<{mK@MoQT3npHw*N?3nIiS>hc%Mv}RDEvc1`Uk_hc$qu#kUi^f z&dZDQMTcCz3rWlJ(E$!tr+mLyXmC>6mvA>;0-#9G_&-jIkLW|`2!GoA4E1l+T17ZO 
zTPh@p|MrPsI24+-xU$Qs5>d0nlcQqsjZF~*SaU)eU>%Hgo;Z{xRJA8$>_h4l> z4WlD5IucAYv0AT;P|RvM2Bf0nZ)4p<4X=x{QVe1iW9!y4)1R*DN1hHt|2|@s+rqyA zqs)C{7mj>2MvaLRm0JLo`23GI=U*?fnH)L|_3BdY|E$541&66V`Xl>z?|Kd@y7$!= z6wNfz#bF_H@+k-Z0tTY*;-o*YS=Smhy2C1;pSSUue4<4wll?*00B)ev{jbcMUD7xc zkw?ZsZH+IN;;mT3fBJ}2T2O$bd5;5c1K`9^QFkO8FXTfg{~srU-l21YbL~$R z?vHho;2hVkhR8mbBEe>Le*={jlnu`qN1d}Zm!q*kne>oHqtmWgbLbW-FxoTu0Mbr_ zmIPhRM=kJlhsSfo2<@e$uTqjnBGNi0o>TJ1$vjv2vpK(WW4nl!aHqE6pU?` zMvRzYFOa-=u`nk!=MEjYwTIro&}n~fzk^uvA~EdQM(5wqKa-@!N(q<#>V4Rm z;A>XI)etBQDNcAR>E*w*BWb~K1%#9?MTmq^02cD+&$RXPD>pvQ9mBP=Is{S`Cdgd# z>Bsp6WSD;(&7bpj)$V2*n;G(Rh0_YOFfVDW2#q)S&Xa{BuciK+GuVart6@RIK~MFr z9mQso)H8g!sFIAlz69NO-J`gNAp`0Jk|M$};^HbGtdxHeW0dT>DeJ_6w$P~c@O7gj zxYN{_|Cb^XH-cfM`d%Rpe!Z%1e_vy8`hsoTW}`F<=i5*_hTiR@IIzPT*%^yCwAr2oYjZ?9JLlZS@*OFjN0#K#R3--O3 zWS404sQ3HMML4!O&OT?w@O_B<9!Lh(h>@uU&>F!BXlwa8k3-MPKI8?TIerJee=Xad z$?`C;5V;bivqk!5k-)+7vvZkjM;oG)cu}y}p`H~HQ0g}5H~o=uM4{)$9PXdvH+)pH zyh{l~KUoF%*4X$yhUJgZm;H}!z zU=ZXI{ggBd8ll+~ZH@4-@Z#US&7gSz|H?X|K$>1qeYZcSk6iQ}?-{i>Uuq`KL;wPs zkD1&d^gi%WlG|?Mk);rlfXxD!ncF3C>*)q$|4KU#g+U+JMP+TMTD zLqr0X4GJr|EK00?q7SqOvKf!^44EmcC-< z`b3q#$(p5n{r(|d;^HB7(hh~&LSY4bB3RIcb(*Gfm0ay(rvg}7U zE)7{2^fG?MqrZD&7IaqVgZ3Ewt`(kxJYcH#j1&30x=u#*$HSUR&JO@OE;ZEA5ey7V zZ)70DM%PbDfUSE2#F6@(+Zbrc)(oCT7iM7bQ~jXjMRESLd^*?|#?DI{)}<;VD>79* z+lBPYT3|$^(r4vD>Pb|D_wYARWz8@64Iqck0Qn-k-aPQex&Xmd)}#3r=s1=72-93Q zFw*aV9SY0N&12nwAg=UlTwt-iR=7wA%!Ov~ z{EA=qk3}5u1`JWm2w&yFK3=*b(m(8?sl*3n5(<5U2|E6q%7Ocsr0V^6GB*eJvR~=( zX|NGLNW@1|lK`uLc#}?NpSJSlz>f$6@C13pI*u@j$g0yXD{GM?~e8`*uIx9>EJ0+`)cn+-J~oaCsat~j zb63Ym>#L$iBLmMlx;YU@@9{<1VR5G=-n5FIvGY@ufY3zZk=`0bR%z!L*r3 z_)fJyh`3Xc;r2-XBS+Mn(@TJN`-uh@o*r=BP%&lGfR&E-o^5`~yJG@m?b2ZbdHs#B zRpU=)K4A25DAC|^=|3Z>qKz-1ln*?!uQ9**gEdO1u+{m!Oh76A29Er6|L_9puZ|0y zo+TO^5si461~BC(2Qh%Ck~K(_XW(M+1dh`km!rewTW$Wci~GZ5%#m(EJ`1B_$f!TTvIE zkXIs{QZ^u=VlqynTZU38XJ;#5TSg(^7Aup;BcFXu=|4I@8;<25*TQ9uPr3jVSr0An zi+7fQ|IqyHb@amC2$^CHj)cZjwAgy4i_0EI{Bbn_?Z;A*-5D8i3x2lF!I8B+2GVC+km}4^P4pH>> 
zigu~~+W8e5Rg}!YJ^RlJSdQ34m%hK`N;K+MKrg_(JCqp%KR`J`2F{AzuSC7>O1Ra89acTB~F!vyKSYK%X{Kjr4^U*|;SC}&GMzcx4ToN4yE<|5uM zxv|pMp#ruwqGY5{>TFM^g9z=lPQx@&Y1=ypI9Kr#%>|p$jYC0ZDw!t-4^b|%`?!l(2j~j_H#-5P1KaXvU778g6E3!Avai}#KHa@$+K*6Q#U!C2k}g=*;KKj7Kd+nVVEmbS_;eMdf9;fKidvUr*Tyz;Iosu2{qQhd5aXM{HR~T}mDrUm#R3T4kV+7sb z+_;Ya42nw+OVe#ckmqM84cjr20p|nBpeY|y-v`*;vZXrExT(?m)%QULa6fk+G&s3* zRs;F@rnig#8yWQBFS;yEIfxjq9C_mx4=`AeXb|_e!jpqbebi-H+(r6hUNA2CZSv3M ztj@I?rkH>j6k+Gml}e@yH+62Wjgb6n#NX=0#%5F?fxjU$&LNtv4iEd^{gxizG@gsD zd2!TCyH1t=)u3fy7(`gcm{nuB+^7ycJv-^|qNnYg*QQxEnL487KFD+4{0J-C72cWN zL&eR|3NDnwVd?2^Ttde`GKdGgo;u-9vHuRWi(U|jc?n&cQ80Xj@HW)mXu-!)d~(xo zihvKfBwmET_m3Yve|Cb8_5xcv$Ww$dTbQa`T-lm@MnNy>7t!%-0^T^rC(O);^ZBcysc1pk=zK;)!NOh2J zh-9PYrk*y0btEF={=}VF*MQeLEcL=@^}9b(IB+uIyrix`IZsyJ=Td3#KsTbgZVPTU z=*V?snuz$0I=7J%u=4zB-65vd+!?HO6l{|pytX8Hatk|!8 z_RzpviXQUDxbl>YPUL=bs_hH|;4-#G(qgU5e-t}&B6$Q2+NB^7S}W1&ANC4Jb; zfKMPjcMTTziW8O#hn1Y@0ldhLdhZ3)U9J~Z8T}T)vi;edLAxnl! zX0N_v&vg4>{Z=Ger-|q92hjyupa~|PmNw`WJ;?ae_z5s5LJx%9cTpZbM3UaIVFtzD z@_>uOE<@|J;@h{2{p;LiTT?Z(PA9wb;uJmsA=f{<9(olwy6wy$c5eocGc&vi6*Bj& z+^F+OTTwh3tKQ1vsi?kF4F4)4-4vc?90?jbu#w*1I*&YHJvH}CmB!Kik4-IT$)NQ!gfF!nK~s9Qz%*3`?4_$9Vchk*azX<&s`J!O11+ zkH1g|=hr@;3cQmZY3 z2xUo@8|0vr&I+6LH}Iu^DKp(nn;KBn2hN#_i!DfU04Q=(_1I|A|1^@#9@_&E#O~(? z2taJof5J{0a<&6*N~7K4=mX$tLqH3TS$6$C!S@%YQ9y~?N=l+k;(oT;ar!PxUv=Z~ zED)@mBuIPnPY=n`&ktWa0eE?hcr`Hp)OSnol%Z_f2T_bL#zcP5Vva`KbBp^quMD^d-QK*M(a6GE2=_f8plJuE&y&6nR$# zs(i~D()m!g{U@zsuU!|jTkiYEF79_Vi8yEqxcmK)s|0YF6?o3b34< z50`b=`CncDwRy7t>ppzfGJu61ATpaL7uvXOQgoQ+o>?Fw`LdYNp!n&m^)!pY?izB( zW(oBWWvGecBDXOzUo1&;XcAo9)b@&N|M5;S$ur(9JXpR>{5ElnfVtkghn~B;cKEn_Qf+YWDvTQC^yTPm~o$g_!n#pQD$$! 
zr$CakcrFKp&nKb3$2vsI41~Y6$aTcKcdi*7q+NOKsZS{E7rbz0Q#_P%~XpuZ)|<=+o{@n*TP0hBi29pqol)_I3Y* zcDNHKCMKr9f^T*v=TThg`aI|QiRaM=oQRPOmd$?g{-J+)38e(5pam?z1?CaoK+#D7 zR^;Zu&x$);iRSmX^#JKh0rBs1TGYf=mPT(j6Vxd~?Wh%vD!+tcBS!i^!e;G2vu@1S zI~GZMtz(NMs<{2BEI4&J_f%DWsFhKjhdOjMa1Q^o|UwfSLo*k z)M#avmN`6s6x{02+Fxl@sPL=7wR0VP*g=$q^sSR9`g;Od^2Xnr+!bg5wyWQwRQQ9H zgMH`?oFi9bE+zr01kd@7(Em1*%%&|qR+fKw&Jh&&95`HjW5S&E=c!;lO4607?GFHS zjQV8ul+USDZ%x$AGmc}b+O6!D?p7($*#?#Ql`O+q@kSm0@}AW|xnvkQ@!)Ld)?-Ba zvd2swW|dg@K|Fer8P?6%$|~6U78U1BpGsP20vHPNh zlkl;w0%r3RAZ6@AkEW`x6O&xf4(-AEi`2guA$a4XO@DDBH0ycF zG|AU(t)IdVQ?{*1A1pp6J1o>{$i-AzF2Iz*zfhdF8GJb7`aPQAT;Dkwi0|(775NZ` z;C4tvdpok~hP$i!(i=tuLJuK$IKiuCq?Cb~kNNQ9>_9Kp|0RUEvC$u%C96~_^S^q} zwTV6DiIMDzeks{xIi2kyo>Q$loJ1=o@m1~ddJ(x5y4RbE_ua7is{@h#e&O(!(*4b>&zQ zU#%_kGw9`W!&~;cX#F<9bgpQojz8zHp8myfzGsz=|)a+Ok)ZJ(v^FHKv-pZry>_$l)q zg~9jxJc6{@nyE1}z{o!Z?4W#CsA%-B(3Tpe%$a{lLayDz-@*9PQz)cFt6;)*66je2 z;WkRuMR7b3$wr--ZCS~%IfF45TqSRrC@JOW-U`8{c`N-x_+P38AwXH(Nb=PJT3KC$Icj0?@@u_58(+4d^jqq}lS~FJK=rNnc>UhsEVzQmFRU>FXM9C7Klm84M{olg#S>Pl0_ z_xPS!$Y43X1O36{b)d>0Ve-2PI^eVz80-ZQ{?=irYvwe7Af8(C^TzwoyaoU#0rbWG zl9}XvQ8NS9wSV8=|C%z_8=3>>8f<(fbIQGM;ewWtKr&cQvOU29L2xP=x+Ejf-h_@VE}_dhnN=uVCP{De?ktbzBMs`qPpV{Pq8 zJHycH(PRYib-qogE8o2u4l)J>tW{(Nu4l|xi-$g*pOMK zC2rooR5BII==LoL<7cw;=PHYsCXJS_+w=QhXi!wga;P*Gs&TzJweXC!@V4y>RKQ$T z@H$y!V>@kf`6p*J`}U+WV&&<9L-etiWDKZOxiPnVKVWTHX?!T>b-w99t-M-OAeBb{ zujuq;3_JC#Fk*!x>-1gVY;fjD@e!^1#!6AtpWhf+eUA~7QAfYIh1w9Koj#e-E}7+x z%DyEDqovp0WyZ212u=X_dADp6DjYs)64n+vM@mnfN34aXH~Pp6P>$4A==HfDVC_S9 zYBf!a-#+y>r4)n&&km)-ifH!lUked!6|8p*4>7!>C&aqxU|LiCw@KbeRZ8lzxaVH0 zjucBQlk^7r#RJHanECUUWhj2FPox*3)2P9GA^sBFM8e-m($0?FH0DEjC(rI{|mPTE=2%rdTh<`tb@TLg0sEY3Ym#=y@pF}Np=^n4xgv&G& zkzpxZmnb{Efb(VV;Pw5ohaF@9ENt_&;qE?B{n@;>v^e`mh6p2dI6gC?ARc` z%AYO>^qC5V-j+u1g%)tpfGwo3lY#(1_fGd^b@Dk_^}|oUrwAq`DdO_j-~%B@^OQ;gSCT}dL`8HN8hc?MZx&?2)s<)=!FI;$l*%z-i_SNS2;`#7T_fz zOQb}8qk`F{yU*%wEd$K6Py~P>EU`TT8j2Zs#i0D3l4yZUlyGk|(pfG}m21^| 
z5%7^B{X&V$M!S(X9o1|!j;4mrJ3o6i9|{7N5Wjz&sSgNT0ZAmYEPMyhja>5anOesM zd*G(ec(iz04+y~TkK@K2Q$tx1r6a|c>X0SIqWvF^{~udd0Tt!ewE>4x)S)B<8A4i8 zL^`Adky4rgM7pG;V?esQLj-A1x`q%5=>}<~LAs>>GkWj;-S6|RSvRz9!pAs z5#1N|Dl79)75cDjlpC%4^1SrMZiq!!f8AQ%y24O8Olvz%lo#ne$6+at4-z$}#n~hv zDpr)0yp-e}riYRv3PNjmNj$*@bYPerzp?(2q@m%E;wxgq8=}X=ic%phFo_4y3aHM~t&y~8(;7J^s% zWn`W(@z_;jA2Q}ZLJ)#Q=^`9UN5OMWaPjbG1l=&0;yKb*SI4Qn@g5HNSdFFOo{1bQ zb;_OCc%=#&>+uIJQHmu7mjTLa6|#8>yyT%7+$vlAt69-ux{V(0(0NyTn(Y35J5}{F zyQD^tno4-d}O9&!NM+<~>yw8@`q!v~wl8sd!SkIhkfjawTGM6#4XH{1VjzN34*Tg0EKp%aG zz|3MUwiN%@#L=DF!utm`qjFdd!b3M)A}t^9vP;(k=)i4fX!1hH;oP(?kZ~0)=t=tt z7UN87j`m0HONnkYM1>@5UXRyJwZ3u?eE$lp0C(=jgYEoM;`ic?f&lBInqfcGZLbU_}vRy1;oZR68)A5{@2e{Ey? z0{m@zcr|Sa2)Ege%W&x<>pN-0#Rf0l| zQ9Qu-A7DtGNzlm3YV#T3q{i$UZf%ZdoW=qEsKQ4o`*~NOA&BlX3pQa{G-j0SQnaTY z2}E>6>u-hJys*|#<+$K&*y2q*&gpi`26-Df2W2qzFVMwYCpU*rQyUu)mvB7BGJV5S zOybgI(y|J94BF#?7*|0_fMVeZMuKSlaRV3^Tt)RrvB%)98v_~U4NDEg{Vl3!@RI+Z)?CPWAH1 z*RRw#bFw6>n2jANZg(5hboa4Z>v<#rD(0z}#<>MVO3={H#PPkc-og=hpNRe?@Y2f$XZ=eP|e{;2DFFnh7hZD`ae*w(>YTc8}5w3opc(7H1jA^2gS1 zg9|=c`KV@Dvd_3xmZdj*pfu2V%iz%3h8=PB5CvY1f}uP;y(p zqgU~b@_X&QurBK1F7sL-+vGl<`m>FX;mR!NLlhac!R(@YxKd-x6}R;}X0;5s0ukq20v41iXqr{^B%VH}gzcP7Yd#QC2t- zWMBELd4fr`FQZb+h)pOxXJ9zoFGB)63)UySgZy@e9Y6lG=KU`XFaR) z(bN?x0_{pg6JkD&p^Pw+uOI0y@gUAcE_!q;7f;<|$3*OM446_6h8=SH93jbz4uj@z z!Gz0{KoG8Z9Ec(2uh;P(;&^#cqq}`!hFldFYAI~y_?JCSfMKk)#c$9N3+r&hKd<2& zbuz~IDuB4UerxpVEiA$fhgA8s2woZe4=Mq8xM1_8=fp!_u!hEp7%MYV2U?owc6`(6<=6YGlYPfk4V2A5%|^+qVeBX33MpV#9= zU(1d$bj)MXobV6RM7$|3%78U3!Ludqx;)Tq-GlkdBLstDLeQlK0;mzny(E zZ2qiDnHl;n&Y1`Bn_F((x@E;Uw8CMDOMT<|x5WMrtm`4It*xSXW^SQeRG*%PooJ*& zp5W|g(wr0!Wf)#vvcJ4@fq|d;P76#Cg8MCm&Fa76UJ=mIZsg9Tj6Mcej#kfK=beJu z2eH~43jSt#M0~?dy#E(0?Kbw1{&1V(nIhQRlSr$H@-Kia9oRP`wm0(?90fKCYJ zp(!QoBLW)0W$1SS7rK!%F`N^o$OAI(a%bg>&5l}pgr+QZy*^O9z&48m`T>}rjLc0+ zf=N8hHq`5$c+i=m!Qlmas^jQ#iC)APuRtDV<#HGCCXwfV<-UC9$ZnCpe$3=}qlrQV zr9t$^A7;A#wOr{7-(dk$uIwh_;WA}`;k&r=wdmJh5g5IBERN5qfX)e$^LH6y_4UUD 
zZhp`7dtWetcDpTVZ`l;om)z3~)icA_0DBEkmBrnuBcA#hl#CoQ*WlTz6W%ia6=eWd%G2mXobc**Bp)`jW z&NoB*7c4z2*~9)|%F-iFjQVqnH+#QvXqVHu(d-e`jLH40xSUdpKEL=hb6y{h-eXT6 zh20T4`2c(@)~O)p*wstXm##@|->jhnTT9m^m@3-A?NpQxUu8r(T5)(hJBA)P4ELE) z)&aiGj1B02l04QeLaqDv|Rkkk{RVJe^G@UlHeI1P@ zdj?3$r^jJY`HAWU;pCsdV0Mx#+Vge$qH>c$ z^1Q52n~Ug21^-IL%m*!3xD^Q!&X$o3*Or`u;dx*XGR-_iHL1_hhsdYXfKKM8>pB63T>()R&&KXS> zIPR_t+TIry--bTwZkM<{VDH3RBV(`TQ*MR;Riy_}pM2QA<_;55kTjPa}y zsDVu*#O)flKQ0s}VF`Kmpe zNC1~VoefAFVJ;oM5BcOzgwTvsEY^U-J@!!)PtBLRjLV>TH7Ht-zxRz#xziI|Hsa_$ z>&eZXZZ9ZK_AUByH>zKZ^YK6ZV1aUD2k2cigk+hBwezJpN9zHZc4mXxe|OH;&f%K}0~n95f8hjK$S@ z*_pE8D?l?-$&&~k97t{y$))g zh|NH6?mo%uJE&v~f5l!}A}Mg@``z;E?Ie%7^^H5qXIs=`N97Rb!rrwri-+IqH@*G5 zn{gWr=Y(-$0G$n~Z1af}q_jl8QrbL_cIFW`;+)?sjVd1yI9Z>O-V&e&w`8G_@@F_W-k`C?3_1iA(taE*|6# zWQ2yV)|#$%V^Wr4h>ekC#i%W5hkIKAQGv#wfJ)9i%D|P^ET1Y20!pTnG8*mSe2)U5e3Mw#M_b^mD2xS7;wDN(f% zJKL<;Ywj|YuUWTl-VAZQtJKQsL9!aiIH;Bb?ZA-!x>e0$oQNV&%S14)8_}GQWDn0e zrIFy=m#xV^1d6MfkNHJi|ev^{M?tf@Ov1u-HZ+c za}RWnwx+f9msEQpuCpg}QhABJePOt59Cj@1st}<2UBb&>Qnx-@R0nGn>8jkABilwZ zy5QlK0}7J_7jxi$G-xKwG)AXml*QEgTpc!NuSk^7onEMshqmz;daT#3xphp`;R_eI z7K6H^QTFhvV@80I#MiO$mW?x|FGwOjZ6thk{l}q*+0wiS^V1@S<^>| zxnnBrX4fVHZg!UJxVlSSeZsTCycqt@Zt0d1OwX73q<7F|6_WVb&;R*b%G^;|jO!kX z{;19GMhZbN(PVg>#WL)HWbr1XS3uKhuxVXQOMY6;GLoP~pI+SbWCrMai zRU@AOp3zm}97db_Z?^&ct8RY?*&1MlgX(BuJWBvN?+MHpKNo4rQ-O6qx1uHe!@AL- zCh_kzDG$yn71_fhLZ8cwsd>L*FH#(xW*PE^o9S~b6xh_SQXavETY5lAk>d+^>*R&$ zWWia0Zt8`6+Lyq8a2xwZ!W1zP8MoyD*NO_ihimU~HwBKT$CqwIz_(?IeeSN(8^8%o zDf)`28vZDAdlnkxR!#A3!ImqL%l+)spRog1TA8l7rzYwGdD$Z@{2$ka zBG7LEwT?650ecwzkGdG;@NGeMr-nHU#9Klf2~_pOy*Wtl;4G%~aQ5}2@J>+Tbc5oh}b z;4NkNP_Y5jKmk^<3;o(76o-=z_z>~Q8*gWl>an5s|0tO>#So&zR|djd5`9ZCQg>)5 z41Ak5&GURq6Pliyml)@s`wL=zo_X`(n^j5>V|IVL7F(;ajEYA(Oc0u2gsQXcZHDPu z0P4bQTGP8-FrTS*M^62>q!ObPV5@rSFG;t8ivXg*3~OP+^D+&Aj>w#WG`9_3%9~cJB?;Jxe-2x*!{xOA9?2GO>*c$?QL!zARF8Z9$3f=ARYl zrIJxg^vmx;rZs_Xh&qs@k;nAV3*>pyWRs#sI8c+ymZL%UG)~z=#2g9-Z4whgMG=qA z-dOnP0KZ+^N%^840>(Mj@Aea{YpjWPs-y>_ 
z4cmX0BmU_?&v2(7Oq9nfrw0%3c=JTBS!o>PzZv6RPghAXgs#f(EYXys#+B7D4S~w* zZ%1)$13p3Wptc3&GVJ}g{={46k)m3;%T)@4tOslf z3=Xo#l|%wesb{M5GR&~2IJ&wUb1KE4CUZ~N3%aL{s?xAl>i0YZqk>6a*LcWsp)?Qm z>+Qtdsl0WRWm7~O=((?mp_q%J4uC<~5R#Zqs;k6w;18OXwh5>U>K_^v7hchX$;2pM z>~R=)Z+ycCr{6k8*)!$Cr}?ZHU5c5tFL5A40i-~0!I`*^7t7f(3Z%Z_E$S*bJ1Bm5 zFDPi>gSWFJUH?RnV~<;CZ|{bxJKn+%*au@44p12e;L)=bjgh_MpMA*7ZX`V+Xdo_9 z!@QuQ`0>@2$57}UKRJG){f+Lz=;BCL9~tXVi~FA_=z$mTdrVRZJi>bfFp;-9K&Fmd znVj=aKAo3njHpEJm@88eTKoVy`wI^l@ybo|Kn9ybtn$l<5&YhLiN4`5*CJolF>WS2 zzxjYNYk`@MYPRSD_h$s%mFln6|(#Ea_1pkl|}^NOkm06JUujVXUN6&Iqg zuZiPA|A=Z)Hm#tPHRL4}Mx3@Sm=;@G4q1QFWGb7LDa%3!{CZuQbI^skS!R!tn=b{Z zoa`?ZrqhobX338(zq%4Ws(L{n)?>n{^&fFdd_)9NH(u8XZYglB*u~&-5Gf!ck1KZgBjPYX}9^d&{-M6G3NNQ zr%=e&re+TRxFWb%$~458K08oO4ElsIJC1)GGvCjPIab!1qwUNYE6Dg0hubNDt@Cr< zXb?@>56Yu*GL;3UnP1$69UECc3ZLo*next9(Ag-gnZ5F4vuwR&GrPYF2s8mym^ z5z1Y3OrXg@4=k`J*}PgYE6N3oUMs&eA95;tK%?SYAF?BL(E30odWm}P#@pCy`7{RM zKglIC;S84@dABhfaA0d6C2Aj|DU~Fbn-)w$pa-?`DTlv43ifoa674{%3Nx2@H-d9% zvTJeIABKhSfGM)x@{~6}r(|+pP0srK;^WV&MF%$-I!fX{aZHs}S34C_IaN+zgZnVS zRPy=b1Ct`pFGuH878Tcv{YCF!fKvmCgduou*9*^m{J{sB; zLW}?xcP@N;&_PU8TbT_#wOP2fs2q^~%&{Ht~6BsjrP zUS8gTfq?zlmK-qa$!wV37laOusazOc=QcmBdBw zFD2I`n1inr3%;K^F={zVabatGZqS3Dc8LEmDs&t?D#U~Yl8C9W7GHImEO=v*EIZ#Z zB}RpB(}`>(g1#Ypgv8v_7iTn6CSHxzT+TLQXw1b;>*-W*pH^As3%B0gQ5ETDbV}Ag zh1E$1MDaz3Z|AO(|Lmn%yM9K*<%NfF_S1V2{g)Xd>46!EWK>)ofv%T`s5P1#4koMFLnGnlXRl~aSxRd54j%Y?Gqq~8WLxSe03SDVC7!)vzAY7< z$GiLEUErF8T8NQnF`;%IA%<*T;Z~5%J#`wC8N_X2_PrBFQwq=ir(gN&v4t_-pH17z z7Ib2zl-Uqp=ebK(LT%Ig&_n(9o3ba(#atpI7ukXSJGPtZ^Ce{vsk%E!)k7w?X8kr{ z6H*N$-xd~Cb5aDt$RBfffUXdR|5mgw{&fOyQpA#qnzvYhkL#nRm0|G8ZD0HC&ESOK zA~X?$kmH><9%rK9+|-VpzdIkG2Tt0WDSIYWf*P-d7wF7+*p}lucprFJ)0b?kt~%55x^&mgs$IEzj}KT0v#~ad z)vK=sWGgOpdnbtY!`H~?6M7UuENi{@8fUO4ks^j72i<#vBB z59&Mjb7KT%rftZFMvOgC)~?80B)N|3Q4EV$+YYX%nlf=$de!P=7huN29ruwlA%T^2 zS$WmCqPP4cs%=?&lCWX3erYbQe5aFA!1eG*p#=&Brda4>MIbu7PH#D}_5iGv6=9Ch zFMAC5wK)1A$DZ@;6{8|QPA^WWu@G?lgv3N=XHq`}`U>liWBVFy4~LSI2b~);c6~xx 
zpb&gfro49VlbHGWJ}BljfH{aA&oC~Mwj1CX^kQ%mavEMgH7#pgyzMLnwfoUmBvWc` zDDolkJYQBs+IVm|b^v?dC%iYjH-e1mS1c;woiZf`hlT=t)7TpE;oB*aathRc*iI-k zKtjmyupnPCj(TB)E~_S4O+Lu5@?GeZei?uD8;|1g{(=XJ!C85qkq<(6*L1@l(9>c# zOV&QDdUISvJ@JF`2}>+&jc1oZP6MYr6)h7!8L6y~g)Si!*Umu*c4+CVp#{o(v3x}H zo}Ew|bT@_^shXen0YR76W>QQfg%n2Xiye{KxTXtu=lBdMq&fZid^NV=6US+Xu|vR; zg^PHc{cIImZca8q!_27nn zT$~UzuqF5jFa2=Pampot^Y^}~RFduRx%JNo({T4gN-l$AyZxO23u01d%+FYomHFEy z!@=HJ_Y-Y$`g5fZ-DW(#@`YL^sm?ATxlO~a4^3b`tD8|7i*7AEvOVf;I1k7$pcO?^ zvWC#tg!={Lj!8$0squ@$Hn=Mk8Gq=gtIt_2Qoenf{p-W^aEXFjw?;KD%`hUVaJ}b9 zV3-IvG&if#oilr*-Ac}-WJ9vigz#_Va01Ey1fRL>T)UWg`_0m{ZfBjY;c3c8T~+SF zdfueMz879{-RszQCIy<#bJn4xGGM>2@LAdev9lE#um)ExB#>;y{7C@h{1+lY$FYxW zvdx66v#hoHekcyoiTzw7c-yIXJI;LmUIT)}RhPJPEjIe4prg;nuh%6^a}Ya~tcsN! zGvt9BkBGW?tUqEt&4iO&emA{}!2&|mN9O;BB_!ZDx`?Gd_+@QP1Yqz*A25vD0!@l$ z*H14>9$m#i!W5Zf56KOLV+Rjg#Gq%+>?e}RfA0*y6=^PZcYS+mR*h*-pY!x&ZRAv} zw#9`8c^^aZ%<`NoQZm~^-(fjmtsaS@Y;0xq-5&4)Bmf*ehb`kDl#lK;Y}qaQvavo% z`sJ_w;|75Jmj{bJ9}b4FXqit?vaz(U$42i3So9L{O>s$9rnnUhz2s%vn~A({>Tw&2 zIY}?orplfPue(Ue!^V&YF30@pZ4w?7{(AgM`vC_EL>T3-Frp0eE!_u$_OR=q6n0w3 zx#VH&rK_%~IpMs!@TK4pp9$8;5hSSf`xVXgc0&G>crj=_kQx=m{`)}Y5~#-=N{n|U zKZ{o^_RDddaZWvLOX53h=Qt zBu4J{=LW}kF?p&Ps`OaWm#C}ONImGzl%~D?f+^WriN|@}bcpi7p4HD9$M4@LS zSHlG#JMhfsw(y20B*u4Lwc!@_bco}~giBwxU7a=~ z5gZA0!dB7Av2*}<{$-gN;!T3bJa)1IxUPBM(STGFmL9EGQi z!l1Mu{za|Oqg?&q`yx2BLv#MxwQGCG6+%n#|Ht`oLV*A7jA?p?kDjE;=Gp0!UVH`+ zaM@*=eF6mmP!2eQ2P5)>T7-MWhtj@;Qjy_~PGh3E1PdSc@V`!oSj*;)>~+4mI@?_L^zq`ORfw z%=K-DXbd&Fqq}r_BZQo{by5OcLJ8gpLMCK^)fYX?E4+d7?j0XcC-0EV$nEcA`~SHP zIu0E^hk{4!6zQ`<HW&SSXU|qrivGh%=vZ7Z_7lgri=>Zc8(tL7 z7RxUUik2d&H}A$3lurjRCNORq-d*9FFF1VLyhDG;F8Q*n4w0ZyJUHOuHtpQOa{qnr zBh^g3PQjcYS3Wi16P_v(`G^{WQ{mvpQb7+^!@v+v02qgUx_%q>e|el=DKt`CwNRbwN%lSQLGLQX218g?9E7ytIp2V3u*o?jvSV@NSZYs z+^{ZK{~hp!HonQ+I|k3WI4_acd^e)9t>)|ETrZQ*-{$mpf8x4n8W6Wh+6WFT)-`S$UJo+lxnf3d zqrm)%V4GmQfTEkIFNJgQ)^Lb&&6_G08ZGQrBx}RSYPUcbe`W@YAU!?CgK8K219FIz z4NiHysNb_pZ`ho`?&0@4#}m*4DGyvcAv6i(+w>P$07Cr3xPR~FHAr`~b%M%U<$OKc 
z09T!v5;J}6dvN9#Gt}9`f-m5~bTt$h?&4m^D&a(&T988R-$VC!qnJo8fo#xuXJ3eo z3I><&DlS%CW)>od;W1r2X(vTV(5>uZ*&9}`*V*E`gdo8F~etH7lj z5zXMku(^&EDMX5lHXTNCiGGhmKBhLWEp+T~`vdYYk_ow!NC&)GPP`;R3?SP^h69`!OAND7(q$(z{` z?pxe6_~JCu;;{K@HOQDJ(1P;TSAx?q%*cBs)nkWrP1V&Tch~rIPi`k}ekI8fw47P9 zd{bHd0{=i0(y<6N2?eOs{H(f`6J>Hcnfr67%%Ps|UHiXyuKkf_TvmoP-`LP%S?LLyxW_uLTUZG`Q($vg ze^0|CB*d`0iz z*(3!dcRJthNzI~J&&AZ#M1>Xp!GvU#3#^|AxTN#>L1F(N zv;MR6956`tdU!HZ0|d_LUgvC9VZtF~Ct5RDdupMEqjSAEY!nW)?$YgCP? zpNvx(f_%cqZW(${&NZrZJX;`|X3b%}FZDGnLI$MAz60`Li>9;WwUDrgiu79$^wTh8 z0@1{;?BPF;ii*Q}Cb+{gZd|gKcWkjhFK4SYS&vQ5X*c)IqmJ7X*{w~m{`-jXZPbK z2JuI0N&PoZ-$1zW|qL9VpYmwL9|T(bS^LjZ#EwZvac8LNa~KO{JSGXtIxvJXN{(*kzNPTcLg_a?%mYvYk?T zD4q>K?#sfwn;6&eJ0>_51ZMpdl^8yS!vS7O;@TnNs^V};>JrL8*SzyC7Us5^#H&>g{bP2{=@3a0$S}H3HJLimG+IO7;bEVVuLE3prY7jF(jRs` zwe=?bxS6&_Vrkq%bler7BZc>mfDn2YLj@ zJ8d)9NE~O;t+aUOALH-dmEOsOw=abTQAdCxJp7Jqj428D?yn=W(md!vcBH2c3F z{hx1P?rnJGQ@!&jTJjOZm|N&}zd5=h?hr@dFRJl`4XWww+UHQ&0TRDu-DyN+?4 z093pZueWgQSPS4_G|*Z{FtnDT9G18gLUe+E`F;F5j?)l)jA@3y!{*1hOD-JmF~>&U zmZ9&%!^0;d1!|Yp2Hoeqxok|?vSQE_s>`|Gcs)0Sl1C@gp9X> z8;dJ04xJwHT1{2&?XM0Q^UoltEt~cP9}k?GxHu-Tx`&q<|WPPzeo&na60W9Oz zoDCB&NM%Uv$)g+(hU7{Fe3kdLW1|A>+~==-{m*aGK{*&^xC`tBA2Y?>suCYBBd!o2 zV^_82ac0X#<|~>IuC2ueGpw|BskNz4ss{C|Aqx>N>&Xwv+z1J;5LQ3Cu{!NvdvYGT zDv^ID^G>=vbcG3gg|zQfHEs9@QdXVPwoPuuVGHHlCOG>cq94`IXp0V#zf>U%d!?TiKzmy0U3e}ZBn_; z0)|b=DOlG|@{117hl6qP|0zLnxS^nqm5{PP>6@(jB#aH2 z!Qkx9ngd>%g+j?C_pct>qm<^BbKgaflJ><%0S#z!e z`~l1C;i@(P0>5v`U<*aQ?%f#tFDLNVfjS8irR@8(!4l}=&SPhf1<)=gNc;WyGvVxU zu>aIM{{L6`6`EMxk^`;Op2>HtKZ?8;A5kU#GPoJ&2b;gHf9kFzrz1;R{>8s)AAUY} z=C-URV|fjD#(;6Y$;#INs`^C*!TRyQmp96_kuO6*)w}HuK2lA4_2sv3Z_|c|B|ka2 z1|lHv6qmB*=lM>&YDJm{yMzo+1xJ>rw>c}Q<-9%KAB&HM zNP~9#eEM*Ps)2U%3*G>z-1RGfk4yR55%lsN(N$NHTeoU2T0VGavhHL4@!K4^knT@) z=AmWzAB?4A;xb4S@}qZgA3KjcG8NY@^VX^NG^K$fZ5D`sC{GmJ=@kg8UtHV1-hw=7 zmae;Se^MtYblzm(hRvHxb!Xqz1SiY#4whKc>~ep%ttTEtsYGN60k=;RomkEvb(uLj zN2YF}-vit6P8=*lqpB>DMtFt;Z%a#CTTqypVnV1F96V{wb>6$fr^I4V3M)s~;eQF$ 
zO>+btwyXqFzPY^BD)N2QXov^+bJ93BWF(LeikS2N3>*7o=w;iRvf`S4GHOH1csA_J z@vwDC|Gleb&Rur{Ev!zRTaopP%uwa_0UVN?^n=ebT|@M%Uw8HO83}-&Jo#}2u^B=K zwlXs1Fb$7fsf1zNv>Y2UI0|g%CcO_mHOB(LHiV`>x(NM_2VppAWV_!SEu27YFmf{e zD@)Wu#b}+|tY@jgbrsG(Fd7=h73mCorSMl-h-5fF9w=gap=G9B;aR7`-z=O$j7Z1+ z?{Pyzu;FuDo2_MCTfDWJWx&J*A6~Biu4NgaYniHXuprY^pl{r@yb+HakLw#qa|<3G zT@Y1iZ0N^(lsQxHF~_NFSi4{Klp898D*sKa+yW0~75P3DPN=6)G{!A@3MHikJ#r9= zH-f^QM)IF?GdxY%miCu$mDc=itIXAv@hla8x_iG!(FtL#L3R8CKO0OAdJv+OfY-S3 z5yZ@|9C(ZVt}J&zzRGs5^U;6&{ojYv--~aj2=x2TStM$vzQdCx9)Ld9cGlgq$5HSb zk@wO>)g))Ks?`<;I-OF@wH7A@l-O@%9JgBRqo2>cv+Gf70?Q;S0MJ2&4~@eDwO4Mu zKY^hi0)QfNk!WZbhiZ+3Na-vCcUjCvw~hgIVU#{QY^{eGg8c+_K*$tFs3wN~eznto4FVphmZ zk%W0`9z<^Qm*~Z->xzr0_*RcQakbVMK?LxxVro6X3ket6a#D*+9=jiS+3a6Hg`Y1dHGk-)n+F~oh2s0(rdHk#n-t&i&)bxg5AL^z(|Z|gwYSHm5G){q1gqw^6bs^2JTJd76-d5 z0r13Rd04f+B_^^>I@?%rjs8G10~0*Ly{7p=Kd@0eZ-PHvc?#Poi*>I+Pgk$Ki4aCr zaT0MK%Q54%ygCOpX3(ybE&9X!L0O(0QND#3YHOC~!nX8VWd|+~7voH^<1IjX3~{6m z|FSVYU}Nq@<*Ko)70JBng7#6@3{gidI^@@3>|O=7K@dh ze1uqp?E`wBG6q}^KqJ!8oH&sIz89dZx?)i?;tc%W)LEWk$TKv=C~lS|4e)!83q|wU zC?6P2r)(~!<;i*gNcXII#2>RI-X;5-CHkKnc1S~b@4x|xqpEn zJyM*PHN>X#23wY-MroX26N(5>{!I*6T|D8~zZ{;wtj?E4zldm4IeI^d=!h%&Sb3r( z(_HbTU3;|LwmHXRc21kV*Okw0*wwgn)eL!ok$q1!^H93&E-LbKMSG9<)m^ZRH!$3` zCT|1{N5+fB5do2nv*d`F2^Sa+#s#wDTB5D{Bbb3ai$}-+?~sHvjAI+PJ=dp94#sY< z;#^)KA$Q%kicS@yCoG|y0wJ7}<)g`=oPDN6?nMVSMF;I55M8{#BH9M(gtJUUS9QTH zJqk!hXK+uC}_K6VGULv zuI;&+ndMiTjpV2LihoPkI?b8D^{xFAUqMa6L?;}ZY|9=P{E&|_U*8;Yyw(#MQGohM zdq0q4g_uv!^K%xaYIqG_{r|ZC_xpgph~4+(_@O~%!VP9*UeIe(T$df&*F=?W;>tQb zqTEqG%5A()cZ0m;>F1x`zFE;s4%mBHbh`^8JO@(Si*`0%IHQKhqo06wtBkx|i|XEH}u#hF~tmXcD&hwLz-lwfdA+ zWJkPXy>pkwebX2EW(hj&DhCtyR+$F(f1-Jd&O$qCCv&z=T~FV?vwz75+{pFT`&Ve15mLQa{Q#&84NEfME`6M!SU9$dN6GTt`2_QmN0w2ZEE(( zsph&a;DP&F%lYYZk9F}34Gql$gIbPNzN~KMPvdMmD<3`KB`7`>he8Ctw>xf5F)_KP z2lhP@$g=3$I|R~mDgE)&0H|2LvLO1m^gtf6Hl2G9?@`TT+a8}S-$_Wi8K`<7K-GgJ z`4j7&0FHMJBHae-FSz{=R86XnvGkd@hxuI!dx!ze^i`#vPo2r?@l`3q>oG;~+wsh| 
zmWYqO&zsPov*%}OP?t2nY0l!A(BJ71+(~*S_aRRt@y1bAo-i;rsTjagaQRG1%S6j_ z^T9P~dog>b@c>#|H=(@GtH5uPiKn2&6kNC0+}Cw&0wW zHF|DJJC+r5z5&B&&|#_=Hqqhc=4R(@J8R%vk|Re}K0R)Q;RNk)r7fT5Dz7aQ6raz7 zpM38tTnMJdR_VSClI}HtjK1LBaL;eDz<)n&xhQO1Dxodm5mTTG_VztZuU_y|P5n^O z;b8f@A`_%Z6+NQ=G`HWs9SG?Tv`$iy+ECjqlF5gS8AbLSN=yV#6y3?KHvukhJ&#P9 z_B;Fu(UTu)h3WDPo7o|}&qJPQ)JtuR+!W@Y-ORR3a|-Q9W}!=aKPfh7PEJW8oFEdBJE~F22m98sUu%8BKB?P~gLWsz1)~$xOR{h-!5UvmYtvm5-6`<<&2) z#|~*;W^J8{=#-?m&g|7&E6P$@EF{$%R*Wi~He8lEIRhr*6Q+Z*#|NYaoA=_o()%X; zw2GfGBU?%LKE{BKO%kG$=Z`Q4P;JMAZo&PJB5#=hvfarc2ld?#AU$ebx~XLSf#g2& zPm^1q)=N8XGn6Z0uFuA?sw|xfbOiVv#SJ=;Kn_EY>Oz&J#0PiFxzNKw;acg~OhpK$i}H z5VHcrj(7$)gzG|!ZDvlX=zyVxLwi`WQ5#&=0%RnI*JDRddK$JoGvPG=0mi@4{~4pa z)*DmW7a+<&lg_zRKI)hr#-t?eEvtrFU{h`hG%VBff-p2iGcH|z=)oCwV z>rzqBBm-<3LGYG`}D9pz}2a=^1gozHz3djD2GE1rYe>yp-B!C{|%Q_RJDGaO< zrQzsBY1Q#nY4tQ(W(4V}QpsrJUW^HVohM&!Cn-=C!pZ&i!(iVtw(D%|4+37HkP!1d z^NgPG2m3`mLYe$HA@>oih=oH2l4M_6BtJj5lLWH&Rrct`-G~Fe0u&hYw7xt{O9KGg zM-=9tdz8n4*XRk@Q#(JMHXBYiEelkOPN=hE=sw*QX0}a|ao4{U4%S4>0Ng$n8%b9S zE;dKHfFIXBJL^pq(7omN?UJKyD&4+_)2Kn4ZTY0Id(B6ss>9L~RCb<_`Xz%7mrq;5 zRyWg;J3WrKTHERBFJf}}a6LOjj3YaycBI24=`W@in8@}8f}aZD%KM*2v<}L*{>UiN z5Qor?ZKh0QMfu)TeUZ>v9>_XL-7o@-Kk{`G9Q!mu%;|XVedQ}-G z!@7F!c|@01@x;V#u#8DTfaS;)tDA+dNJwUt2`UG+ zQ(Dys>7EZLE9G+Ag=30sPh_EdCW!FYnZ_D-4jnv4JMycib3EV0oU}8IotXd*VrJwC zcXLWRfXE5Qv=Ki}&eMAg*r z8HR8UUXIfqB)M$Z+yMm-A%q}rv&8WFx!}MRkjpgU4xfitU2J-sy2c;0T$ex|?!A}# zBWn;X!C}o=^X&>W8+lV-kD+NFIdlNADKXMB&D6Pq>(oVAG(3^TXtJIQPg6tdZoX?? 
zzacv)2WQ>uTzOR0TIUS%vpv)Gydm!yF3sOqvt;!C27u@+xZCdR1(|Dv9Ubl9Ip)4LoJVZ_kqJZzM*^S_x-6C)u@UBrDiz=&m%b z!Il9=2d%px-=hg02_$u@x?k*LSRZAa7>YrVp@Bgb=t2mJ*nL`CX+o~5>3nueWZA>^ zgFNP=g&K}1Mmg@XD!B7Z8RQ6`YJfX=Ri$jkq#y!jL$86sNP)}4ytciw$@P%GqmnA_ri zvj9-xi5n+{T@Nh|#@&ymbHXSE-Y*3Pw|2yYEw1L*2^z1rYC{> z(v8Fp1`uE2mQPU!Z{`Z=15{s+8S(yd-a?(sD9}Z7-Ff!t=*Idu8Oa}|2t6WPAOrPw zin4Kc#jC?8dOK_FNn{WHQRd77@q%qkLE_Z-h3d3N>-a-QOda7NlTWkTMfXvqxxNQ( zL!AMBUZQPAnJ>y-;m}v#^aokCf3$dftvJkbqpWI_`0PFyGo+6=JXU5Tssvd^*89@C zVmpFc;Est)>Q!w`@$_Y9Fp?;(47+p#0XNPO1MuNNf=Bh3DV@1iUO~ar@U7xc`%Z^} z@P*w&u})CAr0UnaEO=usOb%%tdq1-1Jk!UVJ=k^%!1CEC#W|x>8bjk6k|EE6)b9w8 zs_*nE-IV+%$^+@V5?5MsK$ZDe@!CarvNuDZt%At8{XvR+O|*o7bBeHgtxGTQrfjyd z{l(ew9?M}Y8@xW3fq`KpPbG&=qr}L~wm1@fYy9IQp`$6g5xMVMd&3pZdrL{zMJL&v z!_uQqfWA6;?2)_fmxqfQ5Rs~)y3N`yncZ~3@2l=n!^T~;t8QTEvgl6u-QC3~yuFZE z0En8n5GkfEyFzdFPJATh@q_#Q&oI)X+60X}LukCQ0AABvAN;7|!RrV=Ee!HxI#(+Wz-__&u zVQa`t0hQZ*&PY9?%QRAM$Vni`TSa=>O!hIAtnV)+Diza-dNUP(qWlUGqTExfy{!$t z+s)mlZIU$&VwQtGDO|2r${eN6Ac8LfG^gsUpjo7BCv3WkGmJK z$*=EbJ%qrvX;9lbR;W?7_;Y8T6yND+DIS*2Xid2<){fDT_d~_#(Ff{!bAe+9WOlMVNHd3YS!$H&Ka+09b@M^BCb zv|cJa0e-neHm^j1nb7ZBJgO7}Up@i1ug^q%nj;1+2u~jVJO5R@g#~+YzCRH#ecU-ckX@n-S+mDzmuy9rnXyEvB$Tx*Lm|6K_GNGtvX1QQkZtT^8)Nr-r@Qt2-FyGi zEwh}@Iq&j3&-)zDjn&A0tE+P}fpRm?i)^+l^w2<1GqFG>YKmAi+|NW2BA4YT5hcR{ z6aJR#?}M$_XlAC{JMfm$_8$89H4`u$Yp8lRon+4GV4Of8$9kdYr}vBE)zOKzw6+d= zK1=y`M8O4%qK7GFio$*fVQ+{2Qjbwc(t06M9y&G(H(a^wZcE;uKEeKVH8B4VA^UTC z^Qa|@fU9>xWlp=$AFoPW;?_Cti%+Pk3Y%;Pt4ysf|GPK4%`m#$q4YlVZHD0In!E;< zV~u@vHKL6pr;OkF8R&iVy;Ty4v{&7K`5@(VR%i3UEDte$1w~ojrVhqWQu}l1YHrH( zX#2JOfSWM9Ode2X(ejwmEaQ{O+zjs(FsFH4b=3O zrqt*LJd0;T)j)k+NNnw>46@0HUGmtvy;~ADcM}R6SEw#|LMAjPAFt1xW%7x=m$Mgo zc30o)%Qtc={W;cMSQ-1n4I!xG!EP(kpejK;=XMfg+mriQteKy9LCiLmls)jj587LK zvG{IOs1RP@6n#x69`mfn;bLM8r37>uwY0Rt*GyEw(h+)m>}M+-mo z!6Bu}O3#5YgR^8=%YkFW>4ja065R#Qy6{sXp7r4oLvS}rc&OrtV!jPW0aYUg5I@b{ zW%lzrtLX0)j$hFw=H>(0?=9uCphmK3TkSI_6*bOmW%3CAl7G7{d?G8`ig*!kw@P|_ 
z{8H=Zy-K!V+IVu|V2q{)2}~5TonLl7y;E$&MEy<9y99|gUE}@S_7nV@Ic8;EKzdL?HQC|ymJ0p8FCxSRe%-@Oc+hzCtNokER+ctK(BvX{7 zX!=q_U{mRVVA6fhU&K@2@x98^GdajVu6;qWXX6Z)Dq-{OE=;oroqJ%rIsAcBl&6;| zdu`#oKP%S(dYOwVcCl-Dd&X*C%lM}%lt`FoGgkCIPlP9ld1`xRKLNOL9F+6*H z1~LL1#lkdn`m`&Gl{{nGS7TR8w-6x{3-T15P<@A&v1h8*34pa&9-98#+G_B?*7l@& z!Hg2@hP@MQ5c(}e6H%{qXPP@ZP3GQO;Y>Gk?We%Bz(F*zRC3&$>uEOFA3dAMZ9xAV zrl5o02$5ZPM&kp1E`s!OXJgArkIJPm%osmuVS$ z4G|s;U9`TsZz2?1(kFTYLReMaapKak>LI&6$QPq;IF?kGL!NmcRATuDyAl+ES3VsZ z)lPna4gTUfT7NK%f41vvsXr&es3-Q#o9C#WVFE;>v5gH{zr&s*vitq8d0UXy15hCa z9W!kgJRV=>og9tGcd#?>yP_#t-O#-Qy)-WOgHCvkVK&J2EdO1JABx`pQFH#$kc}Z2 zMmyz>>UGx(?s91Ecyg;a^4J&quVXJB`n|=J68_QxjEL+^^D)onvq*PHsfd|=!ojK~ zAS3Ql(%o@VPs00>mzLEU_9mS|j$fF@zc~bCrVCrL>jf#-D_ItH`g<9l07g6s(^9W) z$<}5QJC8NV{Ma)P35Q_H$~Ey@I79&nFsTZa$c}xY_lZ2)ocXuL(gWLcjy_;^fCN4P z+5Bx8HGBcKLqw_HvQcH|66t7!6&muNAXPisai%s*l&@YG9sVqADZPo4a|v`QdKpBm z9)F%7q4`3jJKt;}v%67m$Wlg$RS2#53js!046o2sdo?wBdk9!p!|oQ@3V9V*fsr%Y z84#eBu$N}?nXu9&wJLPdyC*JZ;Ha2IK~mbfb&^ZzUx7hc*l8-pN#oht@dJgv`8^Yt zLhnWhc1!Y=ydBAit6vp4V~AIfdskSkcWM7+jd|_>wKMb~*jI}eZLA@IB z&1~xoTu>+B*$Wnlph8979sHp}-4Pl1Lh5ZY4TRd*eXHv+pja`B_xO7_&z#nCp4I&- zVRdP$D|;0RD}Rj?Irdu5>(FdfLbpn~PXy0wio?#g?#B9kK)(ZM>4%wi1jOJUn8w}_ zB`@jV=y-ZnAxEo0lYJrml~u?B6dLcuEQO!E*Ai@_~X&jn{0j>#dprtG5`Fv|5yRK7)I;+yMKz^1e&0+ygc7lXk~+$qNxYHaWhh z!{{Rd2^&3!o_=2i|2vz2W|$}ofx;`$5-1xy6m>`GMcofw$P<@MHE6oJudUt0k^3$e zj`Hk)>>k_*A|9HizC}f0EQ`;zl^qv$%e&07u71mLeXIoJH67og9pGM8I>j~AW&sIU{tKc{m`Gs}5 zbKm2lUI|&d1-~SJ81(ySWhI8JXa6oH)(+A9N>EUcv5%Q{XUXG>VXb!4vS3v<;aK<%2q{r#?jpO%y_K0XjnZ{S`9L8rx#gi#tC2#j4hz?t~}O9aq{D#NVr z+SI4?q~XHT2^WLs2L0S;P};VZLim9Qkgegqb|joQSHz5|TOdTgmzMr*@~wpOH3Ozw zb|Ngl+E+=5h`w7J0z6L9$!rS2V4Cs=1W-74-b?Es!CSeXU41Rz8BVFqP>LOTN670} zw7i~*&gwyDXuT|<{J*5qSJ+JMot!BoA9b*{uF)AMPx|n>ogI%Rqm>-xZt)2dDAJF4 zOyErM~p8yqcGsfOU`GbZ0V@IG4SZ&CT!a;{F=DfN%KDJ!wM1GiJpwBmBDnk?; z%djjfjf@n!U0q$xeq~|M4{cPRT@e#DYkrLQq2H8^$U*6KYPL78znXi`km}mY*Y9Uy z#beZ&?PIR5j|;^MO($kVLk_s45+%CW^@(qw(pN!r9*JHvtL-mxPAl3r{sL=1i1V-` 
z7W;V^t!JymhAmI66_i_zD7>a7c&u$gt@v#XgT!(oQ0W?27`=^{}mj7Kd8^b$6TqJGS=G{bHv~x1FE4QYX7|Xd9hS{pC5Ma!PyhMdun>`{C2aH zZ@gjemhQo0j8C5_6DY0@qV@c;yNM5Jed?h1u-6#V-OF^Vrg$fO zlbkxePWT4VWz%w_oHJpYm4U|OV_IM9)O$4#Q(j%RZ5CJm#ywbb`|fA$I1;V+Ip%XI z14d3!l|Y(LqHfZsi+^M;>QjZh%{Rv}+q}>7y_$2F6p88#ca;YMOkFrERG9{T4G68& zB$Jc$_Fyl2qXi^~_iV_SC6nQ5Tcq_gUoS^XRTg;L2Mi4jMd&wDpr$Juz0ErUMaCtK z6ie*_!LSSZ8xV^Abh_a5uZ)FxkXyB7%oMnQZttOSx!XT|;inPG`aZaqM*54N_AREz zK}nFFEX0iABlkV1KB93;A!QOea}2a}-xf4c^ZNt_afdmNA#l;0I!)OX7Um_Ia`u&S zYqlWIrm7D53w^Jwt0Y!$S3o0%9DPfTjGldU;~VRG# z7kXs3&p`IH`a!4=!0+fvv>7Yo?v3{EIPAU8@}VGQp|ZCJJ1rzRr`=)J zLy79TJHJ`Si2M^_8?XFj~ACDpX^#ZSz_u+X)wH5Aw^A~4e*#%k75QiP`k$7-_Z zZ9m+%r8<(4Y?W9tH#b+$NC14tNI)S9Cqyl%Ai)@Q{Ar_Ze<89!;;4YYXcwbGRHzmE zJd3Xib*sZfkBGaPeJ8*%gql7kbORUl(TYve0)*$UHaimS>uRz&rl-_QD{#?Hct3y4 z5`kkgq2#NFH8U2f{e{dpSL%e}!Am*TC_<>ZyOagx>+9osh2W%qi3oQu zCQPFExPpWQ$p#*GR-k7!NuJW9hHlj&MuBP7ay9YVJ7T&HMUkRCH7mr z-u#&_g2n>h;(;9LeNra%99#G6 zg#4FH>uu-pUZXxZm2*_k+MAeN;dt05?*laIRImWOC? zKd;5~n7IT6 zTD0gmQpR|o7~pA`mKkm9yrr;a>#M*`ud!quCTbT1jL3lv@KDtujDn2ymXaftH*$*T6oOMw) z|3>mbUfiNwmhNp+lfDWgIwC3dW|Li^j?V0WQXPEy-2g-h*ZO#_^?X!|5 z^2-$J2~y9HKxxbB;ez_pC;LqEf~7F;=$}RPOUklSqcd)=qHi!LL?^mJ=8?U zWprvagK-ICcKxLUk#UP+7cD&8^DZ)hlwn&^Z4K#|x4p91%cvkj*1#a&L*5vU)*?a$ zk)x^6$Ufe9G(W}7D33iwOl+c180$!Sfyw+BvDBSNz)U3#p_CiI|?D~2>Jw1Ydtnd>(qUg6!Roms9Mqj4$Ec7}N z)#h_VY#YFGb+D>-0Fnf?Abc|ecM876ggaOkbkODttgOxMlvdaTu5j^p}Rl&F@+YyJA7h9~{u1NDRuL_|p zx$9Tpq7sB|B7>KP->73F5SD?#OLWl04?V9`Sfzpe!Eb!RMYEr$_;RAYUT!abIIjPY zq}5)$wYk>t?!vQMgh+5te_?WOiPFy#cH9(CN*!`9WBuB#saf>9S+`bKr_*G_!9*TN zMM_XjXujp}Z2A^d7pC~{47B6%BlNt@XlSE)XZGjh7g*sMH~YfKcS8Jw0xT{E{jho- z)-~>re=n>-@s0iAmq1~h=-t-p;#C==yyjuP=+L7O<>(|^6tyCTF8$er;7o}6C{wp94Vu}reea9_RbgUz$E~wp%~4;c}sQ8{<ZbNL=$LeQ?7?-kp#fe#s@0=+8ZGW90-76Ay) zi`D#wIyeDSP(KwGj-fIgZTV-OL{f-R`WqT?jjnlq*t-Y6Uz#eUPO7#bK+!?|5qRQo z$wCoeEMJC0T$lZS#0W&8F-wNg6#QHNeM_l5rwJjY8BRZr}Hk!WL@0+)~ zRfw#MW{;{9S-BV)s83$ia&?|P{G{gafFd~Y%?^K~2Ex!1+fL!6Bgeo)YV4?Y$$Ol{ 
zXjC20&$W`&yI{zZvsnpy;4JP-gi#H67rH_3diSDd-B>3Ape&t3B+q;{m7RT#gJ-+n9J_5|E0_E-7->ay}aDr6PbjUC3# zi^Ha>KX`)NgmJ$FSKyD4@81H^A_`)`-c{-(b)+tW3h<_0*pg~3yQxkibSN@{r*TY} zrCfP>b@7J$VZ1Py1mIgM^v}7XlSqYmRAA2n@&J^xgHkfc#;?FPgT#Kum+M7J0_1QI#1B z^jap{5^SHlkPu}lMU!Gqi(`6{GBVcMF|r&-LJSDd4!#A=(FtT*+7oWR$+>qO`5;`L zJzeAfwHy06eyk_u1K3z9^ROzP)qMLz{yj>+#D=F{*y68J?w`b@97kk&BUC0*6gKEs zi|f_-cJ@Sb*S19dfm8jR&FEYFGinT$C8dhLwwKQEl5>LuB+tiDm~Yka<}KJBTG}|f z$9*Rob%!U*LFFjs;aACQ$vySa6TD5Q7_MBo;`%ZmASR^)rS_CPD?Q~I8BKIn*0>6k zCH|_33G18~?Is4Z8U{6p*Jvqk@jxP67f`YY1FOyltI~wSG~@y#ygq~$CDoL!-x2Vb z1)Z{kMr#NL+FmvwPplSGp!jIM1N$VTzVN3ZJKl@RstRcJL#Fb2bk0i@ROaUUsCl+s z`!;5}1Sd@ObOMp@=?^{?4m_OplJJKLQOUq>M5ZPSHIv|q)Az&j!d8h2YuyS`ii$^> z>1BvklM}I6p9Bz<&9I`h8j;1#s0xCM5k10WCR?#Dn(HA@+5mOW4}aHi<(n?#!(qjQ zO%b9SG%=^PZrbTvd`F^s77aMiey8UT30+0_rui_7sF(3Sq^Rx{V2OW7UJZuRmlSm^c<71yS2#CK?{^lYG)UA#7P-rB_2 zl(Ihk&dtfm8Fhe5z!b31@e5`)-+R6e0sQHPC!=YiiGlp)%l+^t>~W6nPc$&u9fd7o zu3ik@ebA?PCA=+KK(z!&zRgF3{xp{SRqz@+74j|~(82;fJqJ3Jk=2rZYnG17J*+zryHD5 z(w`_8mpIk=l?zeqEast?z+S#3AC)dtoh~kfx5MB>2fDr}i1yZC*_H>9@Dix-aSZE<`JnyFCV{)!-D=)d4RqIxQeD;TPvR-b{}oevpfd-?|P7PSj*Ul4|FluN#U%=;H95FNn<#?_36YAK9^TGI-Dp$Swl9wcVNEW?XpL zMuD=nNy=0SI1h2RO_yz4n2u&Z%wp2s9ItS0$f&-dk+ZRRuG!)_2APyOafuvX4 z=+4ZIpzasp z%DRN`aS#%YbwW~(C)@SpStDCT5BN`j0O?y+T?Aqh0`f&Xpy6pmV>~JV<^PJVUKleC zrVazCJ!OM-@r5g1g~)1y*+lyZ++p_f09#4v5w+U087$=`h$vCwj7(I5RZybBRR{w! 
zkR254)f8xjrvvW}HN9|iaryK_UijvA1>;887W{y#kOjvJJNzFBIh#BnDi&K~^htu` zuCvg$vM)|8K=5)x)G;f19i~>Likw) zbhs)3x~sQPho%B3!j1GzU*D=VnGUhZdhLOQ;5qpfsiCVuE*Wh_qYUjoJAY`^nKUZ&r8z2OnaRaI5N z*b~)tm=;j=rVMo0WkYM5LfvPhRkBP%RSEbeGs`x!op9uv!=|D8gxEq-d3Qfn;$raN zX1M;n82t&;#8yA?-;@eLANSU1yMn9yUlUgjXRAsO&rU3FPi*CZ)+o_j%)hg9>DkxE z*SSjDxSa439};p=X`v$AH*3u%lK`Jn{ETPy`%4SpNOw-O|BNHs8ESYu*1xAOoO>VA zdQddTW*xI*0DJjiK$|d`+x`Pmr1QN$yNUt!G*=g?blUA3|J{&$s7h=33^EyJkkarvMHIgZ4HIp2mt zMPRb-OUokEd{%z~_Q#RBIFACC{#Nhv${{J^lAQIoJR=4ezeA#%|9(Ia71oK_8W~dw6F1?)B1F&(Y6Rn=aH( zR^Cg{Vp^2;K8)1*_DjLInHO>B**7t(nICg4sVza zYT;VYXYOe+!@rjLEG}>Ja^=1L#4=2^&e<=I0`vbI6e6Wky25ORFEbC(0el4AssMOnQOP2DJLa z(dJfloix7ouGQu{J+*F*sM@;EM>R1HMAP$tyMJ%gYDrt)io1K^^mvI@m*zhBU8kTV zKTL2OXT*FjzXIE$$O0EcCY|O*)mAyzndtPOvFNWD`(V^aRcR0&?3UPBUl;7ojIp4qpUrx>~cuY zfj22Y+tm*qmW^Us@maRBbY3&=*FUYjmEIWSN{mi}uCB~A&NTHW0Wa4+hAJ+PfpTy2 z8qDcipgy<-wf{KA{u5pBpch6{ZS~j08Z`pKyh+C!INsx)W1(ftA+xwkY3T$)uvT!m zvedf(Y$c$QSg7wR6=WzcJWMIvru!b8$DbBs=UlSAfyBUCePm{OM8k)?)XSPcL?1`9>^C+$4e*RO>_L`N%cd9tcN(ziP7J2Cug;w;-m` zL$(4~x8TT2(z&kGd@-_c15tpbj78(3{b zNSJ~LYfkXE$iLS38YA*n(mDYfz%kuBZ|{TSST*0KlNls5(FHYg$Z7q2 z4@3=`qttLN^K{i1L3Ffc67-0wDs;6YGsdVs#73KI7wI@KuM#ZlZ$>iGzl~@gp!<4+dznHqNj@J=KB)$)ul+@aZK4_K4{2qAoCZj8 zoY07BEj%-NKP)g_BO|Rwf$5KLS#;^(3NYQ@veoIB(V3ll4(md$kv6-exee(Ms+7Fr z+LUwYGcAItS$OZINYXEt(csX;dJ_^i`-Vmo%Ci<>Idpo zL4s))PG;=9qC*8hT~GF1*U5Zvn*g=n^SA>?BIr_pOGH4tQfn~}&~zeSuLx((&biV% zOaP|uF5BWT8|PdwqrB)+VW%BKEC6KWskw_uUq6i0IA@lVk8&Q<`63ndTNCOjnmiTcYp z9?ZwO7kJ5J>Da!`Kd;sITpJ7kJLl0qCaKUx8>{84D6SyjQGZ@eTr`+aOwxfLF{ z59Cf!(N^rM=}d2SdYG5^Te3r!zQEU0*kQV&-3aA>IQubTsSEL{gic`Pl?9QZ7sKa_ z8$)P-CgW?-Gg#3f!feH^Z!2^niVigjC?JU;v<%N>A?t znbFam7P(BYiJ4r^%p;%C+zF@uJUDU~TUAYTes4?r!wA*a-SrJmj-iUz-|_nIRQ%UY zUu5$>%pG9w(4{;TU&y_b33&0^Z)Zwf*uEBSRgqnbA6?l?k1TuW_llH)2>-BVO5|C> z9`PiNzn4>d5ORPVH z6L6$sxO^oYnHKEh2njo##-2q5^t9kiSD|V49W8?-b@%xaxUy=(23>}sPE1hp)EYy+ zD6D-1J|-XOJ|~m!h&pU$sE)SDq0{iQVXITWaRFCUv@7JN695PKr^CPzk(t6#gLXtUFVKg{=uK1%r_2x1 
zr~d9JncTKC-4j0)90wC(LWpTD*2RnKkF8!kjlD@pKZw-%uKruINE{vzw=x)mPdmdd zBh4~m&-`O%Ai|Fh^Z#qf|J;U8LFGuXyS;v%byi{dqu3M0!<_F?JlZT46PgT=hqJG9 zDF(+oeZ|P|>M_!ZYXp-U?uTkLxV!z2%T8$TEpvWyt`+ZPd@1pbM%cM$-EwK>_N73Z z22SiP>&c2WLE&73FGgWS+iEU;JG1-cP3}|s?(Thg=1jSQ_`pu_flK1LhQtBWUlk+< zkPF-8?_Nzn>?AKMvvZ&1ZcY%%{XN+1h919~{$RLBU}e?custLElU1_26k?JY%Dpuu<)%~90GMIj6kJQ zQI8NlFdj%l;k!{eNDkVn==8=Wt&CW@G8)wgL4;-2;5K4AwHXUXKqPPPF0cRq730_` zbFXU%snGsO%kTS_{mac|mOJ{z><~(UEVGGrpSv&JE?I`DAKCV}Ya|#ncroFYqJ`N6 zSdp{_p{mYn`TMi?NS6LY7R(%qajp~y8x$+0IZ;zhoC14f<@@*RS(frjrX7P46=r_2`#v4DeUH%K!XOF2eNEsXhU* zogEzqN|rinsLIpl{y? zm`LCngaqJkm(ooF=;B33&`|}Eqj<|QQje=iCE;#F>P%ues-(lx89|!F=WuQ-71F`| zmR=sgbfo|uNgTp=rS^-SH$tRI?RM!rF)nXpnr+>m*J5iZc|>&Q%zF!2d-){?|7kpm zAZuu6uloz4JCXJ|e!b}M7ytVxgIfPsrsS_MTYW>HqJQ zga6aQ!R4I&p#1fA>WAXP7z#UUtCI#Rmrk6$uK{hK%Bd}q{%;O|Mshj!)5$f5okz$p zbXK<761%xrb{YHIs?gsLq6wj!nU_x>#B9t3c}+q2Ak@>-GwET3iGG3owgt2>rj0g8 zU89u6!q{ngM}Lsgrir{4t34LH{^Q^b}hs6#I?_Oa7_%2@~!k zQf$L(GsP6y6qWw7#!4aspyt4$<(IZXMTaWghLYA%B{O>m5*3dsdG@A-b5aPzgGNxu`?GaHm2n*smFca0of8-6U#4bVJWaFE^Pk7FLR??6Zmpu0OxBuyf z@+S<=X^6y`oqg`nvrR6?bv2cB@drXKGHNp$c)a45Jkahm2%9bL2m_DD;x{jMclY(J z-G{Vr^O zrWyR^1b#mGVZ0OVzN%t?ZrYQ*F{7XQmkWF5Z~Q ztQ4y~crBWdDaR02qpH?ml^Lxa_R#x^0Xc+7t`o<=9c=_?fjmNqyxJz1ugvaw{LoL?oLi;L<(b8Pc8wnEjKwiS@~cP zjs69QbFXQq9^om@>G!AmY5iWYK)KlulZ)3Ld(uJ72e*b4mtv0!+e(k+_{CWqvP5H3 zpGJ_dbAy#By5@zaAr9k*R^nLDbDKMP++R_qr#UPxo)AA;wmj&6(sw>DFAr1Q+0!G6 zgJ{SZ6r+7!=na{;*}SXut8|IWsoYV{8$~;c7p;Pr_wkC`!FfJZG1SW&%8TEY%KWb| zNMF0E{mJNsj~!_ULvD7Hg^P@g)I-->YgkxbIw8aqH9H{M%}XwO(wFAi2L`VNX)mXd z2ushE>tD}r;rmrETa}l04qy8xyhgH}U~rG=--p@=qs{adhJ^j-~rHq2d9+okW;>itduV7X4v z<8Z5$0fBpQGr!+6%42=58vQbVJ%qu{8V7>Rk{XNnUZmk@GM9W~%ORzP&}9nbky_zFL#A zjSatRCpl3D1_oRL?Em|?{^A|UkUHM>q}etT(uUKB>+E(;t~0MrrRNV!+AT9Oo%~Qf zIyyQnPu$8s7Qa<&pTAZnHd_CZ9vJ^vsq#q%m>yBQ86z@l=ip%FN+EeC4d!%s zC>zr~ToZG~<1{=azpvazrTww7iwvfK_Rc^=$A;c=5Sd;#`Jd6XLHY z_1D*>2b?rPaqYXuyB-!>bd(xTtAglbJJswH>(i_zhN0dNxwpxCU!Poj!7aay!C!Fp 
z`!k+RIG*G(L=e`_ZrXC1LfPt3i}Ih__xG27lc#JjGWk%}6t|_hhAZsJZ?B(oc1Ule zww<`Vcs1C~)yr#()Vc$I6J9Jm`B5CI!56<0WR90Mq~5Ezckw?X{xLX)GLi?hlyx1x z8P&V>oYW6&+sO5Yl%*Kv6xx52P2kpG!s`9qbpP&8T5l+CnNSb}E>B-q>H75?%)+o`k$B1|31+-CZruC~IpOAfo z|Kqrr2zW=*YF^m9(|a|R|Ke>A8rPqCdS7$ezE3gmCRUuRt=ymQ_eObd9^?Mi_{vYK z`Ag<=d$%)EM@t!Zd`w`dz-ShI{Gj$94@Oxh?hXgxdFy+J%99WP#$V8MOlS55j-o;}8!EdAj7AU?g&;Mf=c(*fM4%SLO<7T)xal)E@M8=um_XcFAF0Osd zG-!+ZrAzGR1^ny8r$?6;bfDOqy1TolH#Rn~qC4+YWI+9F(-+Luw2$R1Yx^z^Z-Ril z^ku()ziat>CRJqwaRJhbgVy!>OfT(ih7s^C$)ZqT8mG3}S2V-bhf(F-|NYial`EY) z`cBs6JyS1ODms+(Ri$2E*Wp-r4R3^MTuM3W{=?$F7=h~&yZ>wVnTd&CKk&}cD0R$m zpEcC2-f!vOw_M)Z&$NF+ep{Z!e{MXCx88OCf|NJpm5uA?3HJGuvb5&dWBzB(pC?qB ziXOMvDE2F(D`>!?nXXElVX?Q=yTP=;Ai0h~wkvhS^Xk=2plS|Ni9T_sE zc=q(Mg1-NJTMh5js9>S5e8YIR+WR=8oCY-IvT%n8uON3JM&`+pKV-~m>GWm zB25H4lxnT>Upr9QTiE>b$-v`r(;jI_tv9M*Q7sL64cAy`|nM>jiD=t z0rOxhy=QucSCGEmF0e0Su3J3qyB{QudMGy-CATr;w!^jfpJejS`h8qxkYwOcJ9k4> zm+0JHskA%Xeuoc6Ge1Ev=NiM6^#RSf=~f6pmRGM9=a-dP_U>AQ9sKVP;r*#RCsXgk zWC#PMQ5rTyj-|3LY2`?EK5~M%kbIzd zGJC?g@9~yP06$NB1_Zj3S;~fPaQW_$18%bK^t2rOh_BUfk#QtGJT40nuiRE9qtz(m z5%0V>el;U1w2kw}3f+xRcK;>c;&>$K?%?C9mPrpZCUEaf1ktws$uHi5+uw^|LpE|V z@XLddc>yW%N4>+P@PS!uCg$FYYvnmAXynCn4iAWI$Ki7-GqtWhI*|IEL5O*o_$;S0 zL%nKE+%M$+x7^7BugEJYljoq#{1rcGYiH*?Ln1`=6*^g6E5l;3X9!1ga&xI>%7_5O z(XBb2Es6fVQ!idGdgkTl>r6ZCJG_4}&H2M!)wn(eLvKHQbqA}?h`0?Giyr4T7sNts zpnd0t&mnTI6XrJY5PNJGd~2naLziSqlIBS{p;XOB%01f;9Ba){zn-ABn^Dq2@BK8H zF!xu2Q{m0rRj+^J!1pHW9cO(-t~U<2;s7J}g!*Buvc*?Z^I z-k$Ndm(2_*37pac5!5f@i2sc=x;k+Micb24nc3OwU7=;`@@}(nuG4}7I;9GKX#sp( z*=zw)9JT3iI;!a2iKVpYC^eMQ8>6Fp_wE(NZ^*;A;;krv3C^ueq~S3)C%pQUD(NN( zJ0{>4-b7R%914h~i`?=Y7&mT^Mq)wdx74kcj5pt=J5q`fiZNmwJ*F$fp9Q;nZTP~c z9`N{M&{^OrySXoU9~8xqugllBPpn@(>#rC7tNR1_W2L6aau+SU^L$gg55(QGQNP#C zb08hNdH2#!N6K4@`Ru`P+tW;4ssnl-AIzVzGhszwqDLql6Ya|2~R)*mdq@9+-J|1=I2LLx(_|@@Ypb*cN6@3lNPUJ z_U7m3yQbQuHYJ_5ydaco&vcnrSBQ)47-4J`4)DVDXV0bVjkG`h;_}w-Q5@;=nS@}Kc!48~aLd({+kBi4L1vgLGUf@dGy)8ZONoF@jEYOK`PxGe*q#wPL 
zf#s$uFRgZ;(mrI6UL@GgQ|{k(!gB2`jmr6ZN#)AI^#sU;=hDhec$#D&rtyG0?;-tu6I9WV=mmp#jvGwo##Drc1!bQj5mC6z+XPm=}nP*K-aPxy=oh+p+z! zzB-BbU#T1d2vs}RU)Qp*!m`lE(qKY!NZ6Q+ogFw&*>IFBjfshI?7z8vukpq$X)lH$ zCIfT1Q$-rvNE+=RI3G^fp2JzmndY6lIw=oI5jF z4mCc!zvoA>e1#)<<6U}6;0^CkNc+h`?L3UN+s}Ox z38jHv>&lu1C@HI62dupb^Z<9U*WUR|6pyV9T9S6~W)q6>*R>0tBP^bJ-|_3K<1I<( zRDP4pM%RZAL-(eBb#-$yc`0G|y%-jn`qS%#xYsckl36Gj^>`(qkx?KFPQg5XDXyeGVtK!7z zREaN<^-k&ZCLy)zt{n8b!uJ7Xw0Ubq8h@y0Gr2CpaxcfZvr6qzxerUID4BWe)EM6( zDOlG-2cUlCdT79k*gU3q?p(pC^bYZ&ng89R%8vo=;Ox-UKjDDI%F!P)0RIc1NXFZqe;TU>-h=Z4tdTosI_06|IN(U;!)& zsCBy@OntV(7st8qGImqMgKbuGBTRHLvDSU$h5JY?mKdvqSSp55ra7+3P?1@W+241r zkVvQlzY|(ry+(k1Al$f`GKeZMaCR^g3ijkQiqlrduLr!B>|S%s_Eq3EvKjEfq~((C z6q_*!Z5iF3JEqFBbTuPa`YHosqu58LcOTF@+^aZHt;(cV zrEgtj(d~nn?yvtAu;#AReWeUI1VHHP9(h zkRLgTbhbbqRp^+Qe~Q74Qnuw8E;6jvIRire%1nf1>B(-`^!qFP0d{y2q2e;P8oRX; zE4Q(&WU)x;C9VCc1Hpwr25+$$f-3--R~Sa@w4x6?lx?_BKA%zAT)ugr+P`{P*ulSJ zZWHEN7aU(97_e-;{;`|0D496Up7%bMjFW)Ho9F0wmpTfRp66KBzlK=y9JC4d`^n6AJXr5$Cf{a;;}^SvX%e2ma8mgud; z3vJ)FNDHQuA0N$<7!bI3?DU4hdi&~!gB4LOceM*L!je+YH<-j}>k_Tf@06U_^CM?l z?YqlYB5lp<5dF#m9n9crW#%Y{@TN!Cmbv2YeJ4|e;VE6Kg+q1a7kiO(*VAYFSI_!N zHVgdk%8%ir^K*0C(jqx!>Co_GawKOE(5#|IAvnCKh7HRZL}$_tA3b^?XinA#P1nie zk~GjGaVW1JF_@{3x^tG~zR;kZs>|u|;hF{GK<%;O#dUhHsq}!cT{1v^Y>Y2iug-av zNZ+G+(Te+>Yp3ImeAul%8RYj-}y4f^7{!Aj6D5Cdf3|7h?c?m zMdWa{Ey>3CnA{~=o)Hz$9g4g=KW0HmXBl6K>(hu)(UdSfPe?Sm7{YsB&^x4TC#FB# za`Ttyy}2=d?Tz$cE56;^+V}X*-OiETHJd|Pa>_FIgx#;@udo~H;?Y+Wps$HTdJC#p zeXcrV&1I+S6|$rC>AF|L<+NeRLewgs^0flx6r6uC%%`&DzI+vs-4uoB9NvZ}~ILuKnNcZ*fu+$KO0tCg!aUPePnz2e>L zhd<&KZSIv~J!0*feqKLmSfln=9EqQGpFs^5TE-p-ZlOQQK5%3&l7rsKgCy*tIWl%f z8*J()K#C6lo5K6xRXi6T}D6jT1qa2320|!@Kt}y7$e3LY+NJ zEL2*WtBG;z^cavVb5|m4TTrbP*;!YU*)&9Qmw#ASBIs1SI6v}i_KdRVm;sn0Z$0}? 
zc|JSX6*soOFj8<_mA2H$EMrG^kV7W~w`)_gBTGG5zCc6Wgl7ZNqIlmI^pfgzf|?Kz z)fn5<9YN2@=Ao;4Em7ZVGCNxLGfT;g7`FvSy^PVkY;ZCoX2epD9LfXWorxP?|LS2ww$DCFR>Rw`wf&XN%26&TNAUz%Q_EUCX}fZo-?EG5EX0DO`Vw2(+kX}I zLMvZMDp(J@*_xw%JQ|8!<84Qrih~Nly3|+>38Y9P1B)pvg3{s3dVPhKTf={J`Yw@h z;Zp#knD}D7U0?*xnNtWEAR$rNgt1MPc>-$r zlTjWdY64`565TWg$8x*we69EybY0Xw)tUd-xLfha^%`BJNlqoJHW*LK`X)b3Dbv7* z1W4{Dy&*uPXfe=he_Rt!Gnc+2lqWp1rJ2_?oB(N;EI7I`HGB5khp>1Py^X;YFqEzj zDtAOk5^kxl(UY<{GuCDjOOE58E>QVX-ON?zX-Ij4hDSY}YVU*4Q zVuAy#NGcX*!EDL*o^z8elSI-hmMg|Lvg&tGN((U)T*0n9PSqG* z;_pCW(g6UsMUS~u|H3ArG^sw9T7@-QAT2_+}tjQrKCI?rsA)3&d+0Y z6p$mSM$wOyeFg5;q@9Wn#1!(pDWo4lCNZd+6Wj}-rIqC!WKAv4ZR#hkpXhb@HEr<4 z^J|Dy;Mar-_np**to7=+q8~H#9f?$!D?GS-)@UJvFY1?1`Z{E^^Os%~_$un}O?p$< z9MSr2N?LP{U}?H(rYwiG5s*g;pQ}_Ua(YcaoLGr{_60K??(c1N4y&;5JqI>D5ujoCW?Oq7)U0-~&p646HARtfNx$rGa>j&a}o;!DL z4zd(MC9#a2sl?gOFROC;iPeY3&X~&TvwaDQX>4q?dreKSxm-2;>4dhuOJnP4hGny7 zi=|-mgL&(N-4Et%;;&rrD7(%bvqWK**Rya6jIxs&HWnsc2;J%xl69pOxfcjq z1LBdAPYgewh}oQxU>}%XmvdFRfp{s?XOqerKay#0?T0D(wvLW5^k*(T712wqy9o4y zWm8axaaHlx_qTixB2ied0hQQ3sZ$f^`&Frsol;+sgAmPI!P zF`Mc&!I$95(_6B?c@6U#`}b!?`eXO!MWQ_43XX=q#eR-t^kgEseXQJkIQ?9|N0G_w z(R>n4ZWke3>AF4&9Lc>AEqj*1O@-UfaEKdw`fc>_pW?)htMo4)=ZsI7o;kavgB(&k ztzy>xy)|xEFqfsecQBWY`T|3qM$fU@KbB5rJ!o5rvz_87_pka*LEGp@t1P*>Z>*6r ztwP%A@w44;t|5J=FTr@OMsl3qbA6F9Z{NP{l|6}%M>?>2iC-&1<@)?OS4ur3@ctOR znjr6wY}4^hn8Kax?}gTX=d+9y97?Qkq7uFSL$|EZk|+GT?gJ6p>0XyPl9P5u@ed3> zjRQxZuO@)iqk~I#v)zZO(3&I2rgF3Wh%f%3g@4q z4i4ISO+mIg<`WQ5DVsu0B#!c&%*)M9E&#c6Ix3&3o}ndCrTpdw*_dj(xpMCq68cNj z!a*+ObpX4KyuN-6rijXL)-nAI21II)>2i0TxTXtA}j_ z!)|=0BT88*+iII0u|6O(w_zoOT6-R%qT56v$kRU{De_4JSo|3h(BG!@p@&kOfq%h8 z6KK5yszYMvhsvg$_7dp7B4rqeS2^%CRH-`t9wt8!lU)p!MsjgrCs?Yx=?$h0HVpvn zsu1Xhi0g@!Y}1w0gu$mf-l;cC>6VBnBy}5OV+6kvi#5mQb1NLrpJN1Z^@EOfhK_cw zGH3_xb#um!TJ<-tErGeLJX;S7rbMEACpq5`7I{tQNUZ%r2JKu>Xm-TotA4X5M5Wkq zgr5`OD3?u5b3P$Bxe%cLpB?v_$M=_bDAcjZAMu28EEm_l%5ZtXkuqc!fap#z-X0*` zo5gMum>DQ%VVgjXWnh%C(f2y%TV9-F>LAhgiD3un`gFAKM?T(^Twx)rovQ+d-M~zr 
z`jcZo^_IrqZi|o_BMTN*tAn*Vv0d_SAAgn3ePtU%v>s`0JD6@o_oVe;Mb@1QXMGoI zWYe3AKlnB8`N|>~b*xHp^?B7$_v3cB`1VZt)W$CY+!L|#cA<{E4Uky;vNo~EZfbV# z6c(JJhmk@zJ2^=pxX`Mf9u1@|R4igp&7~(Pnp3er!WI4);ry6V(ivr6WE8z&*`yg!om_mp z^lRY3z_W_0{e_d|Tpl)BN_{0JSW(*Pd7g!~q_O>e20k%@6Nyx&SGKAY>g~-aF2yho z$9y8tmxs3{lj7qsdd)#&*YG1GXqYa`0o*{r#}q{oN3@Z?JD zi4!MOdBFWaJ^=ynqobq41_A&RR~hHyV|Y+gtSMZX>KB4HGa<_sI0tPL&63rTbESavmtH(X`B#HzCFt-S}jDJzDdIlnywLXmfi_ zrqRq*jKIUhgm4h?Xo0dG$B^Clp>Uz0mCC4OV>QuMRY?N0C9p0W+e>f{zJi5tj!^98 z)C7Zcyc@hZO!$arUkPTE%bcL6nqjPxp_Iv?QV7y$CS>+Fx>jiYn66ny@k$AOvR-OZUxB)7_i(^(h-uh$v7_s}cK|)RYfOL854p!y0PA&9Q<&(l}t$Zv7pV!3m zDX8g%y{`*l5s%$pG7APm%KZmR5Q8kc@M{(Zk%Ec9SqP%X(t;2+m0-ycPUK5*bJ|H1 z98o~DGA2C@gc=eP3aZ3Hal|0HA;?uAoJd2wM9jVUwUA#-O?e`>_^9iG61~A}Yx3sL zG7OPO^1MaA-ird(4{`yRg$5;)zjXx}(l|#Xam!|%>D{q8Eqcexl>1WsidNb9QOP{J z0=_>fmt~ENo1wp-Is_$iY)vvVGc$n|P;9ChFb&m9FYSxwlH4bt6xA7EicuC7940^o zC|Hd(*oJ9cLlExqk2>T0FpOUl1F@fd8NaG3+Z*mBKyXc!O85w_>_PzX%M@!t(B1Y- zUJT<;qCxG*=D~2&6TU2Y>gI(`?oh9)yI>i41S_8GUJOm)DEFZa?|1KgXko-S43_?N zB1kMunRZf@UE|mmIH)p_%R_?g=MW;sAL<>t}lNbc`~L z(W0F}ew|>l>IIJgd#a)dAn}FKO^7ZCSPC5L!(;z;lU<==MIJw_`dTx2cUn_y@g@E9 zdmO7?)D3w>49qonK>jCDfnUqw%i{M9WwWLQoUcGwdN$PS?FgN`>EosXj98CIRTY{!v0uN1JV`A{v=gX?-SA$r6VJ%khEN>JGN~2e^w9OUt+Kc z&Llk(2l1Jg7{(jcm>BT1a1ddRa_v&mvdtVqMmOpSC_!%+@r?8@! 
z!hQbUlU*rzEClfm;73%IMQ60$5Llb#WMwPCxZt3EbKWx_kD(zJf$ug7HX8wtUG@!> zync#>24W`*cht*0OX09*8h+JdAGw$3Xot3m($Rp?O^`Z_?BdB1H5JVHHqa=jrKN2j zZqU_ELHZAFD0kGLp$f2L#v{cl%}N z*8HUD-z%0D54%((P1mIXapH%{q!)yS%aGZN5Qdp7gwIN$EIL0Pr0&MllqQ}^sO&9t zgNXW>@M&GDj2)fo6X22RIEt3v^>+um9iJkVat|YuY0%lpQ!KPr;Rt0DwNJ-HOaBm2My@xOdysDUz$OB3P+oWabE51KrE7xVK+xC1y@~FLi5gD z1UQ}hGBB|_LqWY(VjTKXx}sk49qg7eA-(C<52Z z=@p?lcgM@10eb>bU+=#WT9R%QYw1Wvn!)()*knJ<(xZrVV29Ao#+xwqg$s=*A|=^@ zcz+fzp`9Purp|ZiJrjk<4=n*A1;r-QVg4R0{pvNQMP4nbk0QaUuW%BZxEOL_lWJ!A zj(VZ>)923x%s^u0v69a(>Cza&DQsULmZersUOrgaG9w-nFpWe&Z46bB_;YWw;BeJX z`%pHy_5^Ce)}$uI(Zkg)pUig^vz|Drv+CwE4=4s3bQXoX3K2NLh;)ln^$6X<}ob}pmg1t!= zM7u145h!dOgHWYZ*%bY-t)1N^h%et!Hks^%%GjU^J@f>)kRgGVa4r&>ShARW$@-1YA=+BTxnNJRf@q&Fo6o{IOOZx(m~? z`0pUY6(S;zCOBZ(FC!>ZED`aW!GPdQpX)DIbi88iSPrpjW{OdF&J*od0dgx*Qe zOLKqfa8>=X-6y_(7wc+m6^u=e>ztW!h2%)!>@Q}^Bi+J1h2}j44D-_}`TXo>c$gr- zkjwn>;#~^(3EwT@eEh_TWR7cW&|WJ-N)AE+#cv--((xL_UG-` zpO}Qg;s>91f4E%vFfKNXEJKisD#qZ+mi;P(OGCuJ%Lz~KgGkP9_(+m~8f&&rQQ%U{y5n)v3{QlF6f9B*P(40XlBKR%Qwg2?)A-|7A}r1z&lXqou# zZxQ7>__(GvcCGWCzanRXDy9p5aoR#Z<0&q+#t91}!?H;55LEVPsoe$Z6xn=Q0{O0(aDri%{Us1ETw zpM|j$8pG@$ZBQ@aZ#`bvYVRe$v}e`@)xC%{(>n=RBHYE;AjtU`KHW=5Nis-JFo>4N z4;}bndcv7|!RW1rz(5zFYq7MgqpNFz^x9#IYysPnuXO*Cz??(7aSi?Y$B~Hn_>DVq zRes(?LGHNY-RY@yO+hii+g3ylQ`O&N??V54v*)!5IE^5Eo;E;%zn)Y+~TUn!B7ac@Htm@s5bBn^5 z9=B%Cr~OJ-aD1j56@wpJh_2gm2*sp?jFEAJT$AnY*-$CAp!5;C67DLSe-@@??gGkP zQW%Rw-Q1t~K(Ie?_Nj?H{Dv#_rA2OQG-7qOn>Wzx8+3X~Xhy4~aLI5hLPO!_!2mwj zKk$ov(}ALt)Ms5WAE^oacb@on{&%k3VLR2WB6XxpZ(C)(B{#u7gp=vJ(_oDv(f*eLxE!HRAqGB^XeH^`JfjGap+e2 zdrf^ZA)ddo?HhdFwuZwvzLB)*KGIEev$`!F<>=CeCf4PQz3p#EAjQw31@sK1PUoH6 zRmz{*M|`|)|7nhi3SxBL_?OyO|8>~qzRnsPh@RYPsS8S9O~ad#3%f;4FI|OHvPZb8 z@sczo@=Y45T`3N@yv34iz*f=$`Q$|o2s_@Yq=sNc_04CMg>{T~G$al+*4~WozZxMU zOldc;pNSLfK5V%Vr`)!ObiWKg6PiP^r6V-)unLo66`XK4#+u&Y?{L+tSM12?I)rk& zCfFwF9gX*pG=CmC#H4q9{Rqwa^(F&X3eI*?-R`(*lKRYATX0Zg$;&yQ3_-*wQ36Y` z&sG5l9TZ85^HF?;?A;6@D8}erb-#{}vvo_TuKC*VAWoJ#|A7<3v9CP2r)6oNTVc 
zu|F2B2~y8wpk-1FHnoNiv~+3tzIJ?T`BZRVBU57oLwjOA97VM)sWn7yDd(4m_$Jg- z8oe54A7)Od@U-`m@LU$Uf3_ZX3P|PD`@bn6P6Nz#+p{4uclM$?f_Bk=uX5bAW0{4k z_tch)>7845QfK4W)6LH99MAGA`=_~wOg!_S>|kZrwQVoVb-)n7Uur@jZw zxnn_Af)wUl_HlrWq`EmT#b*fQlsDp@dGV%}J4!;Mk)F5cjVM9U=tS8!{-@iZ%gFMH z@;Q{@)FzWx`;8@QGWD3WbdH{-#)U13wx&xo{FEC(`i^39mh&3Q*0k8B)2^kFMy)eurQ zJ%m)PO(H#h28EAjkTn0~FX$?2fk+~D0w&tt@=%))Z)(i=*;Ak>$h zI|jLK!T?g^0PZ8KjiH&< zvZQy5pr5ay7B0&^i%Rnv+Ft!I;{0A$&0=de*_e(8nO#%*=UX(K0)Yn4ylar8;4&dd zfabnbx9?qx6ti6*x7eLpS|tX5cB=#@{d#kVZLT#-F-U&PleFSA%(){*iYdA8eIEtK zP~nnjAWx^1v0(0|LP)A7BIW(A3>#AOIGzUMJs*3|7|p=8XVPCKRC+>cZ-fvjI8mnf zY#t*YO4S94J@Fpa{vh5ih-Yb2pz4GRV-FNY zeOh+9s-W;Wkd$Hay3u;+Pc}|x*NfRs=pCE%f);T1La5cCIT$nmeYF|13osb=;1unGu;-bQDl|$q%SV zKQPgnV|R(f^JDuAb-J2j9QyLhsdRry%V1ju^Y9A(yXXO%FUtubedHl*1|8AB_uBil zjSV0Phfj9_(#E1^dr5j8qOr-t(4Y=4vLcL#KSOmHn0kks6#Ecn7oi|i7Fwc73Ev@1 zr;L2jkoq3VS=M<%I~>3^DA4(6#zMjs@4juv`-)QZq=qgO>*x2HEad=qSb*g42+o3D zL2Q@Lf-SpBnbUWX)S!lhF-s3*J+h-BDWg)k0Cglc4=rzIK7{hOnv^6){kz>p~5vHa10eLZV zp=tVEjzviBlF&Z__d54)u?$mgh%_PkPKIrcdu2rowrD`?Hqr($UkX}D+NC|aGbXO{ z&@}On$w^iv!ANC7`hyuVUkdHdgyGp!ZR_W9M@}mWKHLE*fb=?jF642B`*BTz-xFI- z+!a+oPC2V9S%i4GL6r`Z?(XhCTP0&ssyW7>u7onkPXtUt75%xw)8Ui7bKu4=o_q54 zhZxNd{o&B458!v3&46Udl5#snakdCVInFgTZ}{yJuE?B{gV2U`pzg{eCh{lsp!}1L zry_l9%+f3;!+&mhk0cyAE`cWindgvUhYXVnr~_WE*iPBQ5ekJ8eebOmulf57qsLg9 zn|v$L3&$VA98>{yw&8e_hLM7iGwH68Gq+#F{*n^DDBxwlVYqAvKX?s<+#Xr7O2~l7 zBM^<+Ypi!>O>%%5r7jF&`dyIq6*4(cl)#W_M))Z_@nAY$hQdx2vxg^oWcYohIPvn) zzXI2UJWV`6MnffI@YZ^!1TeRjUKz~Th&xgslTse701mn_JK``A7Oy+(+%C6%{rp#g z38><}-{#G7N@VcS*E2&)04`xoU$wp=K>0dHS3L!pJa|-6UVc1aBFB0Mm<)9H_{pdK zeX-mRmo$>wUUaSVjta8?kt(>Q0iTBf4D z8|eCwx)mQKge{?jKf_U%n{qAQt@X}Z$b|mqA_%cEPo`R=k?(e913GB_@Y3dWHE%&j z`chN+y2P~*K87$g`RV+31VwAEk2KuD;!Yzg)7TM%mpwZ(-iS+D%*MI%z)1K*2!VCazfjE5q z4U$Md+Vb+aCD-ub61S#!?$*|}AByMyzb5P>N}3#Ff$){HZ@Pl@r9G=_B9AviBb@Eh zR~7aD(5cuD91#y3^C#s78Yjl ztu1SFu^*9e|Bew$c8XQAwSeutt9?(U)ZJX!^5%yf4p!^ThnXUDiU`h>B%;ms zF11yWNG-ea`(>!J>&4k=*O2mQP0Zd>YzX=8f1T?(pn#~N7rU2P{12eaSZMM7?&vlS 
z$LjJm3~2<4zv@Vf_}`Sw5Bq+kUB7+>&?ToB0F&P&`5^(jagPS&&ohNlXq7_z*vDQo zii#k5X3Mi??A)vSdPHEwt_t|o(XW&y)8yIRnfdv19x$gwknS6o+_|jtbZeL!!eg|P_#7?Y zfIAelO@%9i!kT&**D#ADw?+GfL;eb`G8@u?c(gE7btdoO!7~u~r}Mejg9N*Lzsja<<^)VoLqjocl@ z65{>-`2=!K9WN=2mWlg4p}HCbCC@H2u;}jP(*zI`9N^ta0!R!#Wc)F zY*vPn`jwM{t*Z+#-D_f4_qVMJsioY2pev`yKdxg-@sQ9~B@BsN>m;uQPXZOY{A7KN z@-@1GW_^{@|obm(Bb#? zX&)E$P=eO%3d&`?D)AcAu!ilntCVXa2>l-;8s~#_M0wp0jEi+%W6-AV{Y{N$&t9Tz z4#iE<;NkI|AVh!99;4ior9KE@BdulUW?3Gk-BNC-QU;4xhUGL_q9N2P@LL4%lHrJe z0lG{Y2>ICqdsGN1%^V8@9z~GvBe{2uxw@s114?L0qnhJ2-ikH_ zMfbB31^ms|av?4PNl<&?HiVK-TY%cXXSrG++P=(2Fr}$Wt0U@W9r`mg4^o^aSm?Jt z7@wqo3sQwj%TJiqmyArhhJA`3N_+uBbsj%YJS;@w{ zjso-;-mbG@E}}}syS~NVg5%RW${TCX|0YFtt*Tn`t#BcxYxa)AxUpzfU)7kn_mS07 zfoY&o!(&fZw9xsr`-b111Z&KLT)&g~5_PS}BWt|s^$maQ9Isbbb^gobgxeH^4s}|~ zT7*g7<>)F6D4SiU#g9L1fYYiPvZbjnG$~+07`;QTZ%FOG^!J*X@}Po0W<^{gbGg`! zk`p}MS4WVJYL(aZ43#0RO7m&+5mi|`eS@D}IrUrc{$WlD<`l->jSC8y!?P9JHgv2) z4LLfDJLrD?tjpukhGFz|rgcy4y6w+P%TbC5Ik?b2z#ZhXq3 zBi`tsvKhW)5i@5^Hk6kI{Xe{S_7icMe(BYb7xk<1svYA)WvO4wsu1|gjU@&C2giL# z`nO0@a`y`5z(|&-c7x~+uPA-(j6>6vnPyd1e~kdT#d^A8r4~Q}e8DK{mq%A+6few{ zm8C9_?a9aAZjh-~;|ZT_#^b3j8R@s2XmGcsK+iG&^}#W=mu%`tBNNd};Z*uIbf+Je z%hnA1K}-mj->ag@x+81Ar7Q;~M(c}B$H_zj*1iobzNr1+@w#K$8JcW`VGkbNrvd}3z+8u#&hMVZAj<;GLe?((FhiKJAVr)v|3NZm-?Kc{=9$XyJfcfAN>kK z7lp%Lanop@x+0ijko*{;9TG0;4wy%4GRiO)^QK=AGixn4y4%sOe+|Y z9_7cxaQO)+hj;yK*?w}}YQz8z6bPLE{3nCi{ObPo4dbc_8Cf^FhIc%zCcVP@Jfdey z|NhKuc=ad9fz0|4dw-FYifo){v*mHIH>S2~Y1ibFg9wO@X@EII=VD$Rrks7G$)x8k zekpq=vG0M4d@9QIStvfP0|F@+M|sl(qsuD)lR={c*Gy*B4%ElQ4fpqCa$&>hh>)$L zeO7Qg??({b8jd-DOy6no=iEdAX9>2Hf+d$mPtzb-vNcQNRqK&uQ)Fm zoF4Rg9rC{p$j!FqUe@nm-T^VS;V9W_CAluuUynJE!Hz8RyVfL{a(Im)BTlWDGi&@? 
z>bw>yn2|=mqb*4BMn_kwoaWdvX{<{Co45OZ~qD1@rmo|?u=^3 zcABeaDJj)#60za!tAde%lz(u9yF`2V@ZsY=Mtw&3h6X}~Yk6F(=og!r`?Xx;-z;Ak z%79u6Kj*ZSY_@aqi`k(GcrJga%0l-Pg}umU7(7JwK0u|E_#=K{v;Q^`ibsVo>2+9J zt7zL_Q7J7KcR9HR58(DEfB||$Bua%WQBNO@6LjaO5bjb=yP{;JZC~qC_aH-&%%KQe z;*h=d3lk|KLnY@K^*ZFZ&|~RW%F*7*q?_DNs~5*Vqo4bat))&sWh71#es}d_=wl^n zJnoC~L`zNIpdlXjIdeI}0zdS;e%s}j6!!`T!A@VMC3sMX8wYkA-qSYhMh|}cV|ayp zPo!1B3Ac#rxqr`*(*~ZGVOd4#eHy&o527#A0K9*twz~uD3^jYrvy^dX;N1*Z&x^#3 zj0#&po-2@>HH7V76N-y{?eBfQQy^+EZhtSX)2LkXgYoEE=?uV4hDSK1ry7y9dbCR1 zaFpUlso`>l;gFxTm$pB*AC`Yx(6VNl&lD@j)#RcJWR!r7%8%2Tt(h-ob&hs?ES@4Y z{ZZ<|hN9EX(c!fbtNUBtC?|A4479JB8npt;;N`ldK}T$-x+=JaLXpaHcDJ$M=+%b+ zh2o1?oxgaVM;LgVWMS0*@lcck?et&8Tl!$**;_<*u37-c8R`i*X}k$EkMBs0*OH=& zxkrINW<&763A7k_N-Z6ygTtE{FDYaftj_@7X~Os2gmu{y0dB1hN9(mQE0|7q2cS8e z_Ua+|1i_H_pzD!#&{xCJ{8)*=9W8hN3}(H%zG70xHLI0P;6>h|0A6-dzh@Ylb=?_RG*KN?!ETF zXs%86#H*-p$1hH0tp-Sz@f8iO>x3|NhC&FKZKECb>9Od@*VRg;LKKbfzDN3&g zVgk*u8cWIifmvmnM*0yiwy4x5?ek5|y3dUGdwTrmm>J2pgLSn<};&qXj zMsD-6$O{p#hrD(AK`Sbz`OXdvSN;#N*29p(RG~!jV2BdPixl8EyRsizw9N2EL#;GU zifZ>D5{>t?FxQ~qzkfw$V4R+yn6R+aIN{}uYgmGq>C>==ZDg_Z`vS1&j};>PTE&>Q zftenf0CVd$4h#5WA+`U)*6gn65g*93B#(1uub~Uv3=&{=!ghKL)jU^t>lfIHU7tvcDh~Cm5QYgWaAN0ZVx}OSWKMi+gsP_~j zKJcx$E7*_DS01j(FhN-Ae**li!eYIbI!nw$8HYor#^P)biyvOu&GNB+f;fnkz)=#{ zTUx;*uCM%^A>=2v=C`oBF#x2k>8`J2#8`%uq}b0#o85h7C#_kVVL~BQz|7kOKX+Dk zqYM$TpXyG7A({-seI7naA775CD7y&#Jiwa%h7t-0|2zlN-MZ``C(R1@OZXtYNI=l;O%Zi-VQzb^TL6k?2hc|@-4FF8cxb!Om@-kKadt_+D6{E|GhZsp*xZjz`r zoCS~XG`faGleA1rC@hoZo%|!=QcjDv+u0}+$K~g8e~krRavm}I8nN3BP+)6ntF$_r zkVQjWNgPc@a0hX9`*p zQ^Mgz%XbhHBcW0c3Jy|o4 zE*nsiv6so5le{NT!U4*v-FAbdL#8SD%pa{OYx@COW)_II<2tk|Ld|q>aPs5c>jo3P zzTjC|beRc1A=Lzn)@C%6kG}&RIy8-s%`7f15*D|i$D~e`co&q(Ca|yJ~>ycf;GskVFnu4iv`K^@%cqo|dDG9<oR{m^$Pq;@QlH8Au)D~&xdcH@1%f#S^z-xlyH_B1zpu< zDmRdFpc@lRP&)L0wYy*eStzcs>P3I7CTew8TN%NzzMOxqVVeI~&Kh6pUuzBKkmulL2W^ z#nQE0VEM;BF*L-aEr{6h_7QseTk{88dbOHz$16x2T$8;2bUbc2Nc>~5R0G2dF|>8UB+KZ;3CdUk zvck*L+BHVxfPiv$Fj5vfmnqWzQ*L*92L+k(X7U2&m|kk%phz<)kI)PUOyG-X63^=* 
zg&N5&TcyZO_{#o=;6xF2Y8vFxy=IN5r$v2dwCx*Hd)z6Z`ydoh?$yps;q~D9m{ah0 zj;vAF*D& z;trMq4rY?HSLnXGHZ?YmLk+60;vHRlxm2}cD+S&C1`t2!pVhANl4B@GXN!35n!Gx& zQH}4M+vy5JQb&X99*m58#MX2;q*HBDbi>ihS9>#W+{_Cth#TExble<|dd72ewOjf6 z`i@_)bdX(b)HT!rJ;i3?UPoc5#)%`*_dMq-(HXyLNPm{E5I@-1S_?pf546#LTVa<8UeNC z`TM%~T#v3m7{v6MHx>Lx?_6d$#3(NE&g$}SNo}?ZiGrMexPeURU=Oe}y}Qg-{yf6q zg6uLq9hX}ZqhH3uWw06KB%W-7XqyF2ZJq7c8!gpaFk8=_>yxNo?GsJf?+{VjYjiCz z?8a8wt;rt82ba^_`ba}$Lk-mwMa(gd0VfOAk^G&w&Ie|X)(S*wR2TvwyUSWITW<@e z)aYT&_qOeRQQGH@2H}^ho&78<_aw5Wn@pI1 zuG`9WXyWqZl^f0X$|Nb$`v_JtQNU+HneUVG9-&;`+PfUJN7<)(?0iUBS4$J2W7gi? zllSDU_PC8IHYqNz<=nP5QiB zDbtOoTL(S2R8hQj)R2QKf|-TMe_x>T-Fv)z8t=9+A7lP6$?uRV>q$Nh)?>#Kf)4HV z+o;0A!Y9~(cBIx}3A@E7Xnl^|L~?GaD2r>6WgzFFoWyK(;l$LhowIX&?Po9pm;7XA zoj0NAdHww4P{)^w7?WN7u{|k{n~{&GDE|ZqDQk-w%Zvv-^Xo8+M(?ZEVg@?zNz@RBYJ3-rz1z&Kjyd&34tZ28Oed_nCQ_nC?U|pH#*vmyqt7seOn_S6AgM*|TA2K3IDf z8AX1ZY3YaO9KM_~Wo_WVZt{M3@Lr^c*GZK~*xXx`tEoe_9F2hJk=ExDk9x!~0oig) z_u<&T_fopmLE_l95Wj?|o)e4Dc{(LZP3U>QU#2FxVTX#JGkl-C%FE16?T`KJn5a*& z`*wDR0}lQD`!RNOALwu9liq9dAlp$+qlu>5{&u5~?1nv1P28|fS#AXt%IE2s>!$|g zBfNVuZFoZ?r{rbLuQE}yI0`*rx8kh|`MzzJ#7--UCAOU4y{CO4VORM{RFK#Dd(t($ zyC>6?t=vPqW8DUB#m=dN>sQ<#)XBF(tRMK8Ke!!z5ByoZYiaveYVFLxv3@61Cmkt#?w@ zc9$!&AY8$GE2iXWjl#gs#1}E9UOhi8UU(~!zNN%$Y|AOzgo|Ky5V@P0%gV;qCtVya z$E@rZx3gx2>D<*y#vCGAn)x~g1$N|~JiMMUSs}~Qz2dE{&hRqg9L|)azA3#LO0)AT z%R%CBblzOMPUL#pv?Bqo*RJtDQ9)hX8SPdO@aK7sEW9Zk9Ih<>9*+LA4p}{m>*S+F z50sMpceN{GAH<|1n_s~aa@FlYo!nF8;C9b+g7?^9wpQ$kq-#%T=*K3FdAqd%8b`$f zcO$p#A^5eaytE7`i{D1KyFGlm7)3=E9O7GVm3jL16jpFqF00969rQZj+l^0DiwnJhdL4DI2zQsM=b3E zYT6m2UQ~}&T@Jxep4cy!HlDciS=g-U7~}P+*fgV#&)Z)=pX8)nS9JtOgR=dsOaF%y zJ^`1hPBN$^oS8~nd}%W)`m5*H-cJ13KFwX|e{7C^&uzaBZPNtYk=JAH4-|4!QubB6 zB|WvKf2RSKQ~RJ5Cv_8WOX?xz@O$^}WpMA?w~u}?awG$#Y4Wj=S?%P3FAm$E9E!a} z!4k`&v2bs9|H3=hF3-bTe|`%R_3v8$>a^Cgz0U%p4ol!#l$8>*VuoYF>q4#~Ryh6(WpabRKj&*VAZF@By7SyA`v+;LgJG zd3*OmzIrd|i$kpbK*M$Xms*phw6yLNhMxRGb9&d-lJ0Lx_G3a>tn=E+e!%HTd9nVJ zE{5StCZv}yzr836>YyKk`Ja`ib)NAcGSjhf(3y|vK3(@!XARMeuDw5Yh#g)z|B8fM 
z-bDN^ZP%y5PdC1d|19G2TjH3V0AAqU8+H&QuaI%bFIlwR;UFep>!&eFULv#BJ00Ff zJ%le?XHFgUl>J1H@~4^U{TW@GEk#2=>~Ue|psEwoPxA{^-(q(i5B=Afl&5LvzpK_n zJX#hmqfE^8S@y)_d+>^TnI$)OjKlhBchkhzX4d;c&;00B;L85@L$1ge!Pa zpc*z!7CQK2tT;1#_1WLZ-<&$(_)idS(I^?~j%9kHP>Q`hb60#PRBjS&#C7O#w`i|E zPtVJf-px#Zh%cH$T!0Nsl2Dp^=5?LZR~ozy?fwik5%r(K4N)Jy%S9VXM94pF$+SK9 zW5eOwCko6LX7=nP5Nn$RK1@&-Pc+t`_QJM2Xy_;Q))-6ZB71}Z^F$kbg+X|t7> zmS|C7mW&lESpzQP_s<{R4>Z#jL0>rR+oc_~j(P&oytCre=8#^8Q?^r4QSrH_LH$fI~~V^l#r()>Yd*Y1o@Q4_w@oRmlu-|}63uvT=JQO0#N z>0q%qw%v;V2Fl{Cr>d%|jJVCCq~+zeoCbm_3%K!m&|v*zmm3&WNN3C z8o$_JQ`TB*U&BdKT|?x^E*4N2`imTDylU%t0J={eN6*eiK7JBSi|e+coeB`VPxBb$CwD)p-6 z)(zv1y|G6nmsOD(KNzU*0^}4_wbo&A%Dg|<+pDRmxx#-p!Uk?R#BM!MNwyN!P_=Rb z3aqakX?*Qv>KjMHF`I^YGdYIdITj~_bnw7p2ctu23fXIgHa1&PW!U_@(;XAKQ?MZL zUz_`m;1j#uEL*EzNTK^?-OQH7&*%MMHNOi$FE{VX82y4buKe`)O7~9`GJk)dyypo) z4CNU>ls~+5wL$^z8aL8C;>TV}=X4%gApAEd&ddty0v~K3S)T?Yn z@4KG`9rJML1=km$je~5gtl!!Q$=Ta9Yo?|r*2Vmr^qKS;P zAD`U0)uEu1eTe?lI(JgmOMf)W`yjph3%@O#zr^_Wo62d8rFu^hMaLsJy#)~vU>y5n zm+B6kRo;x__PYIUC_R>Dy4}m>8Pgx7x|T|4;`cL;^H~frc!KnPIlR+cL1>rHxEwsA zDC(f>C^fZ-y^x3qX6sn`{f+%Bl|ea&iT~2PyxlxFho*Nsg71dS>@d?|g|$x7;1V_u zw`^=zbqd5_G&jjP@|W5(UHIz=zyHb0KzkMU<@opN7r{OQXTk3n`gz<=x#TG#CS}F1 zehfpoHVyC%c@6PDI=jB7@Z_F+j(?p7_l>zZ(>BVcxyYn(#5dPAs64La{lV?{`r8PC zPV+vyh6VABjkd`Nkzp8NRp?MCFx8I;8Q$;zKj{5UwC?A4x8p#3cE&nm2M}Q)A$m$6 z!|xhV=ETY9M+3KlO4krK`6*_*n9J&Bq%A^|HzSZR!zFASZZV?SFw3fd!ZQ5gdRzM! 
z7JmQpq!V@SIv~e;zi_AGdbZZ9$9=fCPiJdpmf}FdJPU_J83yst?ThvObN-(ERow|hR0KPe~ zh;P6ik@C(}Cu0}}vFBD~O_tn}@goP!9&G!+#21(Xc1(lYTjp)sF22pNwbSSRB%Wuq z%VilV@AP?}`D0y=U=v0CoA;-lNeuGe{^jeD{}Io>Wyd|VG6%1eO_vC8N#HGItwtfk zJTa-ev6_aWd}?-gz?&wxr_?1i(MH<@S{-6XVKI*_=5Lbefozn3!{)@4_e#_C79WoA z%_ogh0Y(4Qkk`r|tN%vMn|cpea>I62`WET~uYKEczK9Batf~8BNc#34zY}9AJDJQ36O}%;lBzsu0nDJPFzU%uu8Izif%_!1w zDqI5V$)9ntZMzgSKdV=EZKVtO7ci+%n`jT)wDE6c7iA~eOB;x#=)4j%Cpdi16A%H} z(OatdbSEplR6@n}==B0&B9vap>f@B5NM?qw_bX!emqel%xa+eRhlYX^6A~`6gB(A1 zV7pUbtmtEV+5ck;j_kKCQvdisz3#_3;z;DSfnBxoC@2#cZaE*X?ayTR9MGU>?ZB|8 zEhhh)nEnF7J!565SM8bJuTL9hzu(B9+2nlmKUoAl`xy6J)fSbMeD&^A`k{o&FqILT zA;LU7US<3@{K-yQHErp}5!&LYs;a8Z_30XhH8OH;?+igR1;4SXc1CF)#nMd6OI^RZ z;$i)@mH2&eBNkY&*ZB@EJ6JoJPd3NJHk{|(rLbAv`zrOnx%J+0WjCgnO!pUqzA?t6 zww!Pu+qNx@&i=<+@*n~c#3UDjJyvqXWW3gHnC{{B`v05BGx5K8rCWF7I{S%bTCwXxW>^#@Z32ABV$}zGs49DJR zBUT>9Ut{0f7YDlcB)~h#o!eZ_F#r1~%qR75t}-%rr1Cac=wSVH;$_7}5VEjGF>I^P zqVJ|Xiq#LgQ4o|o-v8nHDhaK<>P}adHYY5_6dg0H>39}*m;+3k{99`1B9_dv_CnmZ z(q9v6DtlQKx;_bW{~w%bNFR`Oi5+hmKs+ygE*l5WZ{B(`4~$j+xaW6wzV>MKbl>-X ztsfxNeYIz@yZ=gA*+bm)^e>0DK1G`Nyb?VmRf0`ArnI$%df)9x}E^~#Q z`(OvW)hGty!D|gx*!}zW&s~!9d}+uzko;f&zhxxv5KVLdaZ_w1EB&|iSDLC%%L>zt zT7h*F8@p3>;H6(f`GeLSBX%NB&v5@+vh69PGA9H)K2LWG;!V7F#Z(kh@5b)@dL<`I zAn3Z<_iTw{|0cxm34URjoo^jiHU?3Jo`WOePSR}IpJH&HMhCzzcJ7dT9+UH)d;gdA109?L z+raenmMz)d%)~TzwhtD8A26%&?6W5dbf1TeOy`KS!vCf9y!g9Ihp+)eufqdZfg86R z<=q8}DtU-xkAE4GbedyBOM`^?PRrv6eUe4c>eV7!CRd|e-rQN6C|s22tzm0okPd}j=TQ; zyMA}i@$4DT3N!P*_dd_3p8IAgwC+#Z(k^R0LRDT_ra_s~4yxwAaLSQ?^gKb6D}d-f z>dyyHkAyN-fAd-JhDBTY1oNLBVQTbbea-8eUs)m2!66ja`2Y241KkaZ#fu+w+55hy zfLoH?n$$*+FO0?;0D?EsbF*Zr2_Y0F=HK=;pX&bmW&2QzdFv9bUyZM|ZlM&c`YR8k zJPN#>i2S`t`;Z*5U0<60!%Q+r{sRe6O|fDMXfY@qO}qWE zO(PBxf4sExbp6}J#0Ae#RB5dx{{_Fl%20(CYEfNpV`gA@GlLwGCT6_vArOjp00CRz zsYR8iK$1Lwf}&Z8lwqh=t3;n;po?lUA z^Ns80S>>H2<(TCSI1XDPA|oT^H8pEmOlRNk zhHg~7{`&v->EB;XF=FMSSxY7|{z5;_Cc?WIkp7YYOB+KXuKUJP5E0&*N!Gvp_aFP! 
zG_FvEXkI`($YSCkg~sPZX?A<7h)K$IV;re?yt@#j{)&#CRAx^2zwaI_?6EtA>in!G zi~t7{6A`^dj6N{_$(`je?+*4KI*_1AVdsT3<9!H)!@nwkuo*K z(u)!#gV+HtWYVj9NX>-Xs4Gnr1VnCJ8B%jcb*r4l&;3{bxxNYW7$%KXvjBP+3qBs+ zb}c8h6Gm(~P-w6hgrt%i1s)6!qY|$@sulg$a*T!j0yjD#XFv10_|Kb?ARl?V?R6fPnJbNP=SrrN?@h2;=~o})79Njo7$Gx!1YoIw}bUc)uh&~LFC)SX;< zg8zDnJcWs#j&AlEj)Cpk28y82AD*iC2g@{fu=9UiA5(18gd7#kc#@a0 z-NDkqhBU(f+0h)|n(E#fg*xkXb|&sf7X14=gA7B=}d;OxIP(%vOHef?qPllsv3oQ3EBG7R*tzj82t=~A`7 z56>3?su1l$!yA8A!OhZ!@^1bpxL#{J%=P%JuUj%l@iX`SBqdO6?_bCFTo{B$oP|9) z&Cagl9DjImw9H>*=O?H5`{fcCy6`r0d2Vj@<+yRb@Sk1#P>1tN;FY1Ch=1ZF;-(K>$^TaJ~426Up#R0Js;lOl$I-al`dfd^c+*4jiU-P z`qM8jJ!pGsSQUY3TjBvr8f4-fEhJW=R|>t<%ut1TPFRLus=XqGD1K>0aA=cl4sf z<^NTL#q5dA#GgRB0T$-3fQ~nm$6&LATn`Jd@$AqH^KSQLz=0$dZvdBrcbed+p(D!-r@%| zZhXRB=B7z^i3YIr%lnz~N>B^!_4GO_r3eH6Yem3u-{rte!WA*~Z-{8KId1<~Q&k%?T44gIBHkb+8Qg4?*CBEKfj(Q z7yi2(t$11wK*s>}a@n%-3QL*@yNEMeAtl@W5)t#J0bEEj&w{#RnOD&yOd z`*B=;@$DB>nx1|{3?WaL>+WvvU25_bKP6@Vk{|o0Dfzc|{DAOKlam#4A{_|(@uNe) z$;qvcSLmWRBlwuvnUgBs6=Z$*Fp2j6;>T1|CRhLlqIY{hJKs$9Ul)e@O+ zJOA}Fe)JfYNoq_2oDsu)SRW-Me@+9Ya8#+nMztpyBv=3P7bBptEAs^WJ9)Zt7mo|T zXKNC$m-6zrq%fsP=gLDxxnPl|r!bGZW7|vX?A*{{;sX=rF7Y3GUz7qboMhhm`Gt~4?BljOyE`2{tv zn6h7q^yVtfe0{nhO%C~#A`o|(B4nTc@T)RDS*COt>U@tAReXMiRynLJl*8L!k0_Ca zDCMZY8N?{uD)Wz?y!XHx<1~wXUTv)OGGIh0xBbUuN^HEpeYl^WP!>O$k;(YGa6RwU zzU&$QjV1`8^nw&iA6>|(;j`A4MB1H&<7ee+|(#72G89Wu`L3jPx+_HJJ0 zyKGGEee!%uka4Noo{sb??OqY|-c8x6$zB92V`_AXmZB*H;F&%DA0=a6J@mkUgocMK zphj%^cra^8Jyjp$Q;Zf7vVd5--zG+~U%dPYmSteOwB*FeX8O2ux$kpM1fANQg@r2z z%2VUyyt6GjO{V+j-kGq_upnNhH;192H+wE6>aZG|%`G^6F)ZLPyRteQDYNc%kybX6 z<}jHP^2B|&r`UCUY!*j%_qQHfcRedSLVu`x%D(lda%5w7y4!4eR1$`bKH;|bHekXV zr?lTW)bW zbx|v(==hrfXJNy;>?oeaU@a z&2Y&V@OOg2;aa=7Iq|CS@Q`M)j+h6mQH-p4686UT#*)1?o#!H~RI9A);?gw3f&a8l zS#9@?bJCreHqjyFO<&%VC!Pmqmd_2h->8{h?^6isCSsZ;`Lt~ z+@G%=k)$=-(f1lu%;46hYZvSW#=H_^`g0C-xOQ&NaTfNj;lTPl*$2=-Sas`@mEHEL&Z<;ymR`sjj+gcwly&q@hbL@ID*2S zp8wPIL~xhYaGMtv#wi_zwMI*ythRLXt}s*;$)`?^9h_gfdUBLovlHz8 
zVf|BKev&}lM1`!~+VBo-%A+!4@kBC(d%#_r)5j8rN4e^)_4!CLs?sPXMag_l*BpAE*(8!r|yC+F@hj-qb~48!b{ zKU?43giT1ffo#=bPA+vm?0=lpcOnf-*%%MIzOXO^&91VWjScRUNW;##lt2HkEPx=S z+TV9|eW_DyFM^OPZSY_VZUc9kYqxUs#%Y|I5N{=4kVyFUO59TXqv_G&gKy3=gpO`& z>Z45q4HMy57iT{-U8`^nmKR)Wat6=FVXi$bpLO5#yuCI~kMN#?kn_CsN7z1qPhPvT zSjihqs$ZQ;+KVdG$}(y9XKok5CmMoF;z#ly85&s~&b?asf}cL`xNwP>hTEw7hK{gL zB6wgWiOExgYv{%MYcrLpD=;!;@uG186u|AhYposV-Hpq>2u>zUH|A|KIEk3 zclCbmcLj2^jTlBC{|B|Gp`O}RtOYE#+d!s9SCSv|zwd0G_+=!B2$5f@c6QqFWP0-C zbNTRaV@Pz|pjl^}WE2tO!?j)1cpk^}^3k3jM&3H|FiJe`MPW`#9`R;7L_h}C(R*6sZ7aB?6^!TJ{f}uV17u}hRyFlhh)vs=7 ziGCJ;9YY#FVCQk%XUIPp$dJ~5Q(=Cithw`wD(*?Cd%dR1sM&r0rAIm4fp>M3F7f{l z9D92~Mz&v4T>Klu0PaPy>b@qVE!aRiKylb!Ju8w_d{n<+!xsz6qp4wl8kC!blf3WaagW9p3Olr z2WJr$OrG@DxrR=ZWa@L;iZvhe*oAM4k?T8+jJH);>$f18H3k++Jz!Q2bIrt(sFAYt zbrIq!tJoA2-$P@^k|bD=a&J!xbwq6Zj@<}!9l)LlY7k)w8JWa41f0_dxD%k5TyYd<)$q4TG zUFY>JiHapLFTB;`CEBVylfjTwdQ4|C>$-=FeUSOmONPJcpY_w_$<(Z3tpMOnn0Nuw zvlo+mrMh|ncK)dFFLJ`tVG17YGX2eRX!h*lWnpiT?mYFmO&RBAW`IJRp!NxxuFZEc zrXM+%9hN9gPt?NpDTxu96Soe!E$YAJ0svzY$MJ35qCNs&LBTt2y(~vX zwmm(~p`dB^isxhe@bK`LGu0cBQ0G0>avPy0UzTK9N~G0o0jisxwV+RyRoips>7sU* ztVYCXMS8LKf2S@}IUZpOXkdOx$m#=S5c>k|jXphl+4JYw?&k-3BoOzynrb~UMILC; zi%V4{(qy1bJ?IIMPtkT#PiE@itGCOqIsZm0PXdoMkLF=tMW~pLJ%+RdvesxX7#)vQ z=?rf8kzE6wOmiYX&hT~rYVY`S_S*&v?dMY$m!2DkC#1k5jACF+7~m8j$s93*jIz(4 zc$3`*&KTN3th{;I8e}=`u-#0ozv^|#*KEIb=@H2g9Qq%tRYQ>gN6zHee$=SS&XX;HIj`L^-_XKMq5l_m^!>B#Y?*ko~X2~@WTp? 
z_`cP}e|8_G?WQ(yGIcS^5V30Q-l-TRe4#)1$zZuF?%h1cxsw!r1 zUc8Tsz3R?T(cKwgt5MO?oZ&c*2#xrBpOCrg>j18Y%kY-V%*IqHmipAjYe6w|TLz$V?mMY243}2(D!|v93mCFM8Vh|b zW?y>iG1y-~Irfg3grl^y;N>^rTZ>$}?`zQ0e&xRX`QW!37q=v1)nXq0$xPO{OuBSX zy&h`35U-a@jz?1=&^V|q`5-KejB>e0Xas!Mv6lf9|0D_x))&@5jDfSU)H?l7dAyJ$ zgPfrq$i%()L$dF`erz|?sV?OO=9H1Hrm@{Zly-aQiA&7!h)HB-pwH-RQk+HPs75)a zvfeSj1FI}}x1r@|*PoN6x}03#`H%$q~bHD)Gq)VK>&+ga^P zbNWE*2Q5y())gPus(mvV%pYrq2C)NS**t42S8$ZP)mq4X`Qz~-6tSxvfR)so9Hx(z zP`#^Tp{4qHUQVQL@jUNdYq}8St3B9%ewd~+N*>1%Ax~7@*^)l<}`QbTLNU%vNM0VKg?vMKTI@Ai%yKv zF>U6!s_>Ka<)u5Zgu>1NgtROp)q4+ zO&8Bse{~qZ`bJFtbAWc`QwQ}#Eet=iH_#U!S;`#EduCJg7mGDhZB^JGr&WQvISQLO z9M0GD=gv($QIM)u1r^WHd3&}H*16$+rbgj5PvZEtKc|v)G23Im)Piy>88@YA_SI+oFWBj8wnAOHPKx|p-0+~p?(lN5uP}?_eK*GCOdsTSm%(*69Egpw(w7ss zg&XLyh@ac`&Eu0ywJ1-edR;KG>xSH5cXDm=?WKc2mj|=%i;-Ij3)@=mxL>k4r9bz+ zHoF0!sRYDOTUC)u1UH;!D?gtdqNlRwx25A)D8{N1UjP2;nfU?le)df$6i zQP1?=pq)2{e;5KDxVYr9kmy>TzkT~tkcZnNJ4#L&aN+`F$fFK_vv;w_YA-ueG*h0e zYF4sPakAWF7+PjO>GJWa<3^0#|O zEs#4VgNTuOKoKSEqL@PEOEPlNqN&C9_NyUc0K~ec6^`?1zDoZ@U^qJPUBI5I51B78 zgAq=N_9sk+YWLrE-G)9F|0;Pgi_mGGXnYv8O4?JFYhygXK<(vYQFGrrQzkI$YK1vm zr4~W3G$aqeCtO46kuXGAuvVQ|>oGuaH=>G`k=O}hfh?vPvd(=9ls~D--|_LLx5sdO zqee#Ng6ub>UWF{bRNA%UmvY^HVUZ7^GQSdX9GZl;(Itm3L_-T5-$pmn(Dd3gcuXzH zn`^3(kfg+w7;%LmV;brb-SFVrd^<-gDEU5YSb2-njZvzIO}SjIa;pXcv!2#+?1ha;@_J0 z{n2m2EIC#Q;nQoi^;ubMJHxkl^)fKK@VXgRx4WwP+GcjOb8*P+wbqAw=<|~ecge|* zx6C$PFP8dYsyM-_64#kSW`$_&35Jn#mlzrojW_pwyZ8q>ouvWF}!%n%4H*&A4s>_0{ zDiyHC-c(x?BdyG!dw^WS2NGUJ7|B_k4&gyDskgNcS>}#I*fYqiUv9uRHXL zAxFg83{l$PHUj*SvH~$T`#@^Ba5K+K+x=L&YvX(4JH@ag^et#&Kx?|miHgmJ`H;!} zZX_hDg;Ge+nygXNa#&#;-k%B9*%u;irdaNIV6Ht7wdNo${&Dp&U1&wQ>F3r+221wm z{cuaYrQQ@D#eBslrnxW8`x?^MnMjwH$MRyH50#lb?04RqxLAKir!rw1HOJ~?;dj+) z5IrdNJP7D;62X=;3@bg&ja5Q^?Id^Tu#v`(?N;E>;;qzRuSV5s-Vk*T#f*kqr^m=L z^fGr9&3&Z$CG_r&v}|-hD_UMBnT?FCtGVppp#KdQU@s>QMK6!eBwd-*AIql{l3=-N? 
zQJ_hbFb!KG=a6`Hp9EKbSufV=t+2nwJvT zaA#{1D^-`9KH=mvXE>^#v0XAUBeq?BGgRZt1WI0OuCu(E{d}=Li*DCFBnO|%kr8*J zkIzQbV1C#Hhm0Odp~pt&`(m$;0b#&0>)}X-bmVD;hCpKLeRAGMXm}0Sonl1ve&qQEC7d%r(oWuF7!E{iZ}*|4eZk~|a0#H^HsX+9S3b0!V!K?G1Z{TXq+3*qypn_Rw8X;_?0HdMpu#L&)HlX41J(YG$Iw z)O*3nNk4bjrt0VueHu_KMb)9NCN1vtp+e^Os;p;pcaiRA4~WOC{EbH6|CnK6|NUOQ z7HU|}b~627bTJIm6g@tCu;;>Tgn|4WNA6wD{HzWR?31<=6z)#KTr?u`!HY(n7BO92;RZ(Ghf3OeE z=odjx?4KN6o{ZJ*oGJB;VP`*vb#9gm2RHT7Zo$RNX@Jy5cKBKzFvmR{wH&d!`o5 zJdvU6wg6>k=8 z(HuVtB(in$ujP-5lW|>^I1ca~bI%CRwYpSgZV z=4le~lKO|9N{SVf7~Ep$X`W70cz9d?MVvMupO8%y+=mdZ&}_5o4YQCK6^UoS4H(=H zbe=?o>iVakL+%&rFUV-Ss&E+@M z3YXv+HsNKc%}TznH%f#7+JR@g6t#mKEQkV1)koO%xn0CYq()k+?cnfL7Vdm)SiY4+B>c>D4*E5r=ny&G7 z+*xTpJY)eHHwv-3PDPp%I_Y)}oft0?niwk<(HgCk2eR-j<$Cn{R{d*`hX(O3R;v5Q zhmUx{p=RR<@zxCKYgc!U6Wt>*fH%<<9kDuA%X=`%Tz{&(YyGOA>D8;*%I&%I6M3<3 z&&0@WH|t3~YL|qdk544p zWPtYDW+eSVFvjdt_Qlo7^88(7yTpg0M<^7HAVoQP$Y-=R0*w%(k2%|#zCIf%=$Q8&6fkgnJA(5CQ}EPby1LPDf9 zyUFHjqaO{9Gka^j9K5{oMjsDgeJGi*@<1}lxFht>l3Q8#CaXb44g;UAmj$-!+vM0A zX-~wlOJZcj=LpgN=^v_i(Pmz1d-Gu7c6fPtxghbL{Pci|FxfhWdynSBa;sltl$n`u zaY5MEt8P4yx?GT);*+a_j6U2U-*Cy!>Rd$In4UwS|YpTe4gqJ%W)^VVMgh= z_P{{WZzL4YlUJL%`NB*#s=MFC*IHBxt{YAA>0T~S+C=l8Ke2V%EnFS=vT7=BzlAL} z4%x?WOS8dF)X1LOd2P-9R%^;mjoY!E<9V1YL#w8`fRJ`~Fewc!xqbYa^UdU>1R(M$ zxL3#)$O-Zt{=jcd)b``y9!?%K-I9~l(`46qK`nLh_vxOOH!*uyO+YHtkLqgE!z|*< zHnF83iQG*TW`Tct)=IhEGI`D}*|`W%I>HqJJ4sFE8ad zzN0oD(sNCb!_?rbhqZ1PmD5uu;mcKCW3KGCgYXO!oR4m=22UdB=R8SRr52~@2|xOhDBIT*ZQaqTJxVSr)+pvE;B~Ct z=2sOZwjz3CAV%}Y;OS>EFMp3|EdXel$z$0*j}Sv=9g?4naWqv~EC!IUSjqiJ^APRf*dx0yvk&edwu5tL0>=(e`SNlCSM@$^>W`P_vR!7Q#UQ z6v;_gToG%pV_>^`tDDo9awD29tCeRnJ66RqGZ&>dQ5H4-ttWY2vxmEm;(8ewma79<-mq=Y+4#Lo2#%lF{Wf%BRa*mz%#;0ZQf4S6`OQZG$q$?8-;H0@vn8@>;KmJZwg_` ztgsbmAXYc^`T{Os5Ni--w}H@eN;T4w1_n|D%;-|oEZD^F*cL77&b27iD&>gOu(R5g zb37Y-xVho(YA|+}${8pRJ&zC)7(zSOt+bBmtO2Qbw7WA$jwg3-O`Wx*3syl zt|{>;;YZzLFvnYT0{ZrBHKkifEwGl4hEp@Ixz$=Enh{rAHnt~*mO%M%m#ZN3eZP$s zUB|}?+qkPH5YUYMP=^et9PfHvS>EI(QLLEp#k}+zA3I&EDAt`3b+qZB9*tWaHcK%< 
zM)vM!#La%y3odh()NDUpX~2p(UCw0z`aYENJC)M@K>nOXD><>1>1DUm{{Bw9YRJjE zr|(IYdk^orjhb0L3ld57AISUc40Gu%eFEi&0-^h!0+(9JNw?ZQ{K}POpB=rw0?aOF9VrChSC$Fz0WAu;Dt9VuBE6OC#ZFANx5kdQU z6zZAYZQ)zzChT5C|CA^lHwCpBzo;@sFMagTbJ~4b=~utA*b{nwcAPQusl~_%aBVA@ z(ejM7ptWHI^Sb4n=;#8s4am~zkX|`aZ6?9J$BfGZf#UTk{cRX^OK-mW+PYjG@2t~W zFp>HW-OK+}4#9YIK%L$jtY?lePu1Y?B(1z=qi(o|l3XyFqdN4Aeji}n zUQjMldM%+sWaIg9Y`F@wbZ9a4rC$4q&&L_kC1uJw6^w+kXv8|t(om_9lq@gGHfcG5 zmn=wsBuNmKaJjl%bdD{$KMN@IVe7tj%MZulwH&?kUSw9wE%A2n-cF^|DvmU>F?%GV zm0qeSZrN5k`J|lwDtMyMq;;^SLbY9`w6KkK>x~Dq`PcZ_^7NLXpIM1V(A_9&{M|pi zBCG(=G@ETU88Ax4!Aq9T^57O@qP|>cSq_8SW@DjPYV~~?r*-a_lP~z?%il#RL@+mz zn-h+JIIA-~A|L5WFdJ#B1->WM)ROkF_jtMb`)KR)Q;cXs77xqT`%5E_Jau%NeOYzl zh0lQGezMujIr;LmdYz?Kzw_%yj06|2NG7i~ZdPh$S3Xyw1+bZ;>lf@v82n&E+dZP1 zR)%e?nS)xG*u1VzN5~N@Nb#br;U25a*TV=Cl{RItm7bC){x-aD-5M*kSs3J95+Huc) zy|s2b*f=I}ZzD>Y()A(m0!qhJd&I?T7*h@6IC_C^K2uyY-^E-Xo_mC5UETi60=Hs$ zI2oQ$t?NS&$BrzmM?_W|jj(a=Ev7!8h#DtUL1QVl?CCU{PZ3b4)hi9pBp5VEO@A+J3eSHa2zawlTAg#esDx9WI8L~FG7OSuR zuZN%{Ad5nw*>iJ&P(#UFszLbv=P-`^&COlEH@|RcYEem(K{yQWnxY9&pNY$`K8tf( z$Oq>`c|lKtSeE9e(XYy}#`E6|4;5L7`rzHD;Kk%7;uAM2VKRf754bBP*AK^UqHT(- zE$W8fxgb>c8uQewzE1n`a%?UW^1L0_*+Gq0OIP0wMD3OeLYie|cUTd-ho{E@B+srn zEeEA8>)JM38%XBX#hhvqUo7#x7RAVGFZxPaalQX_0!(vwE7ru+XSehg=T)*t8|yYc#+>I^m)CV zS=)=wm4S%ti4 zGSIdX$B9JbYcws__ee_{_lGid#vvn!weRh)M;5#G625F@)Vy09M@p@^b1(xB8l{g| zJD(%2N^!PFejoL)Tbo^vGq*|~{V78R^se@HLZcwNA=9kME57FktE`SQwfnQ_6_e$8 z-R7*NPG9>#!#ZZ+e#deQmqQcoa#stUl(Ny(0N1 z$HENAM%OlJj}Qq$D*Cg*urfF^0tGHV$LL&Gl2!^N=$t%Q>3gnHtrJU?ySoa`IffUz zg+2NP)Huc!R8qV312Kui)SzMp6Xne*Qs#VZ;|)z$9SP2=z?A-m7d>%GdZ~F&BVieZO0mEb^Uh)BtRu`K~xZ87If+B<78ULGSEIx zur?PrahDx!balVy-v<#z)?8(h^4)K&-EE)sIh6Vo8h6bBp?aK(GH3|sEweiiXm-HuUENc?3$*9{6 z4T#~Nv@t}e>7RZ5_){{v-UnLG-s-46Y=5?1b**gO!3*Wn(~U;-Vt%#Va9wxS zRF?U8njR|S#^6jV9w_2{qJcjwAqTst#%{S(lliGeE8Beatwqw`#XPFl8V#nd`K6(M z5|IlFnFB-Xmo8xvdG6qdlb!t&nl~H!=aXGg7~zJie1(i7y!B|;(rgvcV0*s16lzdh z4K>ok$OQ_Q35#7{M99?o)vK`u~8Kmk`&r 
z@2vqT8+Neu`9Sf1~98ni!MN}c#;|KkaFvlZ20N4_Ic<&-{C_CM0$|I#gAo7l>+7_6VKmP zt2*RpJC^x6>?O3=U2pZe2yj68yc-uMK2#2N)xaHB8S6`0@5VfP~{4C4H;+(m@O$5jl z>ib983f`U%Xa_yD-3f7`@#@cbDw}2v5KXfx+TUowS#*YKYg zR|8?F`>ReVt}`mfd*V*rFyGEa`+*%lLVr6>T`hM^)FPG7m0{G?{Ogr$+i9Q3xmBX? zf%jI%9W#x#{G4_}_M~Hn@_Xg0FfpU5Kg~b0Nlx1-fu2I6Voe11Gz}E-)uqx-HnCsY z38eg*3f-~Mx_q{5n1R**`(nUt%8-FAUwNHE2qp%~CX2R&#ZV{uqkY@#-2j&64G@MCiT5{_>sJc4m~ z@1vdm%7Dlg$F=9L#gp+n=Yo0YDyKBTfr)c{w`9W=WO>A8jz=eF3&U^sOl2{-xPk10 z>G6p*GAUNyn*(nMcSzc8`jX>8jDeI1iu@V+>kCdM-Zdj$(oeO(4AO77C0p6E+Q{>x zN?_1&*YKPB`Dw@u_eNfcSE8d^jg5};P)E!*pqg+yZF{fv?`h(j+#g!f>w^pGkKy;U z;PF}KB1T45pxX@_8-s9zjl-s3&n@FlCRa}#9Xq=h30B7VA71?Z@`kX13Lg442f``` zM(ShmcwUJDI=Ibu^;hS=AhkRwtFG;sZbk(aLWQAyr4r2#$MuFKj>jBQ2@r>GN7sg) z7#f7t%b^$FCLy-;YyFucZ%4`Kl%OWxA*BF?L=j%Rg%&QWek2tFE?rEq$b+=#^Tx}) z!@9NNgA%*E$A(%6L$>lne@`ya=^ac~Nxv>;jj?#9s7X-hMe6jn>KC--8l*D($QJxnW{Y6)> z+j@P8L3neF-xZ{p^5vPYfN%!sgT7@LQ&($0ZYdZl^+GmokPAr+_26kqkZ?V}_oi|`bs-wd)|Hj+0KMrDds!d~7hfpMr3g>cC{$S~7xS1vt2$NKis=|)%KHH(UoOC`qX z62)Jpf5a*Hp4OjGb6zDkCK&=L!-QRsHq7iNiyYjHf-Ry!e5G1>0kpz4@_cpi9Ufav z+&iSUc_1HVG*n%oQ{OLMdvk7?$1X3=X&;K5t~vQG^lE0zni1)vF$Fa5H^7seo}h?2 zwt032Y)mu8yRiw|$|HxvLcSkxs2|k+a)k4Z*&Gl$%ZfcdRwad6sUQ=n0k2bsXQ?Zl zoo%&RwGI_!pv87}MRGlD-Dht!c~~tQWq#Apfl(an)Sol*)}OFAsOvStAdL#sOnm#A z*^!ZtTFI>glq$X)9U5r6k&KxQJm$KmoKD2Kw4i}e5VIZxeaLbGITJ7o@-$EHudJxg zNfqqN;ExGsa^}~xBSx7wr2pXj`{<;=7hJ{mc0BG!UQW22{jj^&L5fN0Vgi@&&*E~E zHIgyx^P}WcpW)$9ty-rc0n#jlL3{X=eZ3tVm^8>C7zB#Dvc#hfA|-2V&V>QKd6CZ# zk`(!72*mcAa|_{5#-7?_>Fo5=t3=KbnOxmkcCK7o&JN@xq+|lpfsVQen+yttvys3e zvaMXeVR{Ud6#W;8eA8E%^Nzs~UD!v-NY#mB;qx;qvs$Ofxf*o=*&BWAg3MY7$4v85 zwVj$BhU$u6SbffBMtKe<;gfk;jfW&xRbgZEM^bx}2Ie4zd zi-`>UI1pA^kZ3C|_3W*>%j#Yi21h6>^Bc6$N z>M8W#+}RRJdNjxJ?66U!*6PU<_`bNTa_srhN{WB>@wI#J(%lHJ5pImB#=>pWQ>Ul`%%Fks zy#AcntP__%CoDdN*sdV)M;E%v#^LKpV+%#C46-!+cz!mkk0AxsU3TN5m(Xlgo?*13 zN8W|H<;BKddu|*@ePQrum-fXoJ{{H-Gf_=BuXKLlC320Lr} zXGZd2tVa(uHSIFZh?KM5b3iQ2`6W(azJ=#L{LeY3*5C`Y)|5}5$6YC*+CVM?T<8S0TxrURT-__^bKy6>Im 
zr06BR$^lg;fAerIAZ%!Ec$s56z(qdvbxxA-UOd3Cy5WKv>fqs%_%qY#7u90Md=zA!_e9)cVk9el{2JY>orcyJFEEKgxWOCeKqB~wL4s4x;`v4!7$riWnfWJo zd&EKkikNmS@guOby(rmAKKnl>UEaH*dKw3whIu*qwg@=C;MS%K=0Gtp&v<5K6XB*7 z8Diys22EtX$fu!dOa)kZ(vZWEu2`Jt!8aU z#NRZK+X8J|1xC%pI>m~gW+$RB876`lj!yZwWW1Ff=PyOA5|)-lE?b?PH(V!V7Exj# zIPu*wkprCaw&L|Yzs}3Q;bm`XKig}_WbZimVC-& znV8ku+1}_Bt|4McpDPOf45U<*)%$Z7sv2OvEkGyJHFN}SOmSa}x)K-D7Q*0HjB0+| z!R`h7_HdN8#Ejc1!RR0%VMSMro=L)2tLu+0)nTG%$4%EnWQp1EjVaW9_BJSG`OOeI z8E|`Mz+u=o>M32ycVQSY3?o$M1q2ih?&P%`gB$$;=2f+ycuwGZv77(cG~4EANf7xf zv!UdZ%U=M9+pFU*_}(iqAZnr_16g`FxaucOwUvH#@>ER%ma=YwOKyQn!j*NFe7|dCwRQt0Qf^4c( z&|8>zm%55gHT#qW!6BQtzDzCl&^2&YYh0(e8Bo@dWOUiq%-om0zL&8jHG^FnpY&;T zZhu!`?9Nt|@8UkS5zke9RvDc!U5uIL2;%wi;i)l!t3T6SI||51As?Zj(OtUUmEa$` zMsW#d#}CW(^;&U+K1^%s5i?xDF%N`0Fnw2BgXhsMFQw%7q}Nj({qR#8>R!~!-$!rF zH1Uvc&DpM`?Z3yJmJ-{fAUs>?`i8*YDDn%in3U!JrlC=e8OD(2p9vs!pY7sJH+!3r zTDNw3=A1oVTRjaKS1$`qPT-|Hsp?ND2y3b6fHBfiIOm0FJpSilm=0Qit1M(bPUd;} zvu7mZDEu{cO_GdrCpYlMExbNjZ^zB0O#OlFIp10+z}$=T`sP}sZ9RoBB?p;7l3iY% zQ;Ua#ts){;r{9yyzj%tS8B(qz&YY^AJZOsOWPCs(yHu;pEU0h4L=Dnp)1nHnteHgq za|ZMV0%=8&Y8wlqRyBJV_osH^HVASir}l^2)f;d2IVam_tL^>GA7^J4-B}waQzmo8 zu(EO!1BVSDM22=$Z*ZJAvQ5-FfCQztiD>dbsfkKH-OM_Iz7lR&{^%x=XiOa2d)0;W zB72JON=mY$iSMsc*Xk|4O5eXqP1PGdk&bFVRe`wmjxIZS3Bzb>48%C9=SKK z*QR>Obt_p=EL_04{?zM__fDd6@2?5rnvT^cx)^j1>>8OoYh28FtX^lG`Bvun#}BkO zhjq8GvxOGAdUBxEhR9?gBEMNLt-)@7PD#VZHzV#sD~k^biMXb^A|OISJ`@2rLq1;k zh?xGgXL9=7J?Ng|Vrp=>IYLul+Fi5Z2lc0X9xA5sx`Y-(DO~)ei3A=_Q^Rl1B5G5( zEG|!ZA#YPsmgsd27ka)>DNr8guMxVuZ8R}vdLC&SFjDN(Vq}vG;c(~ES6HGu+UGz2 zKDGJIpD$STX-61d*dy_jmAffVe2lv)HMTp>PSf!XPt=HOojDnY>!1fTVHJ8Z%l-Z% z{kgQRkIr1W;|z5tM&oz~3S?($Ya`BkYQ=)H8bj_QwtFPrlov3w=W0QDRPK8H{qlj= z9<0~pZY|%Cf2XM#??fhNz;(~chU>!hW$v*Es^s$<9&+6f%1b@f3b3}fMi0ak-Z-+T zAG5kcOeV8>B&{ zyFpsIyZgTLedk;Eo^$S6^M`8Tvq=OFUZcWbao7#YHZh8P)n z9Crqm)b|S)<8upIfHPu|E*<|oPrrFhrwEIsLAxwnKN^Um>&ua=tu?!{yQ$IQO5QLs zs%X<`mKQGVciGEuoGwdRhjor>Nj+vszNXc#$WULBW1_1_O1?hf* z0Mdgd9^LGiPoDnmW9RjZ(3u_BfX9az1i;$6Uzz9)h}LU7)iwJa6gwrTBXsq 
zptJh((L$pX#*iLiR$>ZUjMUff3*@q7Ok?kcY=)_6g>9NFWV^U^Nk)BGG;CV@y+fOg*@6Ywuv&@^UV+f4& zT%Ri6AEdK>QJar<Ez-oKD|0VTiqV_`2HbX0>|@w45U%a zWu#zmmd3~CK1r+Zx`gm9nMzhEG@NBOUA4U6TNmp{k&AV1KS)l^8lM;hD(UZ#t}xfC zr42%*>e)F)()`;J0xN2V@Qii3G@HP38YYP-$R}3+9>4#oARm5{JU%yipa+9f5_iOy z>5Y?!eCGG5@V8RDlYNBHP7fK;Fw9mV@J{Mg>FuTYnLlF9?v=ez1f_Y&?2cg-T zKH<`mJ4T@weeulXeBKW!X^;0D+O6&}Y9ITu=q(wC%r|SyVa|?a)VKfuN$hitp;G95 zh8XBV7GV0QJD{AY%D8+eaJ$@__Eu2v71k`(TIa7$ZB~=7V>Nxla@vi;gJR)|=X}0f z@)q)yS;_{Q<1|p&hMAKBMAV!EWp;eL;1f28%?GRPKxGJX!^Zc^2^!;PB-5M%4_o9b zn?f6oK;&0FgPNR7ht~k%cnJ;kuf-I3rXk43*6Gwq)SdM42|^gzlgu;)isf#mXIgA) zE16R(2^S)#Ys{d$Fmi>@!MYTV`JX)L@uH6tmnWNCMmGYG1|?Td`{K-petaHVkZP!a zRH`x#uJm57_ggQPj+C^cvX{=ZB;QdUOvn2VR`Gd=$N#Q3A*utU~rvQt%Jr!^4^S^-1vBPjHuC5NfU^bymimPp#N$ zmt;UgO(6t2fOdVo?6_HAZC6oLzI}hA0M)2{6|#;jdo!FuqdZ+Rh0ufWqLbT|UCGag z?mJOec(y|js%3D~DR6q=vgH^fY355SScJrvEl|sTKqE1p3QXfG{F;F84fd!BYlxM( zCKK68{)S1TnH?^i?uiwE1j+7;3rDw9_HA2ek;=}>a^e`Ah{2sM5pz9|w%(bF!yfIu zdbpkiuQ0|`*`4hxw+SM`xb*xsF7dB;Fk7ol$K!>*7zxsVFSW<>AwvU6xqJEu&)z@z z+h<7FuPtHd@iMMY5F%~bD82Ub*?T<*P)BBPIsAJuf7|c9Ie&#V$(G7tgbXfGlITj+ zXD@cy&%!$0Fu&npb7fesTFw!uHG1&6Q%kCw-xj9G#d=I< z*_3$>{5VQ&bu-_Iar`L^s(tb58|{cXx4le!r!J$8{P9pKc(CEaNTH9ng-&+xU{Gn* z=15iX=4QSVr;=v8EJN7M0S{rFRgzNCz)8OQX{;cW(o(~TZBVC8XEMljwTrD$3SUYv zvBmXvG<(8wM``l@3pv>$4U@fo@o6ARu=JQG&({2>fY|MbDoZutK$c}$lfk{E^wtTk zO1ZU!pwT;#XQPCh`=xjMp;L_0<&bRscBR1=HC7TJGhCT8W28IE=yOuiSbCusodC^P zrzz0rpVb}0N_{7Q^!LK@)!1^ZqoN@+|W?yDl5jOOB^j+y;SmmMtgi=9z76BSTsN z0tZX^w1jA91Lu~5-XPKB*?f%k{wSe08x!mO6JBfLGELamr{nPwF+J_=C*_exxK-}S z9H^%T0W+($*r8^UuRLaG)ZN9n@Br=<2+K_YG{cWfQQj^{43otRa z(j>)SB@Z{bF1-GF>+tdbOO9^cJbYffT0D(ciAR3Gq->am(g2^q&gq!QYub{q}bC7DXOVQ?D1>)Zq*UlQ)nGF~BD^9>Ro+8(~ zPFC4n&vR3uBOzp->FFzMZ%INh3iNopJDq>LxM(8EWIGo=S68U=_y9}hkWKt({PHym zqu*a+Itz^az)p_Suc@dV?J_I!ZnsiUo2VG4ufXD2tblLOr0pkuPs)Uo7EZo zmR~MnhMe2@@F-#XNxMmdxSJS>IbRNvaJ;HqchVux5PY&??qKD)_Y2Wc(EMPTjM%j+ zPJ1qbICGKEe;Gkw15d#CL{Gu#hd&}L4+NXT$D;zpAMiK^bB7buRn$u0B)9fX&-rNEf>AtIrd+hrEKyFL`6e`<{aSc7qOmm!%;W0_#uw9%j*KJ$0 
z+}TSk*ld(TK+RA?&i!r$o{`9#A(SggCN226-9l3qW#8W0u;e3!Z(w+Cqgs17pYini zyM;*7crNRJseiVq@1sm7x3l8;T)l!aU?u8`N<+yOBrpgCRawZRG#UQRrP8-a}~kg z|2i9AVD*d_#YxDSg_>jP)jtK`js$%Bt<`d0W=5n5$dx{S4#tS0(dOo+!4!ETv-}W4 zt@Pc~Ri+O0?tZ)aWIY_+y0R@*;6BF*w2q=ywL2Y0M`L`lRwt0+K36}7_Z-d-d zXP;{a$rzU0gOPdXlYSry0+j-}-RO6{5=t}18w&QlEwvcRLXT*~C^mc!J*aQ5nw3n% zyWUyyjq^W$s5Z@QsVme>sNd=o?7WS9hkCi4B;AYHXW}2x^csqn>ulKla#h3NJ^9u; zF(-a$Es%vj0b1dsm109YLqt4K(1qd|auGGZ7u<-EBS1$CS^q)po?`?1>?vbs5CnkN zL5$PAkuNgSVstZ-wxJy?klTw~m0HW>ZEYW-={&bzv9KPBpE0#CjIQo(y2I**khHK5 z(icYLJkDf3`Q^$|G3Rl5)Uc0cP@1nEj_lJ9KW3^YNGXKtQ$@s2!Bnv*Z^hpnopp{( z7E^-uO@D=RwG}7^@zZO@*UOff23i092mue78U<)6ly2`g*LEmsZ1Om7V}sv;B_{(d z6(L?;XYW}CE{58_C{>!uDf@cBD64*RI*d~ln#Sx37C(Ew#5T`%`LTR;)qLh+KU8mQ zv3Q{S-FQ_L`C!_64S-$@78d0ZzCOz7Vgw~F_O>N=Zw#{Ls#B@7&DRu5D*N4yyqSpyREV@$?E7d4bJLRn-Q}rFyzN z-+Xkmh+Py<=$s0K-F%E7bb!eVVu_wQm?2+pl|Ptg1Mp;#e9l(`M$JECr-)JQn?nk7 zfAj{u$dEAdh5@;%Z$>M7>0mK;$NSd8W-}z2*BxR>Kie4RkI%@?fhLlJ|CU%E?FO>f%{7?AF96mp^DYgopIRuwudLvM4ZGj z3g9iYV$vF9Wckun`+zQ1D4s%9js#lo0&v+0?~L1(U9QP{jlVE187tIh5nwJi4@de? 
zuQv@zhmDfQD@s)#pBhK&HT(*q(T1^^d^tiz%|^SuC_KQQ3_M4;NMi}A|6DSl!SBH~ z2~w8qI(wV^o5!`lq|K-3R?Tm~naZ;ye>fcy!l11rgmF0X*$cs$Y|-m-S?DB*U%Rnv z*i-H5&0Acf@8WwKEuL&f2U<|X!Mi)3yHT;TPkpgxo2%W5&MsiR+?RM1KC%Pu50kCJ zFibl8@s;l3G=+el0pLz=mf=|GaajU^3gEc`0I(t%ULm$Qoz^N0Pc{MZBz@P96I$ELsE5wN>PbQm;9T#JV zZ?0dp+-z&Up=SHX(`A)XvKOub^ivJhd6Bh0e<~HKG3yhHv8!FTW3V|qcWw<;$hw#_C$Tkpf4`EBCW$U8N#RRhc@~>5@@xs}+Cw!H4^xwtke) zboh*AxV2##WS?{SpkD-ZoAQt$lT*DxsOEzQ3muTB#&`k&3xr^gL#gNE+1ch(H|H>h zdb<=o$SQIl!SVn9GrXeFu=>2h)uK#Rm}c&KvA3F7pcY^BuOiX;sz^K*#v7mux#A z86W_?l0vm0<8!&0hTcDdvazr5wq}9fy zti!f^01Od1_>`$SWV1^NJK?Xc;6{C&-I*=GLPXi23MTG)A?rfSBMKL^qO2xk70;T; zq$i}Gvz!nd#F{h6p zaJ*r2CJ@l+le>TU`h&%9CYrep(JXGUkh|;c+uoSL6YFg%1@f_0T_1_|*}Cahj@Oq< zh2!=1CMaFa$!wXU{)DZCe?V!aySv3S2UC);PWMzIwhTtg33*`;5?ipw{o%Q#LDX4f zzT7y6z{7`>G8iZN(q`yrwzEL?Q&hZ>s<%8jX`T;0lI0`v=23-s(8WR6=+}t1^xm-* z@BYr-o*)p1KC!SycYZ7W0^#Ei31rSYe605qj?vV;f7nD2pQCqcFSnwgFms;V9=KsI zBxH+9w7e4dj~16?V011_`TPiaaonE(nQ&b~an;hP4UMLjx=339_;KMQaERXGxMt$K zaAbOZ08{Pk7LyzSSNr5kaz#sx=jtFoegzdBxa|XbDzNyb2Nb)y;qHNc5+;7=xL?d>ioc ztL5<96cx_m{HqK*3+9{ZOsxC0K7DD(zuQ01E@DV@FfN|e{Y7x)4Mr7H*_$uaJaktuiKLhft$64@uv@d$%@#2j(T4DrY7}lvCJv(ND=fLKo8(70o+eDCOvh z3(FrHAG9y8NDq4QYtaW&i*duS%MW6x$IDi`mFrYM8nJ*DW|==ufa5aO7l&JEpFKjb zbbATOlgE~m%Pft0LB?!pmd-!XDEpRNXm&n9g%lbydV5=5*K?vg2xDs>$ciN_vAx{S z2EuDaAj?>g1l_Y|0|tXJ2fy0xFH&1TH-^XQ12@|%BI~C^(VZfjGB{N7nez?cg|j67 z)K&p6wegKF+v9O%$yuGgbXT^)vnwe>jG9yqhCRc(uij+LSSxKLousjJmPDj_}(V!{$q#~^cj zTxlLpDwQziPp`>lxZ{0k+woM5w@c=z9%~K|DF$4653}*FZz1a|ozB;84=0pymlrCp zH!F?|b=49f6YoWkmafO;Iu3t|^s5Y0bJVVGxQK#U9~q036oeIXQSow0&`0La7qi9s zs&l`GVRuIW-HslDlyVuw3%Ssl9udgyZ|3QsEB?L8Q0Ie^o2C+F>RG2hPmb3Vr0{)p z_EN#nXetS^{l1i?=NmlJ!;uyVG~{nLt=myKY(~y%EpsgmocrT*JIO`GyJ59Hz+EK~ zDz!@-rV(Um;atw4jnawl_f@;>0DV#EcfV8@F8e=M=nosPGLEof!7Sh#bU3y_YCz68 zX80_kS*Lik(kTcc<@1o=e1R6|vhn7qe>n}3FBx+7`?2Kd^8)v9LMkEc2KSh$y;+fI z+oinea$ULwoCa}KfT=nvvKkEojUrbDh)X)SJ_wDd+a&}db1u7$o3 zM(6XT^f)=o^jI2ye4@;#KDJ>VzdssqAmCF%=xVXlIsvG@$(zPTAEUO+Cli^K_5r6; z(BX3npmfDr`;W?};rO}#oN 
z^b*PUA!%0geZ{&zv78efnSz@JK8}#kuXgY-*W7dhAPg}Wk=}Uf(`7CWPnRhe`uYxf zeR19T$Um5DIr~+(K9Ake-eLJ@d2GQ|WH=L$jh2mn{6pOX>ccm`c=_S+@#>*)A`Sy$ zChMHF9QgWd!HhbsIX3{kw^z2L zvN)TWI#`ebh;5upKLkx+C{@#(Es-2TCGsi`?CsGPP5Rt;I8h;iQKyjqupKR!p)L zm5yl08TMT;8nL)cv2qOmY+-*w;dM{c*u|apm!O9SDxwETp6m4GZ93JjL20ICT39y! zQ|ZjQUjTYxtU2J&03rq7?X9f^n^ocjf>7BpU@wF{!)L_XpoL(>awCNOO*sXY`p0Lu zc=|Envl8%43fz{HwU;?aSPRkh%`F#=#3S+2D^^*XZmb=VjeH8-F)7h5E;gi+i(E1H z=#8fGW}FE6t!4qp^(TmosIy(3o{;#n?eiS7ulvE?w>@%(Rrp9VB!rS8ryJhXi*RHa zuh*AYje1_MNs2ElT&%J^Rx^;nkesDv`>2%%F)E0}s4de}jnw>7y?zih-zjh1D~ox% z`ruu(xk;^TB=NNq(U!rBs6`mEtEN`xdSSJof6cma)d{7#w1#miY|qnb{6|t5&kGq) z@@@UgODeptd&iq0+2Ub73-G1bs|XP(HkwftK|VZ^swWDTy-p*zA$B;on<#EgCIcB^ z+@Z5~t_MJPRT;9tssAjFMoB!7~M=2e)QYU+*t(z}_e4)3C$cxyfR&7=*(}`rc12jB+!pk9-E_ zW!g2@e?HAv+wQSPypsd%V3%H9xfSj1N=@4=D1z^qL18B*!we4VM*``MCTKz4r{PS) zB;iC5KSZ4Az^98WiOmj_gA4Tvj#87@lwYauyj3!v9_G zp)i~nHp-+R0$UXic1Tmf)2NAQNR6GfX3hg2aqC#4wQD(9C zW)To+f_Q5?8Dpm--`*~+aVMn=)-GpB0HkSq>i!nSApIK4I8T(TT0Ro>3Gb++iFG=+ za#DoZZYQE&D4(ZAFxp=Z@$ST{aOxeWo;Dt&U+px`p}0l%>AaqK(h){KOVT>EaZ5=v zTHfI2-yIavI|>`m?^h-UXUKfFB=?=syJ~j#U}~xSkDTbbm0?*f{;dllb}c2CMF|m( z)UK0V>&i-pcKD+ViMSsnD4o=qXVc-gq$bT=FFoD~1%VIfDlAoZ0N139SMSw733G%b zur6XP1k(ekf_&4>s;XSC13k(dVoo+OQO+oxMWKecK&T zc%!c{wh@9uA7RLdv~+hxjs{~D1A76ZQF8SCx1BT^FkYcTsO0-VX&-yCAOuly@ojL{ zi}Lx~c>J*ZidMr=2^A-uZVs~d>D1XLpNlJknl1xo?Lo!d9BoUKS2!t0hwXiSPb8DM zYio4X+mYnb6`=~*Y$E-8mJ#Xiz=ayhTPnLGZmhoPpLSgZNd0K+w|@ads!}SxU_(!~ zMb+*7o4;iZ;AVL$%EtoJP1KKW0G7)q2j0_h?W9|Zfdy1`zsy>04zrIy{Y^v*LbPm( zc&D0|fA4lvqwZ6KpfG=5>b5x~*aqT+0ou9!@#uJaV_W?ELh1BGVa?;7y1ouz>SV55 zpR^ddZq0L9;1icLOeY0-c(LK@p#BOEMjJn)ryHu0^nbynU15wG z1kx_rww6z3Uz@ASl_0{aa3&GZaUM&7t$M8Yj|=+x2FMf1&KI=Ce5deP4vMe81>npaLLcTv?{WTmqHHHn@m7EPc zK1t;x?Z87cx8!cKx@DnOTjD1J5d&hXW(KJBz3SFyBU>l$(YZ|$;fY&0Eif|Zjd|H{GI&j)^eO+rK^IWsG zO1+58KDwHtORLeFCmeurahu;<_6r_Wb(1=I{!)$37ArnKFEp>U7`w0t*4kyDDrkF` zERbGh+@og?awImJge~Fi?EhCPr+(`n6mi#Ne=zk%Uk(Hv*Z`3>Q``U3bO?_5 zaBmFk!t&YkLdg*9G5&MHP{On-fi$}YZYKFHjbe1F`8$Ec!C;d^XKuq|jy}kX4{ecVr 
z^Itp|00BwaESqPBAfL}4PKk+#jE1K2fH4;T>qk@*5c`PfbWbO&9Ly_(Z*SqOsqD|@ zB3~HAi#xg9Pth0g>G|c+dCIO^6OQvF5cO4faCog3w?N>T_)M?ARvz|5E z=!9UQdb*=Hx+(?C0DgDPmf7Ul9gkB6fM0iIA3OUK`T~VZm)7f29mc!Eaj4#mRjA}m znM9grqHInUwN21!!{WmUMF1P6aQPeQ*t|E4BmaS3LmtSy04GHn5PMP~#ImF>n@;~4 z8eMi?L&zE&psn&P19pb50qajwjF6S+Rs1I;w0bR9F>&E;1wX?;DUzO>= z2V+Ldj>{WB-#nW>-RuZUlIvURHLgpp@qGKP9NEZ7IBeegTNCE2WiN7!a2wG@pSG=* zg2IQN@8qU-XL3;N?QJeYaM)OAL)FBMhi;5#ScDL@LmdAwq-!*YAR$4FKXdL|rw6pR zn|n)#?I;*G1>Xb(jRJVoiL^TBb~_+nu28zp26kM3)H6<6J5}uan5($Zh;o?RX7}rw zkhwNCy)qLEtHa-6bVYD0Es2%p9_igz={VVes`T>WuCPptbT6{ALKDpI`p;#wTQr<~ z2li%S8O6}6?+wZS?u|S_=6nfkRpNCjErG~p_tPn#`$w>byCun}gEtyzu^&X>BDnougDG+#;SqCN323wd~8= z-A>fT$F&fL{661w-pr;*5}M5w6w7(-2_@Gq9hqcoInDtXX+YULc32$=mmoj(0z=~+Hc+)uHkQ=4&gkhMQQRnSz2hvgugmKK5; z8iHBdp&)FY2EES{?z4zUbXhtx#zVs69j&#Ep;e`Oj=cKU4zN1+Y9{9g(29Q^7midz zLEY(ZW{nolx}7Bo2ucYNPdqIiJ?%g^8X1Rm7CcL19e$Pd(l;tc=8xxd=u$`8Oq+nW zdUYNX{=ab)E9S@*86(vWS34rNL<9>NF98uX7sHs7AfPpJE$6ByR$1|xI_texrdRm& z%WP#oYeOG}`@K(qx>AjPsM18Yv8m+LH=X=~dMW^oizauGe1 zb(`_r^!Tx;UTvk}iz5SGo!z@O?k7gcKhUb*_)&{gyFT3rMzflfOwd*ECatW#c}*)n zQK8>x@HI?CkdN*=QD^p=4Zv9WY=yhSYSc^(O4AJliq+3Dd9T&q^lJ?Jv5 z)j;MnS>oaRa(`S5{C&%Y>&F&@c2ASNp+1Pc0MHFKHsK029JDC5f|yqh5#W)e@lHVY z-Hy4tR9WV0Dr#)2Ef0iz-AmFtIP&p?O zuyIQPqHRhy_SKO6bKB$fmo9p3$$r-Zwz)3~M7wc}<-CR-!|EOR`W7DSjke!!w^tHa4w9d&R2cB7YpSaejkzH z60piB*-Y&}xH8plYJB(RK?G@ikY2VkqF5h@l`z}vc>#(jTC6Lt7@_xkalq@oz9BQo zP&c2esuD;?#0tFknACKBt2RmV3yM-HbgWSkr!h1hE#Bx1iMAMMln>tDjjaF5>pIz+ z9ccB%Q9|?3d`bOzPWnK2j1CIX(9CPc4W)%&>428Dlm}-V?QM60#jg9&_Ky*RXFm4@ zBBdH*xAmt~u77ap38={4&3D#+?dnifP3ogr@krB?++7!opjI zKVOwe0cC8uMYA^M-sg}+85o13V;E;_Sl4 zUj!EBUQ?_ZNtKi+*eRIN_3!}me49ySjIss?-9c%5xmpA7xRu;(ZTo8v2MBi9Y*H-5 z`35nqlj_pcq`q5c{}6#H)otD1Kb{^N774gjlr=5@)d3z-@EF6Jt@P|~o|9I! 
zzG6hq;JFRn95>U^+a=zpi2UqcS(4Y2fS0(U!2#W?H!q*XMv-yXT`f|knID#;qIkVK zukI+h1y6{zo2E-#S9OSQID$+e>D{pB!wsNWNApX03Fcc+eS zR*SsJE{yDJ@^;VuNz$p1|8Fwvcwd4S9E|x5Yq|ILo9I_5etw0+zj)m*fE>iM>>c^U zoYtlNa-(MJRA~^>TCi0cz!tA%HMWF1{dRpF+wS8{N_836{g-B{o{)#+Z+c#MrGcAV z=?*&|z0SnVi>@U;KbxqRe7$m+bfH0}Fq8nc90QW*OH#HTPBq{9hs?(dDiRV}B24VH z%J%KT#cJx&n|$`vig^PxDyKU;M!p@{DdVVO?pOVCT3KUuk1Cl3p@AC}dD9>>KR2iNbqS_EV`%ccEZ;lMsWABlpt*7=!UA4-@Ou zQJ@AYSLbc$Fkq_!^+6J0=L60TRjHN)77@qGq%)hTFi|t<0c>tN%mCHL2PtPHbczE5 z^Me%wTG!!M>Wxau1IBcuCO~+r28cusW;sWaFW zFa-IMb%SJ{ySze)!k)ZqbfCz zCqT%sQH;)!cd-rLaeBsc7ktIrji;A_0uN8JddTmoi7&NZTTBH5r<#Yy61-D{0kb_# zyDD%jS0X!!2g(Ewy_mBO%_VaKV@f-JXk}aa-hFcR@&?!SPdIYy&4~e5K{t-QSOxZ$$CuVOpBnKl6uAo_fO9GVKhD2#_Du0a0nQgkzJ!UBLG` zx|I+tS)HAl{Wb{IW1;QJe|D-=s-en=8Pwt#NrHtset@S~i_ys<@3Xj8%%Avto8vQjRkiuo zF-_R8>iXx0upb|dElkf_y_L0vUy(?M)2gyl2bS@%PL<^*?M|Uw+9gN<7_gb~{(@+q z&slOBpHOY&Tw$u@7pE1b<4!EgHN@m?Du>**lGQ}x{hz1GJCjnAZ}BM`#2KseM8w9e zdWQyB9?oX%%CG`nU@fI`K@UCZ`x0b+zjM{zIOrEGE#|PLB~z=N%(KykkR||y=tJfTl>Pd(lLVxJ3=pW_Rt%K_ z^v9il`C29~FU)l6t!`ajs@2Q0GJ-3E2PO(i0k`zn>)L39CnbE~nsXbl0!i_dXAA+! 
z=v%zaQW*iaN>-X-t?Hy$xDmHQ6d+?w#+S{i9c;RFY`I z?+e&$ojk7Juee6Nx9ZsKTE2!z=mSKiGR z(9f8PvdjJcS3gt>vKbH>=HuI)ueTR!B8qFl7Sn<&a24cFy=`H8-wOVl)KPW+^hJ(G zE0}4Ns{2??Vm~;%3;W_J&CvGy$AZ1Cbd z7Z9GH11TGuf0-!TeRV(%tcIKETUCyn9H2jD9oe=iFZlY*u&j_J)=vMGoGu{@=HYkP z6r0IC_Z4s14+sh{Y<(yEGlm3pZEtgaVukG`d+ zNmc7V$k0=v-P~FNK!%!QE-7lm4-d(-XL~dy0mOevg(@q&`v(9^z#V-x9RasO=P=p~ z?`_5?B!M9;RF|?k4bdAm^B>Ooom-#MYr?$o^!@5y`EdHAc!*C}Oi=CAA|q$4;xk$V zJE>%*g;lUD|H%qY_BaqOF9npZ8erHhTit?M)6~l4gU2b$n2w1e;k5q6aFS3G|J{g@ znd6S*-dEpzDZCGQYxav**dd^XC>jp7$!cL7@p2I{2#^GjNnmN7BnF*b<^zK8u0&GV ze6vKl1)GtPX?Bwnsm*M0?O!Nryrwd*Abz#(4Oo&h}f(Y*jE?@!VQF0v>pUMZtvi}^Q)Mu-O5Xq-zfROtAl?>?WkDzgk#Uu#Z`)gyz z%$YpxPq&vOa_y|OYD`?t=N9^~`A_9Q;!u!Bli=?9iaBm~REU)re=MZ(_o0C}}i+#cw-lGKN5Yf3|Nn&P|Y_@Bpx|;0;1!HQASYnSUsy0(JSLTnY?& z+Dsqz=O9gT0$`;-gF*A395+6s=HuX@%BsuToUN!0ibmuBeP%72jgA2?aLKMpn7Q9w zJ^!aLZVB(hvoC=|YF{xMG^!(W-v69+;m1io$&v5gJ!R{1TB81(|MJ+0v@};0bk5D6 z#_)JSWG&z#ZO%<~I;2C--)3t3Nn>$uMj7%7u*Kr{f0KHLlG;U>V?ksjgO{=Anfzi$O)6CS=`Z+Lg~Kv2AC68^6!-XC%J zzRj8Y$N*58A;BNZZGnEjGOKwd^G=D0NgfuJ-g~Bgh1yRFTnEzP(TT3BCFYO49wE85 z7aIoxtfPyNe6zXHY!hdq*Xx77gb#jo<2;wbr!`VjI2>tkTzf}AbAF-~3oL%?f7^ww z(=3yQ{F+0O?2&}TG@W0(8rh>Ylg$?wTGzguPI^0>UA{cYGqZ#k(*6{SL}GC)>YJ>{Aua?$}k5>V1%4TkuS~RuzIq?TRsvP-~59~`}vW>u?&`N=ea{PoQ2er5Tx`dAZILY zeBjW(0x@B?8{mMHL3B}~dEz!;k)30qoSkg*wzTAo$+hcO>i;bn$WT#=*5QP^un;0sT z`wucmn**8f)R%Rs@q2P-;7ZIxHE-Id z_g8d5-P4#I2-!hJ2a@wGXz667?Cka<1qvei4r%9l*x8$XcJU@jm_BYG+gjBJw;7)p zRU>8sEv5NDbBtvdZR%a2w9l)=+A1 zlLQU{B>LWKy$#DUAYc$L_The5KuIlEpNYD!*0k)C;lD6IH4q0Mp0UNe2Ay6jR9apYwls_P6ib_ONC_MB(D&~o2e#La(QQmOcj;&^eD$yvuiCgjX zsrRe9-%_|)zOsi0o{Kw2cN0;Y=MksISMMXald=%I$LB;;Y$G;i-?h0#X&@jZ&JT4C zwg8hO23c(xp5w&>ZrjGA4-j|AyyJ3IEPRF-J5&?WluAdgjj;YGrYfH3~XpzB!n%|$=tU?+~wG~O;K-xK_XSB;bA zTfop;wy#$+)th6mjQDTU!~z2OzN!qY&VyT?zxkbuM9ygy_(AXY2ndTTx@r~6?t4LH zgLSCrR1YZ%nwj1=^1Iy5fd@W~z(XyXu$^RoWb$q#qhNPUZFLBOnM*sz+g;HXJqwR?YTTFr1}*bZw7icbAd3mvR0K_>R|zk8`{26NNPD zV>yy_(g%7lQfrOAME24XU4{YL$FV%+(D1zk#88k@^X*Zl;)e|~`}u{YBF1A5IUvzX 
zhr_toySc(ck`{c1M8ZCY2TtxjXRfyUah%4iu0FyEoq?!6-Pu{V`c(?Mm!6lg z2gCD-u(pb$n{&&X%X7;W1MnlGDCo?nlJ$|K@;8_D&V*=?i`swOGglIRr8Lq9&Cq{| zc??27+R3SGi&X$OzJI=;okUnPklJvy0!i3FR_>!rLB8wGxA6eG)#t>(IMSC|HR6)O z#xl}(LmWAtuEt~EzJ!QE^9-PQcyrtUp@r>Fg@qe(aahu&L^-=5{#qgdaYk-0m=#GiiqQk=8%YG*)ew7Sl(_5~!$NPBnem>Lq zZusSF)XiTXsPQ@bG6YpjrJ<3Lijk`K&4qwKd%i;%wu+A@3RF>?*~+J==iFWw1j@H- zID<{!fV(*1uue64i2Jn!4hNeb-BV+3NIf&^;0w3fV9;pP8Y4t;K9M`oN) z8SZM|@uP86F%R)%F%)p;rHfl(NtW_#uMj0JM1 z8)xPX4rLC!DbN4*r@{ZsA5(C_3~XN)PEGzRrqW*1kF$Of9*lABV8{3|z66BT^6pl< zk{BPa#jq?XrJ{w+0~2YDC3BDUe+f6k#S=inpQ4XuI?>}T4PUY}y*UzEurZWP&tq8a z-d!|}M634pE(CfjV42kaRmx_``L+}1N1~GvPC7n~u9(2Qf4GfrPY;woJ0MScLqc`8b=kn$m!bqA?!F)>>XH3zS;Kf%<##X|8$dji zRqs-v_l_ILLY0YSK(o{cL*-Mb=d~jrfGs42q#rcTuFBg$X+1aY>wRq5x&qqalE#4^ zpf>_z(!cU;S8eoPKa;IULW>4Pa_tsNVIL6{YIPkmnOVvvBGhJ&yezUgss zJWX3sQyD=!1`tK#OI}gzWt6A|YB?2p6If_xXBp5z2LZf-?Et01FlnUL|921n_3t0A zQ4lt*7aHSdh56*sl}e|0P|OXI2+z|#f1rhJ?7@o+(MI5 zi>EpU)D5bQrlJsNQ{hNX(DRLj)MIcAdhfQFvMlPG=u?0kxAycmi0;gB_j9>2GZBu8a)}mci8yMxw z#&(QDR12n$kCn2d@CVzXcqeNe+k+eLiU_LBTGjGpi(0R)=XmY2NGty!{PWe>69IGt z=MGNx1jHD+BRaR3^>M`SW~J3?wU*CYnLyeQasQ0Por;qS(z|g(d(OJ<0t10&+sYNWP5Ser*o}LaIC39i7bTB8DjR&&h%p^m_ z&IazvkUQB-;CqRd2!^u1<$P~#%h=BDygLR=xS2eHXPYg&Z(ruu9^)3bV_~6_daTmK z0+EhbL5tzwNJSJ$!cL$wQ?LbSbCWo5o3>KBz332Z(Z|BVZ843Mcy|qK=$2~J4T3{oESDg(YiRNuA!Jnac z=2L_Tkt8x;sW|M$6^W8sZ}0sZ0?Wd02MN2=h01yvy{t!0KLMyoaN{bm>+1jcT?&t? 
zx!S+&ts!^<(zJzZ4GoxJAomQtT(CEWdRPat5TP!P19a8NxqJn_`D28Dp2{P(?;jAP zK9@C~&4ltsP5MvUba|mhI=>SD?v!;$fflv~HD;(G!SefyA)q4e6T*e+dgQKRgV8K_ zPQ2Ka1We}pZ=oG{`H?7r-4UP@R-8-upEDNBz!|v;7eVb01Z}B!@3(=w*UoLKp?s*7 zjcjHg;UwRS%R!C)3q0m{6^vHnacR>ODuIpu8_49pyt=Vmcvb_XC&%G!PI7;$2<1N; zwoa!ffD}@#z_zC*unMmRrj`+Y?*ivkZn&tzM$8h((9vZPY^X5ogPK1wv%||`ul8pO znDb+tm68_sKE>nej3(}2O2&CNgWbBP&RB}CZ1DR&0ih&M#9LO+wt+4~;2mNQp>eY1 zM+N5PVLu~D<%oobbYlE#(54A2FLv`S%Ej)w+VFunDS<1R+iMhP4L^&3Ng$1h#SQ$P z=d`tG=st(G9ag%!7Ah@`MxqWTyvr950f9*vTdU{$ZY-65Z1V`Xf7{(cWb^`fKJ{=m zKi~Ap1o)vB*m|RQuFOWHu~RoUr^7UA**}c*B<+lipTT|hP;m=7NTbu)`2V5oz2m8V z|NrsmC?hMY7b>Hu%*x8BBPk;}l0Bj!BP22-hollJmA#S?8QJR~g^+!0j_m9*L-Kn( zNv~cx@89?K`KOd~p4W9f_P9T;=a0oiud96V+ZXCWu7q-pL+x({|o5#Nsq^Pjb?My3nhpMQ9y zt1gLS6OU^!$XD3w%@$1)DeiyiT=-MMY3 z^m=)ql80VfF33#KVti9=%NB{uX+#k;=nV&?!?w)eKI7>gN~is6szbG!Y|yi}_ZbA* z;?aE_72{Huw^<&uQ-Jhz=K^hB2w2rL&PKtWT$wZ=AY2srVpU+H?O~ z?Ey)LT0PWFR^6+iX6AP`n9*?0p6q2xCB8}ud*@?W zFlx^;H>na~5*7>dhX|rSfU`JJCm?QN?3(Cl1R$5cb$bY7ZF-s`fN~8PGnYS53}%=? 
zhc$n}+%iEs^8jF;pO0!JppPBBdIZ1FS?iYP7LHi4V@I#?Fr(s>o|Kxug{edZmva%f zD^8qHhPn1P%nW{iMYsRr$QC~3BfvN=zlwA1vs=-mE?YalVq<6y(L)DS6#li$E* zTuL=8wTj^EE>Iu7N8L@~TOEOnSY0s0Twj8M^3wMhfBhFEG@!Jb9T&VlBzg}$Sw&e{ zul&(H%sv$B-s>ES295S|@0js+4(&g21m@~rp9&N#W%Yi-Ym7Y)mCi5y;1dBraY0y1 zuqu>$JCZ0z-*wWxHq=qy?M*w<+Y~?RzAEE%asgmg+x_=8MgHjFRlUzUZ6m|!QvT>+ z5@Th6`ok-g2b|Kc%p!L~?T7oILXOK^tqxK%es^-8#ly#79UI|N8q?}`o{4=2xud%F}b0Lz?9e2KXUDj0LNEys06p@kx zbE1-9?4V^D1}MnatrqjFj>RIy+U>`aZ8gC&hUCg*mwz}p18Vu|!;!Z> zEF*9mTE5@AJ6cdql+RKQj8mYJkGTC9EO;5O!R}pxnRpQH9XQylQS1qqkv^XsBF@*pyK=dm>nkqDblXMcRi9 ze^*h~AUUQxzb0I$l(T$@Ullu<;C<~Q9qAe#OztU`flgQ!O!}}JKLk3b!N>n&PoDCC z@nf($Jz=mmL}Sr9uUfSHC!{IqyL=es*=*^h~jJY*q8W&=P>|5IoRaA_t2^ev}>Lnx9kvC?~Pbl$>BPG zrGZ+t=Mr*hQj+BUWSizhMzI@Fm6ew+GYh_gX=@7t%Exxz+w0gH%pi34@LU}3+WQvg zDd)aLx7-@b|E?Z4w0o!wN}4-elPqn zH8<8;b*KNq0iCD&s)lK(!WXT5w6d?8s2mXCH)cE5S9f8kWLQjV+$`n-{kFeD(N~bL z@%mEULzM38J+P0CyCCG(k0^PWtas4y+oc-YZT1sNFcg2&iFxuy34&ZUcU?+gejfY| z@KCR*;&}7l_q!kc`#Qlb?P4Tf$uh}rFzPo&Qk80^MTm&h9zrsWM{`OaN`Qh}cR#!;&s_q)N^JDera5JB| zE-wFwnZP$C>0?UN+6oJk`1W^Whxy&vMeiK{##sJP-sgOhwFm_erS5qUi3<&~UKvjA5?iFcyyNL)HyU*h~)3dV|?iMgO&)+5Q9FIq_ zZL)Ycq(x6y@Na^WYS0;w$(dYZ7La}h`k+0#RRAEXRTD(ja-t^EIzaH=xw|!H>VsAh zT{7wxyZs-&1pShl_~(hl-^HsnC&muc8jyHbb#qG7&xY%uawRe@oDQ>$ldBCJT|g@9ll&?Dgx{cMr?#JL15&PL4CjXs2(; zAQ1y0yzdl;kq~Kos(!Qy_i6I?X8@5sc&Y5Jz3`3eq)7U4lR%gqFQaNcMOzb&AST|9 z$(}PPI;+oJxO##!vrA4 zRJqwq*o`6W-b3&%&j#5pmiIq>omn@Vhr_({GJ1i}0{5SB*lW;Osx6+Qr9R zHH!+fDJ37UN4#GFUj&x0J&2{XtRYR6iMZH zg6N?_P*R5Q;1bKj{xy40u(R#Cd+y%F01C2;iM|$6&P~zL;7sI<6~D|548~bl*<^b$S&M_{TFeTw~FYd$n4m`-)PaAI&{)^+d*l{M6nn zG-!@1Bj7EaT%E}4^9>IY4xDW|`mAszEZOHlG-q#bBE?tNNS)04Fp*(iypJ9X?Pi@V8Aa-? 
zl9y1h?;Em+G5j3cr+`rJvlZ%|{3#6J6hkVG@BByc*paI@#YH59h_EhK2-}(pQ{1ls z#(K_J0*0PwRwKGxg&LgB39eNbA{e;UuOV=ajP702WF!?_B~8r_?Zy*=@gL8HCulXr z`8}6NKOE|A>T$`x-1lIVG7GV}lrUkYGT}cDE zwBypVD!Z|`U`+^iOrc@t_d;@d6X*-!QyLG@@{J-(5yQH8Y~d%@_9@Bal=Co2HefTY zx?_i-(-1W5(v9v#>={VJ?YW1?t$GaS_U)e@j?a>n{h;2G78Y5uaZP~dZE>@m&-pPW zi?lxfD`rnBFK<0c_LM&Ho=LelN($|m| zE-hS4#21Kn-}#|EHxS$y8x!;7$uoYl^Wrg{{hOmXcXt;Ch|d+BR+D-Bl%g0|Scbn& zRdz4)g!j*`h3Uf*0_k?AK`4*)Cv#q-yCF>q<9{zp%6yPiW8k>}+)k;JI|2qyQU^o;D$*Df*hAEQ74f> z{*iJ6%g0^*h68RETwV7`Bp9UB4~ikr?1ntJrjxEBn`X!y;a?4}h!IY)H6FWfrIWZZ z&wq=YqT$o8YstGw(&Z_MFMwQyAaHv1%mM%Z@!NsKZ z6=~khJbET}2-`N>Pj|RORr-+Tn*cN^_;rRBo^`)&AMtgn2+@w7f{IFnis%>Ux_V59 z&3I7UVi^4L`FiFjU)&jZL8r>RaY!M^P|KYu+blO=?4baedCrM=Gc4lnj! zvey<~l0MOS(5UxhjkKFV-qZ6us#Xj7Pl)+fcXw*w#x=P%@-oyFc}H_me*_Me+WI*z zJQV%o?PZsl%0Q+ zYV2e25`p8gKXiZ3;>L-$O&BJtTr=Bp^uIWxR&#J2SKkG>of9B>Q9({Rg4%2}|p1>CGiLb8d^UE%1oGqFmt6m)pj(rmXR}@@{PJ zj~9p~IpWr4=?a!gC-N}Y+JQ56Zrl-+obxSj8iu4@uGQJMaC2D`)yJdAPI)A*LwWOt z1?f2fybp|8GTfOpE!6Abpjo@9(dwL0>GYroRMZ0)=W9dcVP?tB_N2sdA}7z?k3wArRwb z^GptX176H*FQM?GjgDZ@-+3N|g!xi*-vMn4gU%&Vx5U)V!T@G?Wc9@SBw5wdcVoJ960sQ2v$opYbKyQ!-`Z2zkC{^_nboxR#z^Ir}< zcyZ&a2~QfY@hh(_!E^cX#|qlfMyf^*+a8<}klxvMxO_Y66V8polmEK%iyu)gIlB4% z5fLud&8`=kS1-=x4d1waRAY5@YISktOn1IUPh#Cl7waLB{UoF)aykyQ`*PPCW{H!fndsz4-O(j|{4mQ;Uxmj_IDGxJ-FaDr5z*gR?~Vna_QaP0fh~s|9WmF28~JeQpveOiZWqVIFYc>S?qdz zd~#2?PU)8aFzNR`>URKgLj!wF$V@MNfAjE7r(vo%^ zXnQx5FdO&vQ%0TNvo>u#D&8|ccDy_D*9JuNU<2(dS(2ovDtbCRUqy2Q@pUnQASDLV z$>Ca<)U0GxlU@(!%1otb_;7;$XFYhG%8%N;D-9o=REq}%diAV}ant!{|2{2Vh$BJk z4($ctjx(u;^T^!UGUAH~pF?BnekN$c-Y(r}QsL)F5I-EufCv*SHgYxVB9+5xSDml^ z)apuhj#qK}NM`x_lN5i5G*AnxN<}`}4(~Fi3(`kptZyE@_TS%cC;|Et?BuXR{F5w- zQPK2dU)^U>hA|yxhmkKPl?3cvRLEVdvECy&ctZswnWq3;T*4E~>>nQ?J{RW_zxJ`<4)mEnKmFJG7dT;kuk>T&bfzKbC0U&J z-r5Me+YU{vmu3FPKEgrq?5t`-41rh9Qr9%O;Q_uGIvXngpKsOg0~O%8)uan_Q3A3Z zGMgbzZS;OP{Bz;ZE0ORDqY1hKVAA;R^Lzhd-nwe_F~HkdQ+W?@p*h^JS3I5nu`KSn z*fw_>_uV0w@-Nw1e*h*Lp6AsT*8$(mpufZd-^a;~a{t!UKYl>phK+oiJ0*`I{{T9` 
za>xI|-@5VZ1oC9e%!2%dKj$ju_ZDSlW=f6DdU<(K;S8q|`+d$0N|0_Pk(F`bvQ|{;=$hr#(UjL!r?~P$D0QC}{^$#5Y1v_(AuExMf>>R5s}U{sQ+fLMJaIha`AgY=M-3(G4B#e9)rCI4u^ zb2S94iWI?a@ig4zC%R+u9q`+6GT7x=LAtq`?J_Alf0OB1Pq zuNFq?moGj=tcJORtB3Q%WN|YCu@e~S!U@aw&Nzc^3y33S1rYgee%Xh=vK-~{qLTx3LTj62#Gj$e9#CJg)q;RnC3$Q}6i zVkqkqqSUYit7oAcpB%6S>qN0{FTR??ey^LOgCR%x-%;|qk;2{u((JS{%#VfFTz3|T zT@XxYy(g&0O^nii1Y;tgyQz7&EYJ=j#x7G8e(NgZZ&c>tOfMo!!3}F%ry=@Lx*$z( zuDZvD={f2(;b7VJgnXj(sy_w4<*^rC5~ToQz0_nmkkC3huP04(qb57Sc08!7M3Mgy z%M#hB2rQoZHrdz77zww|Sju}igE2PF==72&E^Me2cyb0OJW3e&N4R3>$3sk|wm?*l zn!7Z4?y>~~-r~9i+go8--092)E+TuN_>ATyuKS6|37?z1P9nxWjy?; zwKe}B6^IQ}+)WOm6d-4(gN^S0e{b|Y84~-2OyPbDan&Sf%^j^hhjSVYk%ur{f=t9G zPs*IA{X=(wH@8_ha|@yfUc``H7Q8;CC@3hnr{MPDv{$$HUV^W`Cd3%T^c<4~CvWZ> zszz*~=Q_>&I*5LhE(bo^wW78gqFHcQWvvWMMD4bF$uroHdg(Ds9-w9L_b~l*_+n+r zJ>s^{lMG(RyG_Wz{A;6tFKCoHJX@t~XdiK5y{`r?t2{-dm`4^jv>-9aL^&GUC6h(e z{6--T#CfIW-2XGLIB66T6~XJ)aV`UKb%(zEI6}5f7N;hze#JJGU&y=0D(BOYM=Hbx zk^;W6zfEL60|zDv&YAp!=l3}kG(RuJH}T6f=O^La=IL>nbOF*669QPQ&GRiX5ihdMs;Z3*Cu&cz5X`iqhwi`an?jqMi`)*$an2We?_5 z{$KTJBVHz+rv{8ZL1I6D z&EsDxjMb!kMEOmHjfRo(`k+`k#94hvmQj30c0hx2NQXIDryPnR(}Co_*It>$wE;%=-Y5XZ;Wfhsvfv#q`76w-#1sS@42Lu6vpqsWDu9Ge#!UjjKuiXT0k8J* z6(>V&LF@4cl(&S}848PHUyz^ONy@n8dSzVNSyI$I*w}X3TaZ3CM_bjtguTnhvj1JK z_+{C)!+FLjfcj^7Aae;0jDdq;iqVF-miS12s+vJxoz+B*jdKAD(7~h`cUKD+Q$N-2 z3f+QL{GDXcM6`_9)lN-L?%6Hc)z{bewmMMgVvwC=1;Z}bNITX4f81&XBAvS&1GSSs zvPTW9nsJVqhix$StA3)CZ1*4Ln0?2TjKljZp6&kh-!rVCz~BvzO`6g7uowKzXnOAT zSs?cja|WW;Hp&J_;%o65k_XIhJIk>X>vZo6>I;`y;*mcz%CxZL750R2ak(`G)xO6N zum>s+xZec+RN07*-bOv#HYZllot9oG9jWMhk{B~VnZX}foMIF3*n*`5Y9IO_#zaDT z8O1}6c`23artOqG8iw+kW%>KR=v> zjfp3R%|Mj^F{pY{*4?N$`k*sG=N?Fx}wq4GhFr;l4Y-(%u=(^eExiz@Yw&&-#( zRg?AU{gnH8T)*=f%EKk5Xi7V8Bn~(C=F{@>bXyAvyIF)FSmNM3K&~>tSm4!5`CBc88^+|QKt?9m0lin_Mv6+ki z)(nQTsO+sE1|!r`o4|(@RcQY`U|90c zHUFhaNhZ*wsp=S*)ogI)Sy?mhTMRBss0!De{=2SBo~dkf+0xSb-AR{;QJ0Yo~PdYJXnHXi`;d zqEp{w`slxeKgB0}D1qf) zdp$n*Qn%S&_{$_3!h<6`bm?x&IA!nqvgg}T$W|=m;pnw&0t(;Y1Q=fsbPtr 
z5`JJ!#+PG(Td-mXYm&$Ilg*Ag>k+DY&*e_MIH!HA@PZj{#+FOnk^Eb9TF-n=e9b3Zm3aRIcmbX$j zrwU1pWWLq8Kg@3;v%>xl*$(fLD&&nPtc(iM!8On-ss3H1!pZ$dr{#hcgl3&QWqmPoAtSn3^_SlPKx>jYX z--w>=2p+d|$q^9p4oFH0h!}Mwl>@oIzT>yk)(Sb(^ZgXLkjtf^j*QT^=fa(CSce_y zNf67ePhVAWDmoC6=emvKaVFZJ_miY&@GY++)jx(^8bdJ~)ed4CdjvH!*RB0e*2G@hjwwP_iUt0A#_eG(jhZ68H%cASs38tv|Fnzz@5)Fj zq+3L`>&nsX;>FAt5390a*feJNX~_?{B&!h2_S|nDnAFS>lf{j`sR2SA1%U}^sOU`! z`X=DK+!LZ@a^v!9==AN%mH}!rOOS~JJQywEqpA6lo}o@zSML~V?9>{gcUNe^q^J4eOdt2^*J>MFTf%`#%WWBwe>xbWhN4-ZkF8 zPx*YO3te_S7njsWva*k?M_#h89)ET^@`9p!$t>yBVoOyc>m`E7(pgC5bHssmmzD>4Z#kg>V6URA7IUwI6H@L}4?_{|VF7R!erhZYBVcF+6ZZwAzsOx~9#1#_p z6A58~OwY(>&qYX_&lTJL9~-b{!(IsqJ9ZLYAEo`|dN{P(N?XS;VmWu) zSx8=l8p>lYW=9z=$y}eN1amMs$#FN(prD#!fcjAKbcaEWWZdR_>LdNfNgQD{cz8|> zE7O?@v4|o4exx4vxd|wz(n-y5cXpd~U1w1npMfko)$S6*rei&&JHVYe%ynsJa%!By zS}pvhFN8EI^aUl8?`fy(HMJ^U1e5h-Y$>c`N2Q|)pQA&_N}W8MY=y;9e>;n)cch+a zgr9-2J5utlR=a)4iP+H@D5xUvS+^5!mn@-%NoBIkRv<;KHwHd+cpsu`Oga*lb2FDg z>>N7Lg1QUC3ygKQVkv8S5777sYw+3I5R=b@+`;UHO{)|G7vF%f}+NgG&decUM*Q z#-_QxEf&8rA6_Y4y!D^O>1Wrh+m>t1mUE6Xo`d^t?2c*qRi|WKjSM(O`y<5Bh<6|K zT>@`Y34pFYMB>?PLYDbqk0aa-&8D|)B>oleggW+MMmwzHP4%khQGjibx2=Id+Tv61tR zjD9Z9RjmDoozGpuH^lZ|zly9?GDA8;QzCO>=)5r@sAyrAK2@mJ4ePI;V^^%(3rJ`1 zeD9xISTo$4FEmK~I&WTs^>9u$%ktXEA@}G1>nbUm~WUt-X*17CAs$xyvo5zNlH~(@mS)FpU zxvFQVk_;-?DgXXow(0^ z(m4Y)n|^U%yK4O>HxmBo>YmT+EDMR2#qIwkFq~S2IcQM8MN3e@goPs#*SoknV^#!0 zIOD=TLP<8qW%(#Q1v0%Fv!iF}>O%*0yWccZ4k3l1L{uF0ev`Qx*0bzyY&zWgKHrut zOSJcK`geh|ycqj5OHA`VlAIMjo4N@*SIcP~Tc@DG2i#f=69t-LW=5*osg(uE-#buq z5A}{A@n1kZ>vft4s;!e=NBDfJmJrlsp79_6R z(x1aMQ>pF{C^*;O3T+=P+pQf^Abu_`kyTZDm+~SzYpmjSE$AQ^fMXS}(h28B*=-Ca zKkMH@&2AIlZ9>~;=Xvp9xA*~UBbf<>fJs>mU#^7qmp3Hk7C+m+)cC|sasIYB({T1T zyV-NTgtnhQMT|Ism!lX@jASC8?8ci!l7+BoSId(OsqC!kHu zm>Cz}PQ3CfH330&jCIve@zOWC^e94*ss@RSbxmlx_c^5ei)};=?G6fiHPwIfHx{5i z{r&R*{E^YNLt8^;5*$JfqvYb%Y-C_x zcEs083uh-59*)y_`j^>SJLg$$%)s)Sm43rTtLo0VxL&_ic3Iqa7koWv6>CS^HUVXf zb*YI-Zz2Dyz!{z5nd=8;w%zJ2ETs8v`$yoqrcG!mvN)$|g6P7Xy}6o@1fQ44%3?}i 
zWnZ9gvFM3_s{X~gAJ%Hl?LnxUsd@PB!|um@+1vGm;7wr{>Neue-v$nAPw|PpjcwYfn#;OX4^g~stAaGW!j9U}(fVc~ zet!KgbIujIXD_cIf_~Oj1Q|?QMST3(>xQU5FW78%K9G`l7gdktlm!X6V|6R`(Tt}cQUB2kTl#mv& zl64W{JGb@tjsSHwE!-K^U5$K8^95X-I4I;iN2N~kkJ;rmDGx^5=;>5iit|689;XrJ zW0j3%hLfELZ?tkn755%U2((*f1aJ#A98@$)Rc6pv^={K}b}7Vbk9NQOqBol^J8h*K za^=nI*1=PMiyMfqE%V1^g2k$xOAq!c3u}hY(D871!M6u=hVvilx2E^-LiQwl{o;5{mve7nFa`By zTO=HfCgA!!d+*Ni+_h{6@Roo@?jS@G8IX^Ey4$?t@q&oQn};>a{@%~^B)urBmdh-j zmRyXc*iEF69d(Js7|?c?gU(@u?g?h5LkC4OTjsSq)u^v&YtUmuLRRKWHZ&SRmi(sE z?;on@4mjh+s}{7y1-!rhY&BF@BH+e>ucsIKcG+IQ2qmTTOq~fo^FR*61T94sj?p{9 zNQ2qW;v<4ms-AMJkTJITcDsp9d~&;fwml~8LMd6)TrrG6*7#Pr|qp zSQNQCwC=58PC6dm8Jr!N=~=B?U2G`KchS>x@)jAcQ>GcA_#%2ONt`8lsw=N&2WMCJfZw1D*61maz*mv*s&ydz z3XicDX5}TNA85a@!wc} zLpL6$l$G9JT)SdI-aBdDo)~P|B_tehZ+6-(_$JBQkcbek zzj=Xku!RLlkQ&^ka{t-Ip)5 zwY7`V^>IaN)_B@XMWLU!tco1=^}f11I9IwU@dVl^$Qw0=PH!K(dHC=&4HbxV)X)pw zgw`&C*ww46-AhZ0cUJ4=-j}}860pImzYTEMMc-Imu=wS3KCe@d$+QXczY4galJ%0{ zCadG9$1Lp&IHg*7Re*{j%W>uar-RyeQk~ntr+z|QtIKRkDU<*5RYiqV;h3$PLmbuv zQ2ZBq(_>?w_&hI2f=TZxuRxdcm}rlSzq3z!R06y@{WA2liZD7)c+A5fAn4ca$R%T< z9u=Lx)&;mcg36g?@pn)lZIiIs_ol1Stc#Gwv59pgq|0Q}MdEBsEo=^}%L%q0;{o9r zxK^-1OM_T_o9$EDKh|ya(QxbSuSXSgt-6F1+si)GEc>0wy}6%AKP%xJ$JELGTqV}! 
z^61g;%AtywR1w{+{?HZ8kh#QS>WZ0y{GsV(ZOsC+dFYtAR}lMVM^w*&EwnI=@oWjpx+!#^L%&i$ga}Rst3(Ej_NBUo@VcOUDBuM!LU-l7014?F5`Xu2CLP+f z(9uJPabQZc?-DvKcz~J9j_z+idUg6+#K76A~=v2FUi8zF; zXGZHb%O!cHlF>?ShFnirplf8dO`n2xasRjit?JK5)=3DaBtY!KJ_P?1k1LOjjkN;1 zTc{y2Io$C$XT;YzbkZ$!Jr;?$Q;QS z=$!Q8(IOd61VY<^c@kyFmLE3Hs>gD^yV;sC}?o3uj+GMF?gi_oxV_5;jWhdEvEBi=7AlWRWkVUG5&RKm` zWe5MHu+1JzA8XTEbx)@4Cf@a%&T}>ki{61z^E@&(S1W{AF(M#u0-%6C2@NrBoCj~J z-OyJ^HLVxoqKy5QSifVLgbN%~$=ZFplM9+SCB5fT!BK{SPqr#S=a`SA`LQjN z=~c{BIqylwb_Ol>nTREW0p~9L&`ZLbjz~iKfZ`F=7&TQ-vO@u+@m0TcaJ!w)C><--K0OGge7aU~(`Cj(Q%F1^EI zWolIql;AcuRI>?D_YfyiUoK{onKw@OVy3~ax2GEyy1euaMw`5&k;JqmG94c z%MFy-?tU8<%fbE?<0AShqHlzs%e>ycJvGp`;Eda4OMx(b1It{^rD_Jeazf0T8s?9#Nd%Q@SY=mpZMscxyE`o8G%+j9Q#L&**7 zTkGa!mL8UQV9f-LOx{6SW@RU92syiOb{3FU<~UfsE(AK3OTUu@w#;kd=t^^QM&|b5 zk_7ZhB#D^NPZ~yD?}aiR%-?C1m`rv~Nv_Efh}|J3_^TgFTYN)TNp#zOASHtHe==)| z4Vx-T!!56Q||ss=fLohZ-mf^I9JT4=mvUx}g>6 z8hVWxZ(+O8r0p{1;S6a&>fw~w*taJ%Hv5goaV5D^J?Tty+1V-l*YUXNl;pA=7o-aG z>Le&FG^Bi9?QKHM6{-k59XhxfJ`aeNm?9UwV_of3eU=x;ostY+8L;4eG7nP=DPvN5 zsjW(;Pfs*>DqDPDn@blTxoVA{5yNd8*H`{+gns{U-@s`6efoL?fZOkwWH=tqZV(w| zn2%G3D&HIeD(ZKuc@wa0u6*Tu6DX{LW-QW*{M)dZQ9fGvIdS5rBl*-F%U!qB4pB{C zRb8EZeFi$-dls_m4kfhC@1|Q_>fD9w<@-V`vY2T+#QJ|oM6O?y|RDKYHqPnWT z#?Y&(+iFUk-$Va^aMQkw^)!UEB;C(EoafQwq?S8IeVxBtZAnt2t$Kv>CpI%fmxOM} zfeDOjgsd!WDK?KN!$O?eKP03!(ASr53{>=RCd9EQHa*)6-mf=B6miU6ZXQ(+=4}oC zuBF8`!7$N>ntN%f5=FUZSQX+-UOeZ#8D_ZboW~VPFdu(!@?&y7_ z@;!+T&K7)Ok`9w0i3`O}LQnS}>?ec1y&e`mzuvdk=xC7lgq>;WQtbe^{@-!SXIAVJ z>t;Pef=g0_&jWfZ@ZaR2jX$13kGQ8Fuho_qMM&C9J9gsxQ5Wl?-ShN#WmoZ}$W4#o z(8Ss-asV(y@D%oT1*=tC3ofpmMT4G$E){}H6z-onstP2mFP!?W!Q>ojh@w{i z)r9@SYN0Jq?)D@I zPxp78@!C`bg8>r*jw#L{ofRF7P(iu{C$ft% z)M*)Mz5;ED+#rb^`>CcEH-Oh}jvu?Hr}-0IcQ7ICM82?J*S)n2yRHASY-D;gd7B?f)ueKs5yt?Q`=0P zE!ZK7M=sOXn9KF1K&p55yi>K0KI}3buUZBg&g2;?xQ3O2tO>da7u=7JLCiWXrQy2S zx~b6Zayi-!g+wT%Z+#7LW zAuFSiIZo9d-qEoWAF7Zs;7K;8v`y@XFlfUv7c!eREH%&@_;%{T+~(9zL>>toV^UJ7 zJ#MSmD7fzSaDMH8S%GAOlg<*8s=~4(quk#)|JqbT!Tfw#0vi^RF27>2AtFXAv?Rgj 
zL3>$gCyYbBwjEC4wBdXkXTX)9#c;bIo(XT^u&S=@s+UqQ*e!A-ck;!_Iy{l7JC3W^ z!kkgdH!1hVlfhoW$m6p>LPEz(k*$HYc5eOHoS&$912Yt;vHw7M-2@fZIs)lse4X?z zL>#{haS5RohcvuxHp>spP-`0D%RcZw5+6PJGRjazY6LPy#F|#QEaaNQSWjlp^VV0U z9Tf0j9+a))ASu$*mFMYRul)M3vko4P&&j@bZ#d(U7L7yckZVQE8$Gqnj75bU5T4)j zJvsS_P=5Qky_|h`O}rbhVaX*5upW&J_veQ^v!&Z@)2^12A?_GgtbJ&6&}$k|@^NaU zrKM0!D9)a0v<&atf`@=PM3c9m{-rDqFE<1S|79l?v%M)d!yGsshm_>zj9xV^RM*qv zr1IGP0~v-jgXu2k=$)_;B)fQ}6$7KC^6%BGn>q_u9kugf87e7>*l~1JJL#B11a&8i zix*3 zGRrOpR{^SlaBwHA+^ubW?P5q3ZDUeOgm zu;(G_fIYW$ZpS|-AB7?2Th@9ke|sJzLX06KiYMMgcmaSuRiO(oN-4P(Hkvm^8G^QP zz1II{^F~n}+`^S!r&#DMI$0otZOCze+V1@P@#oKTI1>UL-ru*dcs;IpYcqmIM8&Al z+t_4m%FX-AS9ik66*JkTSm5tGVb)j3v=SXKuFL4&W|`)A%w3=Du(@*+Oknh$jyxEeX_V;eGtp;j20rpAW>qlV#gQW zU~P=`PJxPAdC+RXN81iUMhKYN4-;334MgT`2ySLq@LN}SGo+u%cohp1|HTaLu0pc=am_0aHgHFHLKA$xkx z=pvpEv#-B?Y9XfP6+Y>jJt?Z=5Z%NH#xn-^<829XKj;Lf<@ogq01m#Is4Ko%j`@+j z98A#Cs{rg(jP=w97|Wb+J;Z12g0UXF!F!$HApP9_b-*U&cSm!Rt?i@d*(3C*FaJ{; zI=nKjdXyJ!L;x|IVGh4p$aUgFdbEo_L3D7(S_}T~)s6N*?}W=lb@7uR*U|-kkY?lF zxzyt6dxcQK8n;Pu@J}pdn>#eADi;sbFulx{$uBDd%@xaQL~?lr?I_zBoL3?Xp6|nj zC0Y3+G$0N+?@+%BM3PBKmy^DU;qHgTZd8SKslOrLxEJ8x6Jj-~81wG8b#|C%sae8h zt4Y2^H4K39%v&~lV*ny4!)V8YX_kR2F{LHCQA;l3Eu z=BVy%Uw|V2`=9nYFgp2{X+U2ln-=8CchtT-MmA^u!Fme6#t2Hj6Lw$f8^XiGXXMRQ z7KdAN*-ir*e3WUV_%aa>`Apzf^;*m!LsW2tA{&rsD<@Z)~9O(0sp#@EYRP z&SgeT!iy-w7N6StIql9ICX`YdzMBtZN&nG>W2ofaT=tT~-6+Eq84L*t=?Kz!#%I`^ z`1q@^@PA!cFk#r&`QW&^C=t>_Rt>NKn1*fQdk*~e2YyTA$ot(MLrsM5rP6aEH7j=n zmv;}q&AHnyU3EHk-M+wm6hXc!7d6kU4JS1S6{)O+0#nYNY5o?L4b~cDM3z=q<9gDZ z(I@?vjkI8o0hkX(E10npdKu@pB%6&u9jB!BsKqJ~DZ?TBZIl5wZFV`t04STW9J(}$ znv0lP7gceP>xp5c3o#sOd(1#EjoD?md7lVX$y{b%YnFtuSg?y&XM}Nh= z(J;F5E+wa|L0J|ju@prK(HT;(Nw^5)&vV=bcV!cyKF3|enMTtHvbPk=KqX^Jw(}5- z3F%cd=~{<+IHzRAW{GS3eHjS=p%AA__^qA_noz+~E3?n_Zl=8e z;|A^XA^edL1L0Cc9-N zcL1%^3HGO@ue$uo-8wt0ahumhy+m#_$QIS2WIzV@4(&s89LEm8^&)2KR?g79nG(AH zaFej0$agU8a_CJ{?9qwqa7C_U3bg8m`*P+}sGkc72;7n*cVIEF7;WtaT%mo34Uttx zncCc5v9wG{NM+uyu@Zm7FVw0R)$R#!VF)VV8U=_`#rY;4fm%xsW{B8tC6dwyt@)?R 
zb&4_r!iN>E@U9_Hm=Oeu2M=%I2PmGj6m;W4&(OH(TPQm+@P+@$x3w)ry(NuF$ph?m z148ycWek`uDo4boqE3@Pa>*b5{Qr^m)=^cqUH7OU5)#tgh#)B-BHba~(x^zMbeD7r z(nxoMG)M>{0wU7gottjh=i1=&e!ufRerKFB&VRaLaL2V`uDRw~8yAf8)<&l5HCtbrpek!Ul-d zMJ-4#fw6|>6aCGjpNPyICL*=N1Vl&g_R43rIbJCCb+o&>4abawdSalYdzQz*$NWC~fPh5s&* zDSyMzavceCK7o@LXmEnjHt_C`#6grZ`di9Hu)BW-PwYS(SoppHKT zx)NoQDP+EW%Hrr!60KJ?cw7)$PA@x7_X7qbUyH2JpEY*yT zu8qQ`OL5sys~+dqf)3`^_b4R_;id$q!oN)9Pk`+_1Wu!XPTPqDgIPf^MOkJuEG_>p zm$W(m`F?&V5y{r_PfvVmk@Sx>1(1PsV{HA6={H!vL$_)SS-I+L?`BR(n9ON7{znUs0uxzm!|8za-a5MQt7<1CD zURhV?ytTjFLxAY@Y@^Pzg<|!7{KN8|1eAllMgO`Dteb=oMwNqm zHskOo#p0hx!;}F&8wi*=928!b=D2@Z!ULA|{Q-F|+f@YY zg1h;F!RY8`>4OsASOuIyT>4Dirp5c~V$vNyAeQKyVyXU1EHDAX6M9qI_~0xA3&A{# zJb3b;b82uF9DmEwuu)e%_Q?epSF&w19C2T9Tf2+@%F_#?K>K6tCov*2>#YC{1+W89rA!LL7=UD;#f7ahp*90TC9z(``~U6#;AK(4 zXR+Xu^9_+4nav8o8sF)<_dw1rnk^_%SA-0b9Y{0{IhMKpn=DNNbu58XU??A@<-guP z!ck%6uaw6SBu&6p*mm0dEAYTfb{QODtTFFu)V-q>G_$%rU__2mKVgK-O>eouZl?37 zhL7rR?Thr=S(UuR$E>p-!XM{zg%<S*W{CzzdpGkdv|o43)#W zL-AtDM?N(ON1&MWALF|14dU{Q2>;vm5s+iRJod{{NO9KcGRfz?dq+lUjsjqS0^i3VzqpI4gUN;|epRr6WEkjU)T{+;zki{43xf?Ze+f7x;1*>H zm;K+nhK7cAvgkLvQxyz9e#YaI3HK$lQ@mE_9xYgi=j3qlOn!7=zm#V&V@XkLzx3sJ}-3^?yGHkWm1^ zf=qrFDk=fK9`xKmd@wfjePx2Ay< z3tMXgUr5!7f{Q+f#~<;`#|1(PR zQVCHK!J?5?EdM%(+rnx-KyCk$#|#UsrFLe#zi`pU_H4y^DQ zPu%!VKlgwBf!PL((OPXi>q^0C&>Jh#QuMITYzvtv4jUJhLVQEltU4BL&!(b@QQ)}R|uJqKLqt{ z+;RTDTjJhAMfU*xzHS>b8N$(iX$9F<#~*N^p_To&7hQt;PmN-`{a8S?*fbm<1uaPc z!F(7Hh_U{m=!a)viSlGT0w z59hVWps%CD=XkcijA~jUUtMwQU@SF91$3}=iRKp|4jiYd0~7u)SHGDQUVvn!>s|6I zQ{66J5+B0(f3)xcn1!1jF&>%S*jIc*;Q1o9iKt>3SR04y0V6~8w{#DTK}_3L8vZwS zu7Ux{>tA+iJ zY*@~C@d2IfaZO!A8&iCy3wv(c2MqPhZeOf7EJA56(vqGiV>t z4@RDwO-ze74#fb00H8Fc{!dEda^K7OMA;^4yfKsQsLBBbgbC82$uAJ+wJCRq;tJrB)xKK) zEi3^J0-%r?R4(@se3PqKEYXKMfl3tA6_cr|f6){AfWdp#YY$-CH=AT{*2`8L0@+TS zJV+`&Kkj*6lR?nDM!Xo@hc|iVNm`C&+85r$2zFDyR3-N@1Ii)C_C>Gb`1uDjS7N8r zG8D)y2Nw|3Nd4=c-Jcw3b;_-yGsm>^n-O20O}Wa@U51akH@6H9SAAzf$2?Vld?9T< z_y&DuOFP+e9JgEe(p~QRe#L(#qXtI6$Hvic{^A> 
z1ahHdVk;jA+n{Y)pBDC)1%R*gfFU15Gg*V7!hMTyS#Ke`dzrFaOn=BPL+fvHtYDZQ zrdb5*WUJOAXoS+E;L}dO=?5xY9m&`2aOm?rEk3BUhHmdh&K_^R9P#Yj({7Kv`pxxp zpe++Qn-Xj@zCM~s-a|}V6}T?gHNJY_KMQTAYs+dmnc1b7bvY|k)SF5AT)XLUz%_&0 zygn8mQ06+FVRPk>7UdKoNcUR{#%VS|*U;xrXB+FQ1@LbLQQfcym8r>o2P6@`oLf(B zF{sj`Ckj3_NJoID;ypZ{{F4}6lu1JWQpEV6AngCyj+oqW-~E=~p7{c}V%WM@nG>Mp zvTwd-HYoZ-{Y2)xaRAQt%E%`#?acW!NB_EMckVdNU-0~63-Z;;)v%g@QAWzvK||ST z=C1zCVd1XI33Pz=m0%ca`dyX_1&Wt@;dI5NC9eFhA|O)m7rS?NMI z;q$)yCiv1H_*eqkV7ZMJrCs0Ripcnq7 zVb zit|%}O~49SKe0U?X%al`tql-=T_Rnpji1U3)sBSR+Y~sUWyKiPJy~di>S$;nQ$SDi z+Vnk^#&?$j1@NyIL=^%bTz3r4-=(-TIi1sYe|mYQFTh_rV~nbI?%Rjz)ND%YK3Y3# z@j$iw?&oIcV(pcy^96K|7w@Oz5$^cx5ydr<`a`2-0vTz{SSmP>ttGHWv1jY&jesOszWX!`>p~Sj@ zDHqTuZ;?*FNe0d+KV@L=&RWl{>-k@@)CwpSxvcxoBI)wEyi{ngDSx|z^s3L#R|!I3jfylm;|<<`+qw zBC9KIc1}`cT$3yK%XdJATtc?EYMe#2-oCaqm7l#LD?>fG-1SnMIJG2t?xsC~lzJ-V z3{CTT_yuy80t0f@8Yl{BQtt@xq2O2HzVV^B{$cm}D#A;} zP0G4uy5%z18WnoPb=`~0JuLf*1s<%Hji;z*WKun~b_ePqstb6Bsa1mOcnGp&%Lb<% z+Y{s)_?<8Q3dr1zz-%Cd*~1L4F0Zm73D2h2sr#Y`EHlQt9#Jg{?2^HyzhN(ebBG%Z zL4X5;vy=OA|I^}1FK_**;9&vLi-Vqg?JVP2)rW>2?K(*6X*-ktvsPvhP#AL7FNZ-b zrS>8jlsAu1#9Z!*{4?-|43v22)*U7T0k!y`xX1gE%6F`Ay{i8}@Zxxu#baSx-%ZAO zm{B0_&?Sjo7$0cID7>D$wH9^o+xbTe!1HtFL+zDROH3`9`ufp6i9Rv;Rrp6W`SFl? zSlF9yY=&;nJ9$Dq`b$m%n<2-$cb!NJo(AEP`TA-BIY$1;4)m}c1T%pw_AG1ZqDw$li`%tuVkG?1;ied#u_1v6F^ylTIcBM&0V~iHXsO z)6%oHn%X238#xU3bXPKU((QBu(zJPZzCzF$I^*Rz*C%MZe7f~I*XtzN>b}0+CPCn+ zM+bp_-_@s06KqOF#kt9r6N0vm&KcgrwM8$;iVZ~2RrG5Vsl*)A0K(XsC6#odKc)Mi ze$F#c)EChy5Wy*M&a?F%{}eL|(F5q{uD`w%I|qMdWn*Q=hKuvg z#n?vSMy;Dse5q#FnM!D^Ywq&c4Bg?=S6-9{>Bog}v^W7^%rmiCV)6atHF%aRE# zTgcCL>hi}au#SStJKK;irICX$k1o0F;j!LYQf=1idP_n%+e&>|W$r(CCI08uC0dz~ zb^7jdvkR?Z)@P+Y#t#p6&D99dr$JVNPtBhx^C9jVz(>C97US24e7E@p@$+7+ou4s! 
zM_8AF`uwYtqk%Tl-3!<|kZ8?C{^EA>qB~YB6#tpNh+o?Mof*M>345y+7gO_WndBe2 zY@a#6RZj&G*6Oq2;~&NtdmS#Jmrgk|MD1fhp|flRWdb7z1b96~8Vn@exP5+T>(M5} zJ>6lgWdYvUN|DVpeOudI{&Zf|a$gS&VVCsAD(8lm9jfGBN+M%dQs8*YcY7Y-8??}p zDfdJv`$c#1jRUIZ2x0e!iSVfJxYJMbf^(b!l5{U8)KKL6;fwj0M^rP-7k(%lC#KZf zzlGN;wx=<9)t|~W(}b`)Jen-Of`7JM7u4#;O4D8SwD*0mYto#uDmkYiml4bJw^3hL zj!-i%pk-RAPlTCX^Qzvn|5?Sq`=IVIUjS}meh~_bdc+;rF^rr)m7Eb&o_t%Q73_*T z^rM9#Ln+1mS#V^^6Zo(@z&f752T%cvX-}_;qI$G3j%?z1Hwb%I?1)fBqLm&YAs?7l z)-m?vYw+9$)HeH5{KXn>#H*}=ZPnTe!eUJyM9wO4Uo1*b*U)=aowOdM39u?ZA&Yb2 zWQQPdvrAC=QtuvVV;cPu45}DB(XtF6`h?w{<-n#|mhiDlLR~ZwotGV6T8uwwRDJf@ zk5uk@8ignWn zJyXbRlM#6skxmCS3EwF%JY45pU}#a|HcTgGXz{-C4VD&@c0|)iWe@B!gu3Q7TzZAn zL^KI~^x@#JGG~Uo%rad4NuT*FZiMI2YmdrQmUXq8ryDKnBz5lH`H!(%0b{566t)oI z>ZqObAwn(KHHun%!tLUCTRAr+6fum@VAf*DI zCOm+9w2}B)E&&+7+O6^5K`5Pe2L}C(bv@K^AW;C4KJ;2fjJ~UUvg|H(;y#J|Tf_|> zRo#BUImEb90TgPEL`@U^fvZ$Pl*LYwr5{lrDD`9?z92>FivBKKh`W~~Z~kuao6zB! zQZ_?`61AjA_Kzq`Zsmt!(VE|^g@uvt-*2le6FxoPCTaFWZ_De`2X3b1r{|8+ttQY)79WC;Mz@cDYMu-2;laQs;hMy>#B&Z$gk9%8$b0RkttTu$-!6!{jx3 z`Yjme?%v{ep=LpqEak+l5Xd;bu1CcrH0hB^LDI?DRnys32@0pW(Vtkt2#Y0*mq{s- zAyr(uO|FO5QsZ9fAhdH7@(Cfr!2%-r-Ckruf8Y(^)eh;s;RUYyiX7pWzH5+I?pAYe`FNp8Y|ZuZu;g&H z|04ui{V))R{Lns5R(F&=SEh~8;4T728nRfRfak!(gqC!}+tyZc!zfwcBpU*AezUf%63ivk+lu9FF2iror=TE3&k3kUR|BYY9TXY!ZJ<%3JD;MXbN(Cn^k zOR8Euk>q)Z8x?DpoT)fdMO_&E>mjR`?BN7!Hi++xOF!e3rAD!0U0tkNyN`@zNnFRV z6NOXIap*#N1Ha|!(w?<8pz8#3CiksJd`jKSsU)V-s-kbO^dps0dxaD>2HzrQ?7xNg zH~}Nn=;^7jEH?2s=`ioR)RZDaS*BQ6EN&}HbtiMx9r0YmqrHI{R}9kG8@o##MGs6y zT)5@aTFOwAN=HgSwEgESx!^|+DCaI+>QFz4+e-4gvEttW@8K1eXA0(xhqqwZ@~N29 z0MB`}q3IS`;Q>n~Wbk4$#GDRXUkJ#aO_{okGVP`+_SrOA>>4BFYb? 
zx@U18Yq>KpV$vtaldk>Lfo0Bbd~t(X_%h*HcyJ2aKUrWzgmUiGvaVLM*zEeaJGPLg z_!U10M8^CU6T7JxpY%nO5U0 zH#KFHUnt0YEANj!$(nB0!r9=Pwc(KyVkB05pF8TIl7;TG7({fE796{{AYFmAfm}jY zM|x9dG>Z>dPjKLQPxQr$tQalG2iVIbC`F1txnCaMlTlA;txLSf-bkhXZ1oy-sVi1m zEY{&Bd&BPj0(J%$Wnh_lLVn&o=8nGj z7{09^jjj7DA}Jz1eKDvGx?AHsSBtw>Vg!BpgcKK{KxU#Tu*?QJZ$8xoz3;Tsg0OtT zqVM+R9UG^)fmHoh?_TmCibxd7kyh7^hQT%dw^^LBo1ZnOxcPG<3w!-2bl3LMgM7V- zmCE$v2iAU1br=oiK%yR{#4$Mb=CsxK7;&w%RX@|lbNmMG1?K;9uoIE953lSM<6kU) zD+4~;-MmGA4&PNmUbKCnvTgf~hWeci{JxM0!j7OTdc3rXzqjV*AgK%Vx5NPIGd}Be zEHt88(P4#Q+i^AzdrLKbN!49FcfxkMt*3m4@Gb-CHqEn9f=Mn_g^dW1gc; z(@)6_7@K)ux_t1*vU!U@PjqtL>y<9^JTk4hW0B5!Z>{SD+GNzjz1|jB;{W5g<9^;t z2N^aMuBnb9dHRh3x1}F7ET3dm#bfaP9HH!c7&xfJ74ZR)68Vuj`)#C5WmBraleqIL zlN{&hlpA)^osx3{)tx3|o(02wcauk1gQz^Zc?DiQ`ve!Y_e;;+TV-`;ajbw>omP3m zOv?aTdcT&q-g0>-e$U`Vg{--6%-6?kD{_vK=d(E4gXoh+2(qEDLo%OSP!^a{|Ln3V zZojsew-NEy19^`2DlV+faDVWRPa}aGZDQ%K<5uwO}W}gE<@X zta_e8nSK=?_)K-O`)4a1R&dZNz>ID8$s%_X&0# z=}h3i6cUQ@rYr3H%?>X?MJYM-u~LCRZ;kh7)VJwD9JtlKK$P*(Mz&7}97=srBhdx` zi2IZy|31?SH#*?N8?Qv@?3K}`GOme0|ZwvH<>?dBprIC z?1w8(i%cmg;8Cl?448z*W^cnl)&4~NXrl2WBae2%P{uEjvmQBRIaek2d0#K<;RI48 zpN4g;dEp|=M}{GWA}H};Hjn50o)%!6a8j+8O}dinA{NJz@$Q#No2b&Iu}t#)@I*FQ z&y{pD?XN<_PHZFJpY3F|F>gZ7>zjS)AZTQR7cBdu4SYd1{2@P=d!@bG{Du0(`HUQ8p+jt92&H5uSjFxGA1Ha&+6KBRPs$Bu@jmg{ zJyTA-hWT4)sIkd5M@{5a7BMD(`iSKk!^*dK zIPQD0%NJ(=@!1S{rCRpku8PD6d$y=FK;yzG0YgjQE8v2#<)cL`KaC6O*I%On zq=Si3`EJ@BTj{an2yPHD?dRR++#NJ@5b%{~{+}IH>+->;H{pOu)RD|n}B>~Oc*!B}foX?x7!ExcDH-G33I9uTyL_5wX%QdvLeG%@; zTp`4~hf*<=arQ=oZfg>9tDJY;V#Jfb`+Ie%IUgL;IPSEB98LZ*sD+;R_Ov8{+o;3y z+Fj%99y%I0y!1boUp;6)f9Yo)71rj1)n%Unc_*8?Sk~pV^QoZR-O=D%M2G*7-&Z5? 
zKuJ&mt!h1`anBv89p_tTL7eWw#G{6kewtK0|7H9?Q6CGOGWYjFHl;qck>D&?!Du;Fd{xtfqlEqtM?7A^)^Q~3Edeh)^5s2NYO<5QQ@Q}mrFui#FFnT7QOry$(N z*zC_~CE~)mVBuFXIU?7&+J11dCKLU$p>%0ib9KxG`fQk?C!JcawSND|&Nq=^O3Q4a zX=H0ef%qS`_FaqH7Yb5I&m*e(ICyA<57)lGl=3`Y;5U1g5(#(+uOwOrBff?b#)WSY zr95sMzr-P5aEx~r00-o}w{dr-`6{UHoq<#z-Bs5|M!fq=^5(JJ`G_VT;AwwV;t?5q z;PfzB<86F3Q*}ynVf`b41;tZ_NWX0qIkUDo$T=nv>-3x%`2s_yGj2M*+;ml3jzC~& zr6Xl&7;~Wy7CB3iFC+8mB$1JcNSuZ$H>EwCeu?qpFtwH8_|n5Es6s}CBaF=~L8v%s z$^yB+xM^@q0C*Q+aBHW6{@jdF#ZZTwSWx=vcq1P}>@Q*~IzAr#oRhiARfyQn7i85D z7Tdo0O3bWx9xwg=C2fy*W!)-Qtm8#Y;-ph$_w?97!T6*Im{$s3rA9z&t+lYpgbUH- zU8{{}V%#qwdq?)G0{I`_q-7*$!<%O+*N7SYOjTqR!T*NW;eF)Ej4%}BDi(J%`@731 zJqW)Pg9tOqBL1hvkB(=SHPTx_x4ENL3I^yNCXG~fi1BZ}?umm{K?|@V+pUZl=V+@5 zek%?9CKH$|gbXXkKcv;%875@8*P-S~-97nUZIX=Y3e)2=b`P*D_#TDOtp67ooaQdRQI^FFiu zmsbiU+TYF2?K9Gy!-)#NeYn<@$@1VJ5M6ZM-(pT}IrT=^GoaQSsLNGQR~{BmTb5K~ zB~)CPuJ^{7;B@$I{dj@Icl`G|-X~7$Q8t!W(?z6O&zK%=1~L%z={w?9;*%@uuD+tF zk_Z z>~GhGEJT{-g1YtB2v?F8-z!v{N){H(gaoJK6^gMjNp46z%7-w_Uw8H4SFfeDqT+`l zow@&TG#FfL7-ROxH^i7MmoUbU*H||IlhpCRb}p=yHO$*2gU6R1LQ%kGZb6nUORj*$ z*8RA?S`RljKWU2cvu1&84RX_c)!z~K{aExIMA#JGT4T~XC0w6~NySnNWJ-kl60&7V zi=Rf*2EYFn{D#If^SM+EJ^sS?2p^MvY8jW}A5ksPcoqbuciq3SadG_s*OBhNTx&hM zdt{C&;Bg&t`p$ro%E^%ZZPzL@zc1CI<64cl@u}P81dTZRvaxnA92Q~|KH{sPm}#>@ zK>^5k#>0lH$B&BpLc+XhXwoHyvg-YrjAS6aNRj%BE8D9e#}=qs93H0-)Irxc1w1dQ zg@qxe3=Cvk^7h{%@N^k;q*E9Mv!#Oymp{{3Zuouv00y!tIFo*jq7`V(s8tG+!!Yel zdK*KAWjAL2d9G~}J=kDC8vgu$$Bd*@g<5tYH?5{fik;t6G@KzzMNi_274RQ6Nw$9Eq zhB5J8HN&dNr8Apa=cbxQ6DC&{1&epf9V}iY9SzK@?MG?c zqRLx3!z@#MLAX;6b3DUML^E@U)dz2eTDy~0Eg*2ojyhtU&7SHg@3GItp61HC3^B)G z`>4V_&BTg){}r{=yQv4tup{8?&wHXLjWXZ_5x8oNZ?;2uKlFH!#-R{FDuvxg>}j6I zx{1g%d!LRor9%puJmMf<5VB0XQyA_(r`=cK->>UA{Q=|_i?@X3p^lrd8vOxKllWPd zL2!AlN4VSIa4-|bk?-74Pw%_`MLm%q#GR*zLFbeGKlK|zgWJuJ3HTaL=I{zR6Iq8e z%R~x_6Zo4;6-guV?H5NlVhZ_thB8!~vM=ig6B?cr6u?y(*u0(QJP<91ctdKDBTh8OP@ywXVoyYRXVtscifd|8gpgk z37f}pV!&weSYT~h$w64Y_D)uo%pGc>75Z-fBAdZj0tM^def3YIR@8n{R$`7gScTrs 
z*ZZPxW#d%~Ta8eaw4GYcdb6`LGz}+Hd)5TsgeqcdnPRAlJ9Y;pwRKrnE>w6x@T%6S zW~rhU9;%cSK61-LEvPvuasrmtzl_0@q;TK68 zcy^gq)q+tV^=Gh2F!Fnyk+*+`_<@=ZKQULVznINIv5oe<6)ymr?~&izVxQ~EsnHw% z6bxb;JYG9~arc4scBYoFR?%J8RjrR>j;CVjs{IpE8LLh!2@?Zu+-5O9B$A%xFjFs< z%-A3v#8V7jrPBG5@V6YMI!S;05p#fsvC$K6t?~IBK|)~?XUF=!Yk-F_R~&MZ1oC#j zXZtghmIp`Y*$a0=jh_zje(Jm-o?dytQl3W``Z36I@8JyXKHtVzLbHE0AHc5SvjTTR zIu!0g-Q@?r&uT~PT1pyOGjz*_3kB9J)~A~F)cJAL(Az@pdjwcTha8NHtbKa+u1!M< zQ!rPoq0wf!V&ly4hFeS5vHyl&QTbpS8O9TFCjA^$6Waq!j#7oadJJGBq<6hg=(e1TflH)r=XhU8mvlNM?qyyHgpy#B7msPQ4`wgG-;d$QP zuX~?_Vi~I+@wKDAIrtU2_FJ;qe1N>=ICrBP&lcmO@r6NK@zv+C(j(J|FJ9Rr{!N@n zN=TH{1d5VES>OJ2KIrWnD+X&;88BUxUYJBYeaX>%Zb>lL?-l;WB=FtIGZnSP&{}@^BoUX#eJSrH=pRi$I;)#yO;K&m`q87 zKO|C{@|mWD63CaiHCfMk_+SqkYw(?XK-A)7^84D1zdv3cQktwh}y4`FU zCG=Ql{vR#CBQWT`t9~8 zf4rlC|-!LWReVB`p^-?7l(w&`l0-D&OWGF1Se$^3a~ zz@1ax!&w}e)Ukbx$3!hJO&23vG3s4OpVI91Pzwz==6_Cz{+YHFG0?%GiB3bk_EXcH z5ReYgE+mIat;TZyA#{bJFD@5Yu_QmE_?Jk}UeZ1-{HRbg{^T5F?e`NkJugOUI{LFb zP(rUX+FR%PC+8O3WJjuqf?dV@(o5f&y+!1AM&D9Wr+&kosW@8wC5Mh7Kpr7drO{Qx zKm84Y5#wl@Fl!(B=iK~#lUDo(y`}2be4*s;N6pFUxYAX6sCP|tWea1>oNQyvC-|3qdTmwNkO08|MSv=`Y9Y-j`yO^+IGD#r|{!f((1JDRo;Y= zDiX2W3*WS8HXNm`f{@kNa^n%FpZE_LhDj>Sq$>sHw28f;CROg7+-12JJog4YTF(Y= zZ>%CyhD_Vwdl*9#O(?Ex;CBaRXbCMl>&Q(#D#QgM%OHA66JjCO(H%4tG&K9)?S^g{ z-Jb2Dw+2i9Z-XU(W58e0vdb;RVTh{(6|k;L`Kv{<)taxa9GycF%b&;93~Zm?qc*{1 z5*+_f{+K|k&_v?-s*}hHT9==c`p`S(&;t{3D4nCjsKX=cAK7C;RtWTcbAT7dM2YzH z+l3x)GV8L2d3mjXH-FML<3o=?ij2>`4pW$S7L`!_ZmO4c>pYH6TVHm`O1^)-B%89I3?9HVZ0aGwt`QaTq_n^}6Hsp$8Gev}Ab77}R80C^xD721 z^^8TVC>{)pjpCx{yb+uHgZ{liryS9IUzb5@XqqXersO@P{UQnMGr+%|TCZhuetU9W z(ooV0+JT1Kio@TCvdp!&I*Ab*9%wxP-6cc0+ojps%9L9tOaArmK`$Bb}T;-lI5VC*F^;t}dhywK{ghtofaqTd|X zHK=dKtuNzvM}2fMMVHY?i|*+8LX5JB#CXkVi;7>yL&dxE#f$Z^sr}ZF?vUwv)_C)j zw*hYhwPtjk$nlKVX_$*l5MB&4?kI7K#D58ua=Pc zytAbtR(LY^m&o(yN!Ys1gg|uw0AH2n8Yjat!s{ezO}h1kWD zn}rBN3`3fb%Ca<#mm&9agw|?R{0B61q<_nsVpFMgpGtkAuc0Cf;e>1LzDbaT!GORI z6B~R~1C9VEVI1Xug`2tvJjX}vxnu-*DT;epB|X|SF3J^^3fu;aBPkLPDiSri!@kyT 
z6GFJggD=iUa4ynviYZ>s_w6NQ>H`5iP6O&POqzz{wv4x<^#iIPZSqVJ<_9g@gU&Q8 zED9fszuLF(l1-@Arw^J?yKWt8DcWEuf3zefY`VE4V=~HU!B+oUbgjwmtu}$cU^T7{ zxwQ&{705~7$>OxCe`L|(d|tYU4dDiA4rS_qJX!n)EjeL*eR;H@uNNwYuHMx`Ik$N2 zMK`YLU03}*C4UmCk(7#aBsn{h5(6`Q0uK%JR%yl(v9Wiu6RmYU0%UTRGYl5-{sy;N|$(PqCpaG4rGE2Tv(lRod3a51iUgx$IcIGdviP zH6H1(*MzTXev5P>L|8q=k4N%_zTDPk zaw=o?LeQOQIyr*~D@frP{EzZI-kgmHTU&QYu5S@!WF3WaqlZ&n<5@1upC<081}lq! zUV~UqX>(gjmzYH~MpGw&Y(W|CSNXSA)EOQ?yWj$q4ObTD$k3R#fBhp6tjo#b`5N+T zQr$HsW`px>7L}WX!q3BB)*I?V`8H?7_JttXjQ^J4m zr5J+3lPo9OUW$hhVp{-ud zehgu{t?N;Jl&g&x)XlZnD>FpNSs2vH(7yA$3Ps8{NT-c1E4JN_j4qC`;LK9YOu+tb zyTH{#w7-f^^FzbVlS0r#xcyirn?;ve#C#%0j%}#$6DsHmw5PbSRu~;{Gwt%{CgnQ} z`K{LxT|9C4)e^F+b6Q(#`jMA9_^`z-Rr$9 zR0H2n+kWofz-!jq<+h!RGY^SyOs_&cE~!)Q3A$b)4(+Yq(#05`-^vDr z$-jV!1_wJV-U_LFfp(lGUZ~0`(=`RR7Zz=di_8O=4ZWs8Kp|s^aZe#4na=$DB3GRq zl$Ko3B^01PD&=gcba%FG2N~YfZsAJz-p7YzHIILGs`4d<(1#-9)$_ZokjsB+3Uqs* zH^033Ae~hH^TT%^Kq9(K<;k$`d~eGg!IasTS-m3V#}|Jo|KkFn=blfbrYf{z8?y^E zR-_e|In@hq8dt=<_X9|3^*@B&^SAdl3Zw#s z+1<4*m`jk7!?1!K3Q)5s9FB8=u)QYc9nGYvE5R7i?4Hm1=6g`#;Cqh%B_vL1>B6pn z4(2`Mh}of`w}N!GN)4CY%#*#cmxZ^47Co;Z2j&rd#&65tJjIi?dL}9K{j#x}ZIhK? z{<{6p1?dTB;EHQY);L6A6Gl9oMlAjKBgpoRQ5$`S`|-kERm-OeSho6*M>fBpjT@ga zU-#DUtEwsI7E+<#&#ZWyi2Ie&nXl>PBebH3FZIfS=KG-cy*6bX{>HdI*XoR|SYI5y z(r+jyrXjXF_{7gOT|Pxx1f>KhLWrgB=(lXc^PQLVKPBMr7K1<5{{3@ThV*Mi@r*M* zKVn%tdasmne(D(#=>xGIt7gF{sw4Foe%pNvfRaOzBK0;=mOBZhpWQnkJup5?svcqq~k z1e3Y+@~{#p#o9kYD~aGygJO;t->ODQ@qIAG_?;9K5I#MC&XWD&wrd+59DJQe6a{ab zsVAl-XV)hVvwq{9aa;QcKX8n|Go&tJL9f_T904f2!a{atLR45|xR}*{S(?%F*;~o1 z)=G=q!*By)l6?KFaNU+8w5tHiwv>x9Ou%&7TfY>kn7ufet8Af;2Os#N^53bkggVys5;WPK=9Hw?6xWL#Uq#$w+Wk;k;zZclE5O9C0N`$P}_=1w33{h$2cjTeE-oR!V`67TB#(`L3D#{QJ+;fJar&!>G`RL`#kUE5_eB^L* z)3TqF8eR`kP0U>jz-XyGuTetnU`dNF+02nzqIv^I%ewv87Vn;Dr&w@kJ*ZStRvj7-i!M0+K-V{uDOBI}S_j7{@R^IrISUwmr3koEc=k z9DAc9wzf7B!OEIswP-5@`WSw9bblT1qbPpb(@I-WHYTSLU)suh*%=LL;-6Br0TZs! 
zd5#dZWb#~OeU20O-7Q_cAW9r|K_UOq?nWbFAk|WQqM#*efNuE{;Oy~(=7nGD70wc( zMdw^?&$p?%Q@XG)V%;%Osw{DgzlD;p@yPb-x*~{nZY%mbbavM$k8*OL@Jo(|Mkt}L zk42QiqND9g$sZp^ypLuf%C@@%xV~Veis`;s`>XDhe(N7+JrQPv zN4l{M1{iDV68ZW0(R-jI%#T70m&y#CLQeFon)w$#E+pE~z_3m9v`AU!D=j&M+evcdf zkkFmY$h6xS?!qH42?;5zB)=&D9ASyt@x9(hgGU>tzM-)+n3oeP zMKUxf=P00<<1dzmV!3QLi<8tYb$qyBiol&x0U{@c&uS~@pq|s#31J) z@in(_9;{rMWT}mBarCb(9?IjQ!c@OiCQ1)gBo(HGr{nf#sSc-wXQ?DYlk}y{SU=6Q zn$(IMTLHwYTy5-MR8;M|2vncHd+gqSnEfq%=vk`C9gi6m!Sc{z$`kG&j=VpHutErOOPojGF5aRjh~V>|?jMn8Iqs=FR5ypqeZBn!yFyWHO*c93C&IiF@p!fq z1SsWYP|Wae#9Y)&@FyH_6wUgwfXj-g=mvelFozIir4X1LU`P@Dc-Odun6ByN&VU z$&dUUE(C0r@X7gI-Y>maYy~HY#)b`IC@|P7%J-qsYrv3!G(jDXVULj7~K^b7yhw>8!&2*W?;i-JJp;+2&AAH2W z2MlGLoW@{rV)37!e2P|AVtme5E~!~%dU?=z4%1Kf8`g_G8(@@}3PGQ7(Ub~gdFozrS7A#zlk>Tvf^tvELGa!GMb%kG zMfJX4-v9)Zl9Gm@K|rJ%VF>9Q8l+)Jfng*Kq`RfNyJ3cqlJ0I%x}}@vpx@v6ujdU* zSnxtv=f1CN@6X;x8>S-hMsdTb9hi!nWh^GQ&iP2jMonW#p`Pf>1?J_I~?dUQz60|ZkLvoo?2rE57L{soB@aWPNc$O zKJYuT@2(K+M_7VJVF=Ri;Us*L4tlu!R#vSTz164>#o9p~{!UdbV}DkCgUcnFh&akV zZ7?T08&>>nggboo;omjz8|D2_mXjUhJeYiNa?E|>;_dq&A^(iExKdz^N$$I|lhZ6d z0l^BX;8u4eZ9D?D9p^4Z*;LHRN zZY%i*3#HKR>dn1@KPkbeIyF7Cy-Tx=9=K0f==Z$zD~1*5a+$wna!FNQzqE)MN^1eB zFRNB&BnT+W3dBx|Knm9cYkoF+N$C|lj?U@@Kx&nWTk_6>0=?h2FbWucO>tD!;{GVwe72hzn zzMlnfRF$SzFy>I~0RwMOC6rA$tTkPQSJm8I;4`^^2ZSHaPyvktWi?b{O@E_-8UzRS zmh8J+St6YKNQ2)}^!J8O(R(MkZQ&XZMrAW3a$}Hz8%{vZ=J<5}Megq$`lH0Xy^Or;WD)WLv-p-tlZCjWN*0#`cNY(>j4)v* zX`&M0u+Mk4nw^O(l_&eq8Je_0{1i-0(!tIKc^q?@G2=96?;A7e6yvw;_sNA`to6dg zX+$GWrRCdRH>B{XW6X6(c3NCFubFnRM8H)9l>_!`+56Z(3)w|z$9NoW!b@5%Z6)=nveN-#m zf9o8-Xf*8Bk9NNpK}HwlDtSXYw47{1&pI-)>Qq}6OlI6RX|ymu)1_wvg4$UhASgb9 zG#B+7zW9U}?TWEIRhTTlMQX|XHlz?STE%O1%$xD`v{f-B^0E#^;ysIjXS#io$E3^m z$%Xjg(kUp7hv?YW@0%;-4M5u3x*Bot=2{s_k?v+^cKhc$V**02T9lVccxApmFLLiI z!Xod#imuh&RJHt{Y zIVJ6ZJne8r85IyQv#L94${SjIi;~ErH|Q&%4w5h!=xuN*8H$=Cv$^WZ9pv5k4YBra;0@p;NwpqEn;U%NnL^laySb2*PdZvzDXT<^ zgS7cGRU!pyeuc)l=2i2uU-T-e2|rHbZbaLdHweDBp#UuPaimZvE+`ahONR*cD4VA( 
zxi}OMP0|&^V=H}6U%oG``Ds*M_P<$fN@O-(958@#h6?nE2|sA`t_Kf(MntK>)k-96 zL@j0wRT#yaB(_BW&^28+9+J@{^Fn)6@m2MZ{F5u~>!utSW$pOKvpxKPcVd=-Fl3|Z zy3B0RpI?Qo6QA}BI}LoepyVV z_NO&#YP6Xb}ng z9zY3njk5-;`sNbFDJJ24N@{{*YOi24^eDuX;&h z{QAh}I;v@B|JP9NQs#L!dp|GW%N^XSX{0diLaXS2&f9Gv5THU9Hv16X;~*b>!yL2u z8b3>3Mte7_$6_GIDgNS z&A=DHq$5(4fpDYg&XT5TZK>xkk(Hkj3$|yJ^dXj%qO56f#;}#BIWH8d}EY^ADNYGx5E_@n@~p2%BBR2VyCJkp!ebpAk- zY(Txb!cL0JtS@FBBsTe?*wqy(otYZ1{6JcXh$A9#`b|M_9O@NAxJ3@Go@h4p$^Q%!4BopE>m7tQ?o4Ujtp(tbN%Nib*OyN+PLr) z9wJT=f-O50aBD(D>VGmd^$8sthVFWl1tdbo$K1uf1h|2Rxq_14wi<}d8fk?=LW^ZO zCAO#fgUNN&-?jC79T*lXs2~BTMe-F-7G5vw30X4?cG8S_k zjkq%=DcXusY~@4mCe`dpM!~er!H39wO#16U>J7DF44v0>l8H;tsih7-B^eJ7Iyy|-WPrV=k?Wu87WU~6Bk`;$I#=62D_P~+pc`K4 zJ<0iOn$7kLu-4j!KF~DoQ?)J5)T@HzYhdbz7IPU}!Nv5|K^xo^PisA z$OM`F8E?b8yau3|{Dn4|$amznRt;LF%&i%k3@e3@TkSno)o!$}p_izHmFf}g161eE3H zo$3Vz*p)3^Giqo$ug>Q(#hk5Uw*AkqGzI>$Lb>6kC4%w8n9VlI8AHR0iCw5T)Bpen z?Bzvs)HwGHv-Eevd;8R|d;1iSnHPrth=&57E7QI!w&(gc!zJ1A{>>#N;!=a;8{Df1 zyQ~Bx$5!Tpdf|IbF_>B`Z(G=N=rcpHxi@xZQ~AST1wVgxWK2W?q3Dcn;1~eJqlT@GlJZ4{rHXuH{3S7Jy=M48 z@(O}G$=GShu}Eu_5Ljud%guy7w76Q~*!dm+3^k+e1k8a5P|l-NU?b72kwJwOOfTg# zBQ;?)EGjZ$pu;V^tuTI*QTFpE^D&ET_j&G-w#^7385VFj<*NT%o?m=TL+s~gllWf> zx*FYYsD!)h{Q(TiH86VeB<*voCUTJ(b;_e#x>y^o3y~_t|J7L(t~#=yH`RDDk0s!8 z%)3>mS3E=I_0h#j73JQNnCJiDLNiW|g7cS5``(MLaCjJxXbeWjKLM*$reXuMvp<_2 zpb>ly2>$Ec)B-m@{n^oF>Yv}IduR&92yKzjA)+*wPB0vFC>Eu?W0pY{YvQX@Rlm{LVaRmnvU>LP{ zwU0!@%YivbB}xn2KMhl&v#;6vX6EyJHRctckLUS5SE8$~ACOI8@{@crNfuwLMsE^@ z2MRBM6(c^RS>U0JV}uBPVa9+B>u!(ftNJ5!)M`Z7B)%qN=2X$h#H|2$KT)Ri zP3i}b)6Aqu{5?&Rfs*;3&POyQ<^mAfEcC5lGx4d{R#W{tU`^RBj}btkt5?^I7d%%r z(FzARZnP$0K6I>wib^)XtzgJz&8$BA3FF>=xg;`>3cjCzXg)T)tsG0gQMxkJh!Gq{ zohx_E__S)Hl(GNHJd+f!O+*9nYtiEXQ>st_KLt^2_<>GD4T8)DFL}AVeH%V3#R}Ti zQNxQz_gI)En34*az~>78i(!0sM3KNf>8>Tm=m+pGfC3}kOT~iTpM0~Nzws{bRUXm# z4>2eIfa2nozUlkUhK)DkCDFn3mi+F&IHCQs1M9;8E-K~h&feh9TW@k>4J-%w8(l!2 zP6aMlg+Ul(J8oQHt8D?=+WnVnLYLGoa$>KxS$otwp)w_#xD5<*1W$ue$?CMiR6jT^ 
zGd15GklYJx+#@9h?*W~=?KvNWg7xs_f-tXmzR$uv;67%WGwiM>SOAa*a&A1d-K>CZ z7m3+MDFoAEdfCebw^n~vy=2-Mf+on9C;ao#l4+p;pNJb7i(EoQgy*tq@AAp+PS7tN zi~Q$ifi93%o}zJq8mo+BHSFU7WcY`JgBq|}wg;A*Q?A6IEwgWV`nu=$?tygugqjW; zeF}}2DFeD|Z?1To+HQz#2!4%ok+3w&(cJ;)w_L*ytC8Dq+x!X2rX=KbjS{Q{T2rK% zcAB+xZGk{8ol-TE$loaSA{PGb7JtUpr6U4s);xgkCEBh{Cie3`B_|AG-%D;~>3r+^ z*!Z8jVEYMpbZj3~zF`MOGu#N<`|Q0p^WQmbb@0QMKYWC!xp^^kIoZe~5643(+6!2v z#5*KEAsm?_zd5Zd_?`so@{e~+2#Q*Z*gh4w0BqjYuCs5iDF0JlFt+E^A_LT}uoog= zZ#9QVCkJi%A{=i3JhoJ%FIUtjr3R{v?&A?#73@HpEU1-OIL$fylK3H0&sgrdoXXiN z9v#j+49xW_14YG4&zst(2OBNYka(60)f{R0RbU{cE}r-**kdjm4~8gyJ;+;=++TM` zZbr83`Ak05G|e{a2MQ)$GDxEhMRuov3Dk>bh5l+d#cAY`~a0 zwx$7)&eEv@Wg*^G;Q8AgQ@jkgxW&3^P~pA8C-_!Z-AbvH$ASqm&bF6K2>_iM&m>Ds zYP-oH12#cl+`RX)iG{-o({DEg4JV%&fY7IZ(fnf2r;yX-&~twNSlV{P&IfH29wM`C z!n7F`i62Tt)8hk(p5NEd8)Y$s?^^ogfb;7D)5^6QEHV)B=RwN-ZNum&+`k3<16W}o zOJ+OwjTl)1L5M5OKc*i3A7L{ogyb>}(`rfy;VA7l=TQoBKpK;d4G3dHEYwEIBC(V-1 z#HhG%XbMcAwciqrMgOXgECO;>&!v6dSWVdZf-1r=%Pt1(Uw$RQvk2ffWu*bJH_2y8 z2NuuKr@-N4)k@#MV--?(n+pnN1;Y|w5t~QYOwDuF{gHMC_YS5aCo9apSe2O(I1a47_Q8O2NR#wn?h7AOdc2O1Vypx z$rTjYM}aG(#&!IGU;m4T{`cotSk+fR5KXT6C42uj6uzGnQf*oRvOjaE`Ti@9P{s>A zqpW4N3$+R-73S-2^oe5sW0De8g6)nE4y<9rEec&x+hV4?Hb{K`-u!7d!Xg>KdOV-hQU{h-N+_=$ zbx|DMd-CMz8O4E+2~jOM+gd9X+|;Jd02s7`EAe|wB#&yAiBi!3{q#Y|#giPm++-Dr zKR{x$lmCWZ-z}wAu*DVSn;DKqP**ruXE-;9(+fh8D*SBgc8A~X!VZuhda>_ z>4XB6%;8Kge*|Fxu9`TCBsGER(BEWiI|RHI=Dwx_=gBk#8Z+b<5HOoEH_)0aIU%s% z2yFFAUDqc7LR(WnOP`w`KqG^Oo9hmx{uJDeOPpW|klQ-}NO%#^0#|0ccFS^-=38cG zhDphWC>;fV8ba@^M~+S=Xqk&Z(a86jHHA)-@YX--TAVB^1=24voDBgD#=M8zT`+XU z<2@h*1kbX0614MIcRnsq>!^e0FJCfdYVN7w6PRw#WMq$vC@F$57kq2q2!x>gG5I-> z6pj$`;Q0q0<@+*TT^#b{4a27yCjPjbiow5->0=fWL-OWnSE*OQIDz$Y?G5RmPwu{$ zL$$UlvT)It)kZ98at7$JdsW+Ry79qwgb=qm_QSv zj1f*3wj-8YX4El!IoAb-8E0NVCyXjJoD>{-$@S(GXhKrCwYjN1YFl9T`JKLQ;|s@U zHp-T|Z{Z2^x-s+rdQ0)jvd8!tS&MpbupPKqN0r!sc6Df?{NE-1wg$L8BQ-rh=*M zeJqKEX(rJvTq#W~DX9`UCi;?mn@qezAkAHXOo}=_VL~nJAW6-S^a%;z6ec6U=`;`Q 
z|I4@ZG1B?iQ7tSI_C!M&!F`>%K>gDChqEnL*>BWwrJJe~QgxG0->2&%tENbOJ4GCqF4-^ywxmqu`LtB}PUSQNuQIqIdA&AJLWjsUfBK9Lyu zU2ysbqGWx<{JXAHH1u2WbcFM(bQ=<2nkg2fFQk@>1+RAGXTR1YTde;LT4NJ=1Papy z0c8ZJc&@B*J$_6I(^V*Pl=pDgq_lWM{?x(%*230Al`FTc(F5R#!1&a`;&ffJtR}rR%;S{FU&l61kHY`mjWkw6`Q8OZB+-ZDUVLISEEV!3l?g0OJ%oMxjzPI zpm)avY_)mPCtirMLxYBo7hg^F1u@@)}w1pl95A^_aIIk^O=g>R#qup zwC&6Q;8;F8%WV20<#*o$*n<9C4LJA#!I=EEDj+Ofy~nOBj>Rj!Eo`j~BVbLw&1BuP?Ct-YWXZ^A2K0To%G_-9$9-dF6&ki7^qw&E04Ma^Q&22*pr<@U3mFPey{ejH%TvO4j=`ofKG zSafX)iDphVTF`)MvWS|Ia#zF6h)IY<>|^ExJNXQ7+u=-TB)c6*4^b1up?bYDLUS|{ z&9}kp*aVdjd+|bKfqWuB9|&Fi6hc-QPXn}qFHi>5J6i+U{?o3A&-GGF6$HtIYI1!W8rexIDfVdG0=w>dio}I@_EZdKMg&Glo-X}*lZ_Dd&3+(HS338n zX7#vY!H1yFdLRt}q?U{p!;a1)Q24zU07SuQ*d4{jaHyOp?hA(&%!f@<0^N0oO$?gQ zv*qIlPv+_zmTL{MpFO)~#-WYB`?(k9KC=EE`foaedVCeTPxM_;r%Pi_Z_S~NYj%!+ zOJ3#Tt?y-aF89m1&Nti@y1241O20GfX~q+PGhB`L*vh71w$X`fjm87^myTM@Bw(-g zkK|Xx^s#6d2W2))93^ImQ^GQnR5=4@-Og_DgS65zWNR~zuEvla)Ih0Ep~ZIfvFdK7 zluaWr$WFklyzcE)D1?_Q895~IWB=p;4!%w^UmJHNIJIT275$tG7r24|ag zO;E^@Y(nrf>yu?&x-4UUC?}oFI4zm(L3&&8=F^bzU%I2aOEuiHyzhy66@7VcS7!A3 zb#WhTnK6Q5Oy}xqAOGqMrO6DFlyS)NMs`*rTC=Bt!NlkAcz@CBc_nKmw~07CcWII` zn9yO-8+^iW#0^i+-p2;D-$mZ=(TElQ?Sc733ratwAa_zqhF9%l0*r9;c@-S6Ivo}6 zHK9@-uOqZ7e>BapK2ub zi62#}CD?uwMbzd7IcX$tnYMn^nEIS5<<1ZqVVFu*?$mUBlr?Uwz|1MNGP2;kn*$9R zTcgBP3S%A!ME3!cb4!dDK8_9_@hMFAE)kZlkG4&2ICwBYIuu=#$B02X`hs3V4)|Xb z=vp0o*vhT@=*76)_f#(ZUXr?41m(&yf%4$(aA11}B8ypd2_hvi` z+sCa}LB*3VFhZ$IX^B;&NcU@0w2)Sf)UU4HK^6zV3BmDDqF?*?zeMWW^z*9 zFCjUc0>}TzdVXxR?92~I@io%--f?KOy7$!{?N+URK(H27(QMX9dNLxeH1|IziO^1q zDOMAdN5XcZ%hmxz9n1!F6Sb=83;{1Y?i#9s8^L~uzO1s5kis{Db(~^WfU9c1Q{A-e ziuTM7Ddn@DqAT?kQ$an$S|EX^7F&+652bE0SbW8ZC*x*fDk~Yyr*H;5^|GkJ-fzqr0yNG~!;Z*l2z!p4pJH4MV z&_MSB*k7a21b{dIqmb1E9L|#MV3V_@xW-C3s7RuN&%?4tfEkP~UXyf%PI2mM8NrgR zmZQNX2h7_Q3K{B3kzY2;`->t6B(!Kh>x+&PbDDZCIpa_^{U{7w3Va+xHv$3^ZH!kk zK_R__%iWJch1h+8(l<^1jaS|$^SOyps)f>$%qk~?K$g>_^)RbVvLI?6fc|+H6Tqx( zTLm%9#Me8%4_|Ov$M4LqquWiAty`Z zO!bJ6CYf?(Qp)+4UsO}fF_to#E63AICF01phPU4~=+pRfz2vX76qy9%-m1~h)MkcG 
z^mKq)Jishgr<9d=K$ruPbgG=TU4SSCAKVn_kyP7l6*Hri^lGHQfMNn_lT0-}RUV}8 zVJ@nsh$4{aOJw;t_!r@~!^NfB5?JrYFj7{Y>*z2v!$ar)NI8C@^0q#h#DSXjUS-2$ zs>HJSm(SC&ai}l=&cE_!p8>zR6X;Mg51o!CNQYaDhX%0gvh)O|EQm`L9AmEH+?#(^ z=b@&Lxe3qlUqe5Q-b(TfXOF$sk$0i21%h9Xg{}Gi>@O!-0Rr4Fab2;y6AC;=R~O?a zfC80ycQ6oi*?y%31`K3hyF{rG=Cs#;a)6L7r?)Im)mPR|no*31V{oue&`#3NL5S|> zbHHFN*l0lSkv^VJLYIf>6~G^i%T;>b9yWo|)dX?!8h_7tHF38j0a2fay}>qkCB6X@ zw!v$>Oz>@6;e1w|eMm1XoW+#8^=1Ezbbu#Kgocf007lgmGvt?&)l)y{s8gxH8)KD=fF=4 zhd!Xlop7jo5%U2b50s%ms@lFqyeVaZQ6l*j8|2y@7?=G`7ga)V+VgBLx|riKS&%wG z6)jXvRJewxr~H0lRFYrD(8QRp%9Z>U9xz>b_Y=!@5rzjGs_n64%5B9@27ZT=x6o+S z#+-9?Idn&}!oJdOpgyLJBFCD{X%etAV~b%=?K#+eo1J${TWbYU9VNYDvkC(XVrSt1 zl{Zo5Ac-U3ZOflK!LMuBb=~e{vE&kcn7W+ikv&8S%ZtK;#%NJjStzZ87G#$LMc zfpl#+Eo$?lnKJY45xI`q_nh=-$2xYr`7bc03MmYB`hQ9vS1B@{^o{Liu{4t|Q%an* zEJCBH<~}07}f!Zs8V1KizLPm=C_>bn)1gt2As&N1GO zZub`c9v>tPT#-P*@CX>4adZ?OXSM-+N2Um&P?pW8!Et~F%#xMEs`YSGx6Hm}`z(@c z7dcc3Ya$5Lak`KTPS)kuF2Q9<$b&5+Yum%W-2o5ZmIcv{xfaeaCy_E+s87cQYHg#+DZ$huzXbz>kp;<7GU_$G#yW($`pnI zWp`iyWF@<~VUm!sfj@1J0@-RANLkv%^)9?3%=Wmum4pNmY61PCS|3#>5VEsF}0gOk-$9_xvR4_~@i8KMl# zAJPIOhP%SSi@y(;!hL#2p@uxD70}`SkM051U5^q}04DVnzPg|Beemk<4tNTAQ}Y7# zCH-dy7M{K^-0Y7k_%Fr+;iefa@h`kkg^SWB@XcURKAvP@cnyUMuzM zIBC1O$1&*ClxFC*sStDl(8kY2AFTjNk-~9v))kuK$#3L}cQe6f-C7ozx#i4(V8Y9g z>8vRSk@kp+Ii5%ellH95wrAq@UOCwR5a?91PUNFcO+pZguXJB}OGJz1n0<17$ap?z zPtL$l^EG3`H}t;7bg^A4{U%k={Lw({OP8P>e#Bx!T=xF3WqxRP-S3rdl8|Wv$y@}crtC05Cld~nc-b&6mwp!p z-3%ArmF_5g!Sztm6QQ2ZgJX4j4IuzFjky}J@GsH_j-S+Uhc0k3Yma#5urbs+-)UBF z5!H&jTP-txyP4@ym70WDdhdH36J1pLquwyRh8xu zHJ<+lMI4|%b%c>%0Yy4(Qw~dx$14;CRt&3>V6!Dem>Roo5xJNbagE+NI#$-vU6vm5 z%dO2nStLh=LiRokQ9PXtW6AI-&Z=+&;#Z6-ad`yUH{7BW0Z&$W`gabZ8zK@!9F z8`~qZXu$nmERseUN2vRUB#_>?6lo2GjGfb)m}D+;RP+nK1)jxMFCKc;g(Dzx?RpNn*zSrQu`H1Kafpb-ZrK^4~uCQb&$mQKaZ*!rq&)~pZj_Iv7rdreECT`KWR6>t*!3C770$vvh6Yq)7c;bE zE6_F3bCG=!q}q!0WeqbP&bLeS$4{_Lw$vQZG$;@>ESRk#Sc-N^=^c*>2S3Z|NHu(v zVS&qyjr(>lm`zL`mNZDHaONz<15R z{gDB_;a%b_M6r!^!F!ol=JNu37q==oS^H+Y#5z6(WQXDoKV9g|Q-`SR 
znmrRviBkRi#8X6MIhiYuQp=T$_RMR($pFK}=IkblnGh>sqee_2ga)?5`_Q!?xc6G< z9ub!bOLSg36RhuXm5AyGo4LI39LI#~NR-wCy%#}{lw6FbRnJex2Njv`oLXDzavMl9 z=Iwq87SDJNqN^v`@k9AU5}AyprAXHKbW>CXQWvVumF{w1bAG;w!O3V0NFHwT8?CX_ArGc}TtZ6J_S10n_*o}VXyRBR zplg)|1?pB$5BISGL%AX2@}@GRwil_MJ@#J)JpXhxuacETQavO8^beg!xv zA&JkuTdhl$i2oIel;1I62KN1DU@g}SXVn7*H7=TN4wwJbOs2#p|1 z14cSi{MFfiOO{l=dzGP#1=ss1>UwFYkmMmgvI{H*wyx;;lgr`g=yQ;2DS{M=VcXEM zrfV;tNydEpLjk1pDTgSi4+fAMv3+LhJK*!JlaJ{rKi5)-343CpFI0GbkYqu18oMn1 z0ZaJ}(KpGmNG#|op}Qp>bB?o^xf-*;{d)oy@!KFDpKrR$?lZWDC#vQ6bq|AZ5uv_Ql()uAN^mS>bIV0gwOwKwovFX&*9G&UB}OH!!j$9N zDJPH9Cw{w5mC$_64P{oh*GQjixbWC$#+jXiPN{5B#sqWOS^2X*9S8$Rtat`=_AG%V zIKHIL8KFQ4FG=>2hV?n~RL|A#R>iGvVqCs1XnsYyWnA<6YxeZrj9-0qgQ-8I!4I`Q3^G3D;C?axSm ztW!IM0~znhEGt>hPa%$H1*}T&@1t__q~m(0yV9HEyXNZt+U!9ESTpBG4x`lz0TDkK z=7WL8Q|!7@D}9G`fqK!Qaf+m0ZZCCfl}hUuKQMM>+FhBMI0)nnzOPNZn%n=_L&BE& zS_HmI3>QpA;y}_<*0&+Z$i=hiuEVVk`r1pPp6KJN>)m*yKRNrD+a#A>q7ZTIRF6a~ z#A{q&C3pAEv|(&vMXM~fO1UnTma}jOT3U1tLj))9F%TC>5}T&pT#&Z^O7$+!()SL$ zk>aGCU}x)sF534TzZ5wAMr*5C@bql!QM0r^8oRUDmSs_~qtYncd5hbvzdE1`ttP%m zFwM?=FlWIilA~n?S55u+mPF`fyNjx@I0V@fx;Z=&KxYQ>YLoq=g})x0 zF~m1KTCf=6Si>SO=8PbVg*m?cBx}&M6%&`;)wQ zGg3W+O|}!%H|+I%MKv2x4{2i+Ec_*b4?!i4ncii>E@x8og%eMXdB{XfX31|v6XV8& zOdgUJH49xdq-ef^nJXJCbE$Qe8X#AzEhO23)$5+vx2O#A?|u--H4zy3snY7Vd$vn7gO^WWVfpfIpC?;ed7ZA1#gq2 zpVVdZdW;^=M=HC-o!V{b##u66=lgE#Q(-Bj>d!68qe!tX*4=&TH;Q3&P`>yvVxI%b_~Z=$)){pn*5#Os3SB|=H6dhlz(8648tO~!TVPY)i|0YS)F(60UME6550lmYY~yCR`ntQJiCAqO8_$Wa#Bll z_QGmyG8dw)U9c2~G{_XBavzXok{Vg$+L%X(c`TlgFAfwEN?sZ4<*h z#dWiG45QYG+d^dMAl3%?n`~^XgrorsQ5U%0X57fDl`WU`>Q18)Vz0MV7XmfeYQ9dj zeWMRD&@e)=8Ik+go-4E@cav)EmAc#|^kt&(*xg!RXA@l{-r?9#3kgvMQhAF##=>8d zM0#q=(&>j@6I)j+H1+6*O;YbdT{0DRyV#Wd`KFeebsK1e;$^$Yu|pRbf)DE@_2B0R zkI?H4fR|Fu$!vVwrOjVYsmRPhr~XO#5O~2L=~LiCRGuLUZb0Ry(lyuph=N)PW>x!M zvCg@LRf`|VnxnLRo=>t>=7d)kT}MxXF7Nbjdv>U@r%Iv~f}MY3>(Ck|Gb)STC0HYa z(Vk15RC=VO7*uiZ@WUjO1uCRH8CrHuJF(UklTMm8N;U0;clHqGb-OrXB9cEsaPHpJ zP3GYo3Dt=ikWMLMqY%I>hwbL>B8G_9di2zrYm<}NY>&nN@>E$fhlsl%}vlFGB@8 
z7rPVi*kC+7Bnwv9ZaMK=8*MpjB}S7Jafw%7*d5;(_TE&=gZL{QZh^DCab8f6ei?gJ z8{NLuy0uMP7I&05Uj5Q(zxl>#k$xyaX_k-;EQXW9E*I-=slm6Psi_rY_~&M*@_75? zaBZcGoP*a<(fu)rr$2BaVgCilTYZWI3 z4%YX3--{F%Z*B&f##^@s1_(*)txeyE~%z_*k1K z)IA6Tcl>vp_=ATi$xpwY-o>O${ocTI9p_pzF;|wUd(KGvNt_%tS@<~ImCM6m42*_V zZg`;#Pql{4Jgt>{;5wHwk(cmn<7hvSK`6RR3T|1cr&T@Vv6sWK5_s4Dy5vXCpu}>m zmQRKv2cq`L}1gbuPrebR|a<%y^o!SOtEibgN_ zx2vH}wmD9Rz*($TVA3ES;d&J&W_E!irgwY$sz^brXMEyy3c2eS_t2qI8hoQ23md+7 zMTint`!vyly45`&#F)v=Wc#xs!Mu^c&w<^Av9w2{(WTI0nkkvo1nY^W@6ba83)7QF zz%Aqq9D$sSI5iSq+%tOLr>Fy;Ds}RU#XUkf0bYxI=$*xmg6qis?rf$^)ZcSGc2BJ& z$OA4@fw@yHTvr=H_CAM}E~RYPEm$Rtxy*>ea$H0-Mn213aS^dQnmKlq3sXp%mRBlK z7bx6i(ml5x2!Vec^VlDQ+eg4fuccj<`Ku@*c6j+mgSlo)_;lAdtNK-cJFx{1HCCqz z^X7YXXE5|(1|{S6{zAACx3JA(3n4k7vVV5t5j`~iF|MP&>Fcu>*DrQoovtDKR!aRv zxov$!*F|?@SYDg-HwN_MZNV4u&y2_`zL%qU4tx_llu8S&qH|*|FV{Y_S9+Ho{Nx@;0 z8O}jvin^eEuWI3978gqitVBwWemvvY_RjlrEgRJ-ensA*>`!eleZFN93k&1(@nB3= zU(lydL%^ibY4d4DmBa9GJov#QPJ(l{zWvu`=l$hWX$AM z@8gO_X6)3klBldWZZVMDZlKiO`F+^zOVE&Z0qx}HE$;r>Z) z8Vwmejrg_ml^7AA#3X$*T(Quwy)XWg={dgAh;5PT45H9zNRQE%@LeDn>CjRjMg0j6 zOQxvj29;{*`LxK^Ch;0LlQq|T13cNmIrXhH6UrA;NQi}~3EhAzSL0xfZD6}d!6TnS z)^TiVyHPi@*piNZoccWMLz*;ICXbUXUh*AH1r$g7}rntZYss+Ow|l~wDyf0o;S>lwPt(?E4nb<^Q12kirQ!XTWoy1OeyzQYxzXxxsPa$gves>K z*R7}zE8Y;x`#)6MJ@G| zri7+7u!{m;{`Fg_i9b{ex_JNg&4`z88Y|LC zA&pNPN@#(UZeIBrFIyk{;(}q;nY78nU-| zCzr@04ONXO5m%Y{ekVlU4NCS6@X9Qe)o*!Yt8HC1<%gE@r5JSDnAUEy?klA~H}>dtBh4c)?JfKR;=QByZRN zWqXy0X|dbE-KTHZbSp$Trq3{ew zmZIktite`IT!mFMmeNDnSX7c#x|IxpRxx6g=`{Vc*K_^}3^7_t)|d6x3rn7^NvobE zM2?ziQ}R1|TGj6sX#TkQ3oQvs;NckvBX2Z064?(6oRxrYL|Si*54KHr&gCx8#HaPR z9N%315HjjrUw_~Sv#Khk^fr%m z1h#eSq)kKb^@0Caq`V^IgDQXT=O5}FTs^NXlnXqZ2rVhXe$7CHj;tnT&sLrH0%jhI zGCpK5czck6oRz!ch0%_RE|_l|`MPBa!L47sy<-+kpe*{X!R@Edm~P*^ZZOi*sZB+y zPj>m8J`-POM6mdgzPzPb;35a@4{vtmM1}sZbD~}q5-nLfJ79B0CcSCO6!NhH#n|6% zOUrVMA}xj$oQ@nDXQn?5jgYHp``rjeO>Bj0l9IMM28W{Lr3+N~LSVDP_T;5?vz*@=>F5Y*}I^DpGEeq+6kt6IS;nmT4LFu@3m1}?r_f@~aMqK($*;Q-?rNxGeync02 
z<7oZavkqESvVEMb;$sa#YTZV7#FM+`-=q+fe6g$*BKQ?3Xy{(XTaZR>g7F+Zd<(8p zE@`AOLg-2h+d#2LvUa^t`mhC$@S`8VX5;{GZe{rFH@*TDdCgs1TOmz3F-Qykq?4oI z9z=dWEdIO8A8f#a=_CXX*sE?b%n{^P)&i| z6_(Lz)u7FzW0l#xI2#wroDtulX>6AoJ6MqL&ATT8hXL)NS=5_(VP2Cs1%wHz~?kWGEPl#*h45yIb-&xUl}C0ckmy7OVTp(xvJOqX7%;e4y&aJN_S4F``d zfl_;cbF((9x}`T`(Z~7AvWtqgFD|J&A!A`U^V^oGu8reDP43{ej9|WMy(PYtstX$| zB<1EpQ}H7ctnCzPhn8`Xm^ta(!E+`1#-o&oQa1OI>+nk{y7MAr9G3gBX*&PgxTa!3cIm=mUWp05?-Et!p6>8C;=sSfqu9;szFQnU#xQmN27OW^dPIsag>ssj3 z1`X_VebMZA9Bo`*UO>IxJ+w#q-zp^n0}8qN$Ol6{ZJi_H)xKD-5aQTtM~U$7mSRL* zxdSc1dlDDFwx3FrD%qux&%a@b;~;Y)(GLI~Ot9Ribi-^qQ+4b&P1F49PamP5%L89& z{1HuY@{3-%2^eM=KR0=t$q#z;J!<8~a_`uxKU#x0jltIWy8&Vo7YW01H)faT&)ha^Aifw$MmgAtc^)f5 zus~iia6b&mn?01{?LY6>HTR}Sy@k+u-*GhNd&9o69?i|3+teS}vZ15$-I0`*BAdrK z%`y?aG^^v_ReBj93efBT*K>MH-K>!a?+WKPPRrQiZl_wOhElV+>gB-EdhgKh^8BtX zJ}#&iIYxF7Lu+M`k!AgLtFJyn#R?!i2;}N46rGXgleh`6B^>pC!2kahPjL{+Nl5d4 zaC5Qk@pFNvW|x0u&ku6%rt<|dgXw2QjuRGc>I|q!V;uW3oSACKM4+?eNaV>e0v5Fy zW;gp0>*gCO>!piww8vGQ>(e2z-C-%}?K(<}S$2c|S6O2^oYu$*6Gy?JLgbbXg&=XG z@mN{q?BPc1iSx(tY%=GRtgcT~@a|@* zzs1uSq7*XjBJutA3nWLwl6@$b@4IPoB)5Xcf{WLZ+As!DZGs1dCLFlts}tmNEd_4> zA6-`+5OvnIC6tnqZbT53Qb1}*=|)gGRZ2=4h6WXsZlybhXVauRp z=hJz%xlR0%s|C(yWh-JH(IdR0+V@)R+&>Q8mw(pn$rUB85t%wE%K=-a+Qcj3l0)f0 zS#wc@9hp=bLNXN=uV)iabC(=6>l3}KQu)y&UoYLa+qAy(lEAL~rRW;G@(0BsRHH_hu`l5|gdy*8iUgVO=T z44=Dhzy?kBeywZDFfygFFdSy?Afq?<&$?O?lk3HgPo9(V}<7yS$1(Ee%&E zUa>iOz}X(T+U@JiN5iRSeQ!KxW_RbgDUg{U&l|X|HP7zpFstc)=YN?~J)#pLIQo!a zha-E^mmIavMF)jNTKep$9tjP{&wOmt*R^DZ@qImyp5W1wz|{D`(tA}Zu~$+9gCr9V z9_f-AQ&-&{|JwaeVAM!Q%llwVwE-fB%SOb@a zu_r_mr5~d3YMRR44 z+DIum?g?bEuKBd$!O9QrvJiKHu97O|v_v6(bBS&YNc>{Lk%(R&JNy+2w$E$Z#r03m z9|z&&c4vz;_Zo&hn}nSd8?U>MR68af*SaL;o-GZ9AZN-N!_J=&r=?Wm&(^2v0Hb}!Xkp&Adq?Q!yhjkjc8B?JEQ zsC`(Pu}-g6oVERKf@8O_Ut+CeU;THYxQN7C3#WD4R;Imc8qM2&d}`T9u}Z`@v`%K( zyo6WM_UbX0G@QFw5tY-RL7;`q$@S~lMQcvZ1BS+sQyBF5oblG`JLe301!_l&?n^S0 z)v(apef1nQ4qsBErJKt$U-c)dYuo9ZQTD}uv-8sY;gW`<(_LoNC@JUPw%jLY2W#(4 zo^<1LaBx8BqoS@hooBVw;!`hj%i=-IbT;46hwXO?L1WbWv>OTmJw;Do 
zROKYRtI)oe4Kp7keAgB;>Phl~d=p{}mI@M)PMF2Ey7ySF2<#Ru;^eC~gO*Q(naabe z#tSY$f%!9g%_N#p8i?c&({fvg%h(8S*%#hJM`@PuE)mX}Rj!@rylP||2a-Bw-Td&v ztO+h_YJd(*i2=-%Aa6fWR*CKOsW29vaDxk?(z$O0Ud}`A6djVAv&W&a(6^e)LRYAn zUlVvDFqjmB#P9NEvOXL+lIHV#6Q5+%QlQfV2d@d3s(fINqX)vbyC5wNFH&ES+P!LI zP}E{BP)_pfE zz0@ncwKwU)qLy0PrK?qjw+CvdACHK(c-P%(wAaYk2R7-Q4PI6G z!%!TxX)_0{>$c+ua9*g{3u;kQ^b5|_jW`$G=>MG9zJ{m%o-eNP)~#$dyrul$Wa?Mf zpJ3OBLo0OMcvEy%R8VrrXzb?R==N$&x48Xh@P%pdK_%*J6oCO=viK*Jya89>fu199 z%0mtN#ir)wgw_i_Dq@L95P=;AxA@q~q^~W?snE-k(JQixhczB;Y!Jh} z1UKdU)nu<%p?4lLFYq*Mg9@B|zhx-{zEV=*rID9@xH*4jpCIT{4?o?L_qhf25;U$q z0~u2nYTg4%2Zwp!2V;ONh34hlNOkE1zye zguJUV=89O!e@=ftoQX`gU=n}wWTT`#u38foe<&|CUGw>~uR#ApK0Qj9YkoX!G?|2d zM%f$;DbJ#-npv+>8e}4ay=o}3hwPoLC$28dWq!@5{)3Y|aABTA zPe8R!r?RCqT6O#Cp~ANu0+O~8w$76Kl9{&eR3;b6-U-#+vJIG@NttHm(ipx6saI5G z`#81>x#h7AW3phSG2}s8;nX)FIxL}}FFhiA+s|rU)%Sr_ILTjnl-JH+lEl|=7;^)5 zw$DixT{2v(exlv(R&S+_C8#^FV1MxBp>B88>L`hB?%wOQR*m_OQ&`064!6Ex9CDBQ z7qAiP(2utybFBwfE9_JUqacCCgfD3RI%WClGK$o8|M++f3H6%Vd3;R(`R`md2@dP=n_3n=_mk2gxpM~`EVkJ~yusE3zDS>3*+oe46*op{rCt!-rDTYJ+)Z1kg} zOsK0$Mw>a*S}P{4!aC|#39-dFdA46Rq#19%5zG8BTx_4oFZe{Y`^}K_)9N?*ua~2q zF67szZ;ChNh&1z8D%^>i%nh8O;{FxuAX-UW#v3S1q>& zz2y}9wCqVDEV`hkQa!Nr`Vp|<6*+=r4Lv9V!87e@?r-=8I5bJu$f4YXlzI455U680{!`4tJ|!je zSh5|+qCsu-P49!^M^%`_soi~sz}ccVEaWJwiycxbAkU{0apy4;4NLv{u`DNQg~bt# z7jJ%I#cD)gtUHlgM!C&^QJ=A(z2XPvz!gTVD}I<1vGwlNoa_5FdF6UGB080Y1GiJK zY?f?mZ}+LG3szXDucMNk9S?Y2^6i|6@7=NPtFp^voY-uU_hr2XUFKDM$90?7e~gH$ zL19?NhojF13ehd>mtQL~@qevrRWobxo@?bOB&Y8njYAQl8lJ{PGp9NOO+Z>j^E!ssrgIgN$EeIN)YF18nR5NLR!KY{ z5wZ*V-eW9y-=dcT*D*}ig4xnIQ!i976T#GgX|)+m+#za<4pbjdtiRiAG(q@mfK#Vk;d@=AKA$$vZOzN$NFS<9D}KWwn$Ms%@FjR4!=(=MJ^&xAB-1Q z%zfDNA*-RwUQNA>gO8Iw$3Uanw@&s6@#o(4MU8csNe&NWT9#!I20$^%=q3O|wj9{x zymg0dRWH(Ifa>5mL@9StzK;=JPL#o+B;F?P5u1;1R z{Q?o6@$Pv|r+#lOeAraa=C8&{dqcFiH?{hb(-*#lMsM`xgBMCm*FfP6I-k1zZ?Nys zW|6QtFNyB|I$YW~m zEnsaUv#pQ_?OccTb3>)lG3RGSjj$i;CW2vjF`|wpoVxAL!VSKSZpf-kxTDU}E7gx< zQJ8j5-_<{MKRU7ET^qiEJpHBE5H}QbWm>HxgF==ETBC>Gvni9SR+ox@pw6+``Rcbo 
zvu<9wmFOOgOS+@0|0H8Exw>pA-=av+j$~+E%?bn6WqZ&fJigjN7XD!#W!6dV>y5Ov zmdNzAmd7c@Hp_>B^>5u5qM01C;vQB!7juX#9WCpAQ}8+;e#A5^RvvEiHX&zLXtspr z*}B)4VW=*eY{`QhUQXb0`GGGb0>E!ap!9M4{g@cbukJM4 zD}Zl#iDr-0G4c9uWRbKT1y%t7*#Q?URT_>D)sF>JCL8b5+E4lP`gxQX>;SY)67~KyB(PUO|jP(7G{33$! zZmLNDo?1xFGxmy;>-J4Vzh%>2xPqcX=`cqS-B#2r_Wk__LEnO*$hTv3Kf>ou%}G(F5Kch(x^^CN6d~chv@2+r8K`1EJd~ZKGeAxTs^32t>AFl zKbg#}>ljojSP-7q8Q-yGJ+A%QE+RO@1CE!WIP?St7~YuoGq2 zk(2Y({G0El_fZIrBx#5a=ic{m9UZtbhMRq&v_n1u8FQ;f_P}ww*yLxzZ(0>PYaXjl zwU{DbYeY`Z^O+uXwoT|96pNJ}%2k4t{(;whIC3ES*h58+1R>KfEQ&-PdAh7($OX4T zN3~jrDC%tAjifCyn(Fwhk+KPS%;EUD+MctH-6zQk%8DI6Maw1U1WFsJCBOSEcf``Y zim-)Fh;2;TOhxV8cN>RXq&ntgMA!zt>c;Bwe`ddwCo)peaCIq*1!O9o@5uG{}?-!`~kV91G4d*b?j zWiv!;aJyTD19`oWlnOyDvh(ecqB?{uCirG*UWtcS+X6v+JUmI0dar~7@Du=bCR8^y zez=bc&waVi3!J}_G9m?Bd;3JBcAViLY^2Qz9{Je_I-kDjRHV!Xe!)|C6?Nx&{#w4{ z9KNsg3CD-FWM2u(CydIq>ai8~@ow2wJ=e;Do9S;><&eJ-%yJx?lh1P7TCsN&+)tj9 z(={3vU_0qZ&*4{aa>>3*BhY0x!>If;n@w!-Wv~KY+)$cP+S~Lvbo9mAB({ck6JuX< zde-wmho=Q_xR^J?WKmzNy!gauy_a#KJ#eZ>G== zQAA}#4`Zi4cQW~mr6VT#xQBlS;=YjYlgyZdXLt@75Q_+4OD^X_L;3CMY9kwU;BkXi zy8?NT_T8!h==EHH!s205zJhYDY{U_o4;w%a`fwet)}SeI%4JQjtPh!EtU6@vMGxFt zP4+S5;kEPal&!9R;v{`h-m`9{V_+F>g!fq_(&Q1?M|XjWvm4o}-0Y4Or=Qx0R2UZPh!btdX@KT5u=yl5$f&v^WLf#;?ZOgJ zyp%43&lMnvcedSPcV17OPbPlq91kneVHMH>Sc05GPf7m_DyXa8H#G6lc~G|mLGPj9 z=FtHTM5T2kf)df$@sZ8}Kpj9i*DOyuvF{vtLJR>ZBL%m zx=aA?*rIYnsKh)vAq2K5SD9+n=7);GDYgNB8|ZcEW#__4p^^V_p+lQcwZMQtc5#5m z>GQGc54EWLrlos^4SFrN(TvKXR;oFp*we#Tb!Qf9YV60-#WNSM3zn?ypH_kbTnxqS zeXVZERT16~HN!0xll>14;rw02=#0sa+*nV#bu7}|VyhS&oJ(>+?Wt99hOdsgS54zC z=#*G6vc*P_gSX0p!l)(aJLEZ?>u7Z~-8p0Es+(o`vGM8xKARerD|@x*6y)n-_X}IV zWYL5PDUN#-J<}yW3`$fL$^E=*O1jnSkl9xL!{b{86L-sk|Caf>Lgp;TNuFSkP&nuf z;w<)Q7aww#2nZh5x)>G=J)4eRG1ntlBCyZp3~?AvHhgeLQ>aaCas#S=)td?ccq(EG z;Q$B7dxj18T6v8$YxL1&LKCHIB!)pnwMR+rG}^Hv-Ruo!uQIEV9BRI&H>|6bh}P%8 zA+hIGbULV_)t~$7k%-i8VBZF;uxpA9kyc?8^5i&xuw+2M7)6?1+W)AfrE_<15g+t{ zoq9Au)Bh5Z9y+2Qs_U^LOC}&Vw2QKxLN(OVU`sI+(F8Y>I^oD^vu>tb!ipAK{n8*Q 
z7`AD}f<0>n7GS%eCMjU$r9--PxG1L0S#+!JO-v@!I085pO!_ z^Jkl^y)2O8H%^ScDsByI1$wo>01%tDCjB@Kbo`aq1D*#lB3Y=lzrkQCM8Hmz9mOPbic?aKAt@KCb=cy=YMO|c}Zv87vPZB5$Vpx;EOSiv)ZFw*&h;v zkmV)IXSfq?58ExBCy??*)mEYp{J9nfDPm;1^5y6JY&so^Myo8MU5j{x?)u&{Jj86{ zg2OQ&d)BM{zopOT1jYj#@>S9*Pm^gUe~i)_zYMF5!?rGBxrE$M_lz(Lcit~cigtwdsl+NFqxx z&T|@vgRfX0r-Ld`AdFWac9NnKBi*YnuJ!lXa59gV&$slZxfgTtte|E)xH^ydOB8^X zK|A2K$Q@2GX-x_!I2s1@Z%?2gpFUs_?=7}Owiuu|clf0#w!LvGX)$q<}p_wrE=wDeJ) ztKHOO)ykuTR9wjcM&vyJKiBYCOLxN-Mc30gid}nD;(5k4`U;c_-F@AmL;ZdgBFC`oUdhl{%FrgnKJka2(yEs(301c0NMo>RfYTJJpB9r2UF=wNmo zp5+{k3Yk}1h8B?|{^0@0bK)!I+r1uXV;}tM&YML=e3`vDW&}OL#_=)DSHAa$g`Mu* z3F~aJ7_)w1PuVFszp?2fvZ6!ljbc8#e7{6 z*1Dv8K)nkxQ7Tv@eW{!fZ>D;-FFRFYY)44LJt4mpI-S_cWW`#nQZvtcSTVlg%lDS` z`1=m1I#8OujNul2$7v02x{!I&o)G<>*@F7LD`3ROT?22J*J_usalRH7bTy z7XIc=oD_j-&d?)EaG^@$;6W9~lV}P(?yLSIcjoFDm-fDf`#?~?PyV1D5fuuJp1fP{ z=>DyiX^v{t0YJUvWo}KI??x=@UrRUhQ@L8>Lkt6)Exh;<{U;UMCT#+B&w-`n#1=-0 zb}sB{2ut6h>q+bbpX%f*M1idXdyxFa&1Ipz^izey#XxHOkc^w$?CWg0O6vK$>n~^9(2%!YCBxi~cH0taj|{dqow&WUv(X$DM#ZzzhcPk8VbGn1%A1Ht#_|456+ z;PvCNc=_YV_SIEk()LAt$=X{7W6V)W3G%lTu(UujKKzmpPaGrpLB*sD+3RynV*_7xUR@U?qGHJiSveQ$f(<^EiK+?N2#gIPafp*HPI9#98b0`q;9 zXIk%I(q!b8HR%h^DalFfhH-9;W+7`I1Pfwc_X#E1*y@WkCM zpja3C($M8~=G;IqlAhaN#*%w3B=mk5QYe)_a?j&oo%TpGl)yLZJ}hm@i5Ukk*j<_N z|0%C%zep1Iq{}HNbP|W6y&VdRiXq?mN^-J(*z2E(1N;g(D>;HNhR}}FYfgZ?%kJ>` zb5eY7BsZ8yS}4Hjqxj`ag)e!1176ra*Em9^1amp^&g16}?(TJa6^4FcG+#$aXtRtb z-*P-Q6I4Q z(x^(XT6k2InrMz$o1{8vAzE!$$+)%?>xfT}<0Kec0gCZ*Y{yU6s+=QCl@s)w=L4#m zB})|;U(L(u-b7S~;j$^v{7G-9c#)kX(?rWg>^fJ=a#_9Km1KwG-Awbz&CEx+N~(5h zGT0red9IIWU6l_$-at9af3Gba#6MG*Ri#B#$7PjM%8s!-0jo&gS+Tb4e!{8F!D{@b zW~b?a7UbPhke7v4{I%ht#k4ck3U;ER*!o|)ZeFbq^ZkS z{ADk$T;B1B0>XcTxZ~=vI~mvWAcOrXryDza9g)!kSTBp(A_eH@xK1}0sL8f9^8u7) z_$Hp0rm8y6uFZthW(1lHBYUIr5l`JJaJeZDN$r?Cq3!Px33bN%H!s&IDxo) z@!y_nSrD?NYWr+jA!bPOV8}k=%5_iA98`9f;AqhIWB@qDO|+wR6n1v{wyA@}9KP>; zO(143HT1H8y5j>#TC=ud2&o?^DV|+wA)C0b%aQ?xRnEh-a054Tu;6xf&0S0P?o;qb z9e4ZiFMdV~MfRY+U0f729|V=7l(0fyjgV973c9u+_5e65dcxZ`G|wv>@fBE#HlWlc 
zc-DEIH3Mp32z&Z`Rw_<(uXMhv4n4EFo0qO5D*1_~77U{RDv8z$48H#Y1I?} zaJ~U%E`wzb6fmh=7hc)?hzw+A>nV3w*^_wVIhzY8<(iDtPo3T96}zXVmT5dfiG+@s z+tf~(t55f}PLBcj0YNcTnsN%$6IyQk<%QS~5l9wkKT(O!!|076EwK5j0W|16!r_EXj!o|+}dG?o(*0J8@ zspb$c0v4!qefSO{ar^lraP%Mu8B=@;!&nx_3tv5;@wwmvq{sop#oFhihG!B?m`w1w zrG9hpOtYt82e{dhgBzE|LKN{f=%T`UX+u<~Z4nBXsS%4hhucw;^W|uQ`$IwEjyo2& zxhYnKiEE=sqjMp&uzOB24q!vkKogfpp>r0UMNFE1hxG(@cbqBAv-Jt;GowC7#!lcYQD z>wD#AxCcL61D<)|^xid4CjBdkl!`!f>l~(B>i0uAGIHivS;6m(LE?h0gFk|ibnCR# z>28Jt?qv+UTESEm&Qhl|`At$mXoC4fIn(lwKHeBCtpw(1Z6&o%Gi8ot_(PFJen=*lNvq8$2XQ-H_4QfT}R&aOEc`E z+C?{0quk^@?Ci6y0bBqZg;@d}3cQLTm(I88p09UlTpXfmwx_v^a=AA4ngH=bdOM@+ zi6&blVyWVn1H7RUG70S%`zGJORI$n3B}^a@Ip7`IvwR*sav2diyXPof_iUHkYLwie z-+w{>A@?u2?Mv&!ylK+g0tvhw-XtX?O2pLtmfJ%@brr>c2S~5V*sV#$Jt)Kp`b)gP zr@`R^ARS)<^G(?KWY)u~vRfbp0|3z{u#4Hd;^~aUD9h1a8uoVUUyv{t9Lj^g2 z^Ww9_eokR5wC)45$|vZJnABv$3RT*=LQrf!<(8bmQ#v+z^;-n61NM~ragSEiP5X}N zOb!q?$((R0>>eKgVF}S(oAqJf&@UH4v`qZ%k?dJ6mT*|x7}~`H>{eY$27ELVT9KuQfW&v{ib^>17+C_vcE(bEMw0O9!=0d+%c%?`&Iz z4R-Qm?R_~Rk^P7LVXqeudxf&v%aq zv>0#^aImcs#`0H{#47w>&dlqw9nlvF62YgpHmnucdN4QS>E!z0TI4MReYngY*)+v~q@B7^T({ zdbTRuBwO=fv7ug3sO&?8(BVc_*&A=b)wzedv$whxDb$8*R`G2Rz-7DOKQ=I*{kbt+ zmQlL0*jMTxwM`}+(-7r)?^~kQE68cL86RDJ=_6aY)iL#j6asy733(7Wz1+ZYyVny* zy2{GRIW;vkBbLu076_PjhGD8zsF}b>vi5n7c$wJyifx+)ehvu4#7WRtP7UN_`2bvI6Uh~lH4C}M0&N9-=%2^v%91o) z_L_v0?d`#i4;7*S+9*Sc|7|aL=<>iJnOJUCy}baHYnVaBmXHcsbP0$37Zn-?Yj=!If}4QM0YGacn=H#6YB zlCx=of?y2~xVbKC@S^CpEb()bUZ^CCZev>P7K3{tM4hH~DK+;#$a-V-xc#HZDxkV# z*p4FcUW2JPQ9+i!?orcak?5U?eTYxer3)N2;Y_F%3rVJP7gog``Wm5^aRsjIpW7go`>GKgpqM+Kwqy&YVMsNe;XeS&p z`*X7Z#^-^W;BFAa&jHOh@Qsq|3VRyN`&o!8gYh5F6iNiPrO|aOLPCiRGmDVyk!KP$ z_v=_-wD+S8Qoo~8rNrF>qpc!uV||0m5h3RMPxFa*UI@>9xgvUXjC(aXsP|xoLwn4f z{PpjX!4pC(v;;TZ2rM+q8XGLM%9F_lDiJ;^;dB3-*3ehW9Tmy-=FvEziQ_NjEyV>X z68D!<(qDQs&a`n_O4Pfa;D}AJ|I5pbk0ELP@fXr z3JD?Mv*wD({xvkP4tG-Vvt6&G1ZyQWk0s$V9Dae{s`!L9WYF>@!j+1cK@UF#*&Hx{ zaj?5rA0Yl$H?4x*c2OPEFRYrWSV=^wPp z_ns~ny{@(DDUGT*9Dj`QLA2tR+XI@z3IFEkIZH=5jFEWeDrpWVW(`#Y8|0c~-I>nq 
zV`8uV(XKU;7v%kIiJUDb@mtndAnZB4SRw}w8)5(bCKklnE*eu`&U>9^HA~`$mt<+4 zS^rKt(8VMgmOm*1=I&hG*r-?ry9~nf{frYE@t)wEA?1XPP_uzMbv?NqLXm)$P#{!x!_$)+ksY{Eg*Q2bHyICY z=+rOa{W2~P5uK1#x6QuF0!~MQ<3}~HZMBdU1P%9m#r+_kaY4_%Kc~P}29~fW%a*`@$KqP{N9kYl$Qe5Nk^+iBcLOgT7>$(+ z>XJOZdi_xu9RnS`shJBofg?=H@*EJO6RF*Yv4JXVB~hx3pkFqvXIEE`fFUdMMtE3) zrHd!x;*=S&ivw?Xl%(Oikfz@ll(aypvV^X2EbSpJV>7>E8o>OIgyT?izYo3T2A$=& zE6eo*e<+3a0X==PH`LJa?P5NZ&r99IZ%Ic?sVkO-UGqXq*GN9*djz@i?M+QS`-GU} zJR|LWaPfRvYGhRc#4RrO?wsWSfAFWk!S@M6dQR7~7HW^ei7z7M-eu_IZj>Cj(ZfIz z=3H5gXvC~LMo|CZjpQ1hyXQHH>v~v{-TTNxB&Nisu<;0_2MNS443!kcL4|-j90}hE z0!jTU4#&dpVEK?heQN-^%QPIs z%{#DKIsac)-w==76_CJ$U_%yK59y9bU72)FhEOQx{8nw$8M?c!{n}()t@~ErQUagk zv#^Tp6vzOL^|)$+1RDB^4J0B0U5xW)Y*^#Gs*vS*Hmb#1NWg_GE+JFnzx#EC7K7$k zK0JF1aWw_IVHkfeK@3PtfBAPjCg8M+!f^7a|J!Bv)*|TlpAe9e|Mk05R$dn~+40)H zqR{E<^cS~<-tGGe|09M2r!10e*uK|z^WCHL1l?hZDP=cUo zKxl$;@nAf>VbCHgwwx`nc@p?mxr{x>#YXENH6jI9f!Xd&7!dW#{<=(PY@#;FUKiw; z3$Gr^jB{NB4t5t6er=CKr0dt2({3RH5?`p|LOvIZQxD@w`c{#oM4gR{*l&NZ5hwgB zhf=NXJI~_B1}ozT5m|hXb^0R^wJJb~&K+QtE>B0qU6h(Y(d5nb&!YbLN?)bnP8Rpi z49y~{$uyK0_)yb_tsK`tpsM|U0u=#ibcyWovabrHN%6XTRU!0w4j3^qVE#vhzjGYKy10PD9eMe-&lK1OYN97_ z6S55=drHxPYhC&}-#~65D!??--@a-FnPe zzDp1cxjG-bmy2bv0g1@3eS0Q?XE>YLfo%I?*sY;Vq&6ZVLIVwl(!|uaWre`^fl^oJ zU0cB1`a@V33(W_@S@WP)gjQF*Ljc`KhgP>kqT+RlxO{c)bWR}kMs;rf_Vm9ZIMw7e%zPE=;s-~ zke@CS855tF*hM@#E&`U+-~qN!Ij@>kE7z{OYS7!wf+7z2M|c1DiU(wgz%Y>i$1w2v znRqkWD-YHMGbMj*5zoWRaMChDVM3S7;#C+*R1Q3Cr#Jcz{y*zdDl4-p%Z1vCr%@Yj}Pw?=;_*MEw#K?Er4S{*^| zO`PzsFwOb-`3?t1Hj{V~#N3NP#I=e*RJcsg$xmFf@GM2$ENTo&`CE(<)`^|KC8nf;w4csvGm*d zj9>?S?~4D)vi=aBUlJ&{(4R;o-^zI->J5D;tk_M5vCOd8;F2~5mWM+qA`m=Y!%6VV zPw_Ku9Cg;u7n|vYJ~s2X-Lng(E~IBm%$)KSIsfthw0sC`ca}U*Iq&;eDJboFLNeum z$rgh(R|M#e#D~3)mj|Mp75V##79xDdn%0-+Ga-Od*xV(X2QI`)?~$n~zp1R$wd_*= zVh5}dw%|$qOYs1W5W#A*w&rL2 zZEcAAf9!_#n!B(|x5p!adaqwP1c_=gg5D?2A~W<%(7oC~DOkGkRG-$GZ!TAirS z1B3Y@t<<^_%L$JF!A5$^&co!ltM?S1yPW%d7}M-oKFijwf^2Dw)sdC64;Ch7aeP$A z0nlboJ%n|bJVSvjv*=Nmih~%#sl<8t_cnhkG)Q~!0fe4^mEwMLHytBH3qT@8Z^JzY 
zzFZfn<-rcX%>fu*qX_(A^y(G(@y2KWfj#%>LV%3)U@NHoUh+j60^XJ%Wa3?rMx*UY zCZK>0Jy*Fk zY9Tg+dx@4{+8mWpDwun>z()1Xi5pqqBANT+>EHmxksW-X%x}rgL`o210Y%tXk1RwQ z@KLH^8$2_2L&Hf3grpuupNv1v@%PE;U)Lu>)NmM$Jm3H+i3tfm?jAT%j%4wnf@NJ# zvX*WK78;p$GmE5TEcor;gn);R097E|uLoSmJY0Yyg^0??f?WCva=sh>C-9j)?wHHi ziJa1}<>N^0hS7FRSvA~ZzD`9YVg73`zXehVcu!qH%Ez5WF+3ort}oiu>Oh4bbETXv zE7Dzt+s!HW_OYM=V_?8%{Og|Z9wX)OmVa&q_wRgY)X#a51A4T?U|mr+83BVe1?aSG zj}!rT(eEbw>k^nz+i&87Kr7CO%R9rUf3D2#PiHG)K#gl5b}~TY4LzgoDt&!BjH4>2 z{>ppUUzag;4@eM3b{FyZbGP(50^ydfB5JNnBbG!)0FnNQzD%U>;Qm!xQBx4eS!;s< zvDNt^nPwZuciWak%4Ina7FnZa`^_Q)9iQ#|K2?LknG-)|-d-T$`+45Yk z0+o-&SstjX?R>p@!Iddbk@K;aN8))XLqe}>EG;yNLsr0N!pdnBwF&sPl&kZiE9L>KL4X zuwU^1#Ark!D4QDTWWkWNw|f zEuB$-0~s=dP?S!T5ZVDoMn+@*Zzx}@wLUz`9yvHT$a{ObcaiL99BZq_%7?@Cs_xt=O#;k?;p41>NgEryaIr$!unaEFmPa8=Db-N0LJL;-y# z2?X^W3HBBFfAF$DEUe){gE?|26@1+K2)Masp8Z-P;5!9B!`2fYgCYqiV=ZS`yv8hp0n&-^P3cNX}&`$fz#1l+_s zA$0PKH?n)q{+mJLp0n65ll=x1er!Dw<1Qz_NbBzb^R7LL{--+qLyY@swCSgKjh)!% z*Av>EIlyiRfL{;wN_h`}^gBpuXmHRuF{0vmk(B=aU7~8nO^~K{T?zcVU>@o3XH~MB z_s3mZ0Fvnn6jkaHcsjihMS2-7_Uwg-h=@Z^9Yu%Bg(v!9%y>vFgp_l@C3FHShMK+v zIOLk>^S_4Sq>tcS_9h8byR^&DR>5IgrvaE0PYX!02+oy{&FOf z;EovEvk=Y8aGA(E_aQ<;^^r#|53oSu?KKR3w^>rMUgTQ`B5cv=XG z(SIf%655*llM>ply!`$*5zy+>A`YeE3OL;0P#R6pfx=B)To=Uahw#(^t8z>H`ys4j z%(1pax9iUOaK(w=o4A(%g5y1$QUH3?Fib5cCA7@#e!?g_ulV+&K8l+K>QWkPb*nYX z8UIC$sIIgj${v0OZ14E(nlBj}8T%{H5)1tnDMVA6sRmr3ig0abz_memn%c__&U7<< z|2WIvn}Z=L+0)Hp%;h)Vro6qq&G+_R5+myPBGAGnftYFCC#&TN`>A7PkFS7q>2C@v zymJ|F2;H@xF(0Oc7I-D;&dJ{{N&_aahFO6H#=DP+S-B6_K(yF}MDG6WE$CA`dt75D z6SScA=AbIKpECh8g)b@%10uBqiu^_Mrzc0oWJm{_h9m%%Kju-oATj*j$NAoN5vwrG z&Q2ll`+xLeK49sNDYkMAB%~BntbY+M4FXEuOS%SL_PhDOWOpXmMRXt*c)>>R_^ThG z^}E1gx?d+HKrAUh20)5dU*VmUFey2Fbiz?7i4wm(*4lq=9W@?oeJFo>@)-v3J6nwM zTYAO-g0T|-321?ue01&y6}fd0#6ePRJT8Xph}v2uPdLjnh1hQDp(0#4R;{95(^ zaAfth8=r%qZ|ofV)cz(6HZ0PPieFxy&9Q^q0=z|Ra@q~xLq!o@zW+Ke?}2BiFO8lZ znV@Od)mWZ;CefhO*$5YT{5Lg6e+P2>?WP|F#dk-~)!b^wTea%vh0=bz`27$k+`C;( 
z#us?wzlH0F@oblx4KPk*Io~EFxHPrgIS>7Z{BK=2O))^|f9ttYPuc+tKxAE-3~u;Xc54NL~Km}o@KDiXmV-J|}QrhY#| z`n%DhJBfif7?Q*SiqixnkV_;KB{6^ih=BiCDVPv1criZzI^17t5XM7FB@*;chzXBp z0fw^w^&!YKDK`nSo_j+vIESf!JWsg{f2|lp-*{7j`JN_l=1`PA^S_;t9_EEh+YRL9 z+PlgNTw=gw7GNRk=pO~MqQk=2R{)oU#V7R8T2kPLu?)0onkoY18IEhLh~bp~O2t$0 za)9bS^Qg{E{8^#>^IG6dH!{ej?zY0#X5TO-4BMQYo|JgQAE3-19M%TXFo&4JBV~{`$3s;|_z?bK-uc z#~=DvkB^U4b=PP>^hH8Vxjr=luoVxC*vs&DVgXIfp6q|cp$geTs_FIET>>j=Jb#uZ zAn;}psCa=)VV3JfHL!7?8(%16$JQT>`u=4jAyUW^{!b7?Qs7o+v6L6~dLZgSolGbJ zE&%$<30P=G`)Shb<}+WCSY&?^N~#J6IR62L{-_1Kz98_Ycxx`_LB5YkNwi}Sf#h^c z02?oS?@6|}14@wXSy{GoXSdJ)7u^Snii1>A$t5IMV&@Ar$pdjFhC&T-ha47A9AnN; zKNhIq?e57-KVs_b^q)+Y?NI;CReSMZUZuIO_Bt@;=?Jnze+m8DLks|`Dac)3&1x2y zLh=&BS|;l#DI(24vs8^i+&}ogYYD+aHnO$g4W48*EsLRl=|jS28fvC*Ajfu<&O+>x z*E=jHuB?TH(wfIgY=HkF4M;EBv9B=ydOpI!&-ORi42N@6SZ)I!Ud-=HkirDA+N!z; zcO`IQ9|^`mNM4^p2kHA!yv)(R9n4Srx+dPhoE3d`kV;3fLXY`s1!OW^a8WT7P>{bu zPQYG1aF!IPRYX*lRwOsC>Fi6VAZ}Ct&%Q8jggAFR>yf^l7$@+w7H^)Pld_y7nkE z=GA{B0;B|d%V0l2cuSUteGIs4(gT+?@bLx5YSZdE%Z3lwm{35<*-=PboGYz5n{9Ck z1IzsZ$4e5EXMj<}-EW8f7bOIUK%-eM+`edUQK$&*``Z zo#TgdE?_AC@wbRK@!Gk(LF3w&RrZ6@y0Zv2#L}8QA^aMBL7b{w5q)BPUYZ zOdzaeH8+V~rTD+Qpe?lbnqYG`dx~8`syBzXLJaOF`0BXTS|_Vz)|o(AEp6j-YT z`pzj$bKK>%Hg08KZWUZO0rfrxZn*iOx*X;5w-u6qiw0~@nC-IQ@q^v(OQsGBu-eaC zsBp|TnnJWzxu|f>1)71&sBnq<<1G#j<|4K7X66;JeH(}EK(%lx$yWrwCH(sfDUGS= z$GfZYTB%!t{$TPm4%O4Z!dYBVxlj@9nD!qq0Uwd z7O)$J~^mQYuhuOeRG}y@rR^zj8oparlY2wfx=i_9qqU z|460b1*-eu2=meW|M7L*@l@~sKPR$7W@I%`Sq+qV5NQyhV;(D!9a%@lF)Oo0NDd0w zdml5Dy|*LD-s9MZ-}`9XzNz2+!^6!vpYwXZ#`Cq`Cn2k3%RV`Z)og+DJ;5Md8;NU1 z9Q^u;-d&LIZA!_XX*e%7CGnQ1tLu@-e;W6TA3bl&&E3~wy;bZ@T?`91mFGVf4XmEA zA5|ximRLpt!bb_8AD;T56>4d!M?P%?-S`IN-=7cbtFOkEtw)O$hgPM{cnoKR5hyE1jZ zb>@Nt#;8L7+(1??}_z`7rNu-(O>JI&a! 
z+xuXfyh1uc8og4(+xe6Tdz-azu>+l-?`ZSj!2@^QV85cS(6tLLtGnCRLeUG0PFx;u zkNo1*uU{sR<&oAnj^gF6A&Zo|t8vzDOG3cT0(!VlS!mfsDkb=hkB^(6&)hi&^k)JI zZIedhXv7zeEH4D>Vi5~b?+Evs@G#kscq%dgJ zAf0-DA@sD`8NA*H7T3JF96>3R$SD7)UHk3)&ZI~d&V9#xx9s+IHo-h?nO50TVTY6Z zae3N7U8MW9Sx4;6LgO%6Y+xB2P_Z_Z+rWb1-XQ~sfVcwA*8|3-wJE5#x5cH8E-JISZ8W+wrV z3LSgO3FROF*h+Zl^h=Eu>K3~))X6ph2U{b?0=q7D;c9!(SXM&<9^ z8+rA%KSN$Q-Am<3*~fT$5p%0W-=LFFeH{pzYHD_NmQ_bbXJF_euaUEuJ;v))#l79p z!CLFx70!K@!U07cMZOwbdFo)Q4PWRAWa({;i@TSW{;Ce*EFhU}suAA#goTh0do!?< zd}eMAqsPlY965DtN=qBlJV)KpzdK&mwHBkw{ud@nlK>`mNgfmcTK8b+VR~nTcDnC1 zU>Yk5z^1QyJ%iO%QAvK1(GZw7f}yV*QVrj)$T-G=tVp>L|b(_1CbBz=*m}jV+^KdGFKL`wwzrWw*{#yZiNoyVnrmv@WT~o6Ll`j` zyY5Gu2_L+uW8Qjx>27~+l8}Jrac_{jQnGSW*}8%DolhzGRIW$TXL%xUgDHbE@t_spt0L0wDG+{il9Kfn$XZad<0MJb^c5KSL5~1t@Aw8m<32 z-{Vp|3jXmR*R;FlHT~mfp6yo}s5N2QOgy8V9zVEG6D4b1!xbCyc#w(+d;A#d5dn}0 zSdV_1C7}Z;XbHf-BMrT~8)f8EmL}JW&g>It`D4PTo(Q~)ccuiSG=`MqwoN`nIJvpG zbtQZ5*)AXXNp@hnMu;1^y4bvJ){&+C60Krb2d;~kcPI7HaZkOlt3wi!O)@iQIwIVPtkhh!aTcM%(9lQO*Y5?d!>Pn*+Q-nVjUbE=LH!2y%cp>RFwso zX4qKigO>U%J;8mbg~)~Tx>rg?{ML3>EL5~v1vpru6^Ut+mEh`P31a? zd7#K(DMOjV6LLW+x^N2bGuSwsi(Y(5a5sdZ8OBlZ`)RB>t4!C?zg|*|f~Ru@f*-!J zz*J9ignT?mDkqIjAiq!QA)@j%G9jTi%4N+qGAkKQ+t&BXHU9dA=~a1zT-j@v2NkA? 
zVl?lCcbv(AdjsBHI0=^yQ%>JCx;REw%ho||6rJCmq>HWD9*mjauO^n{Gj5C8ECSEA zQUFE$vZ_h0(&*bdhq*vp$dwp2agqmd_Tfa=&u}EiuMYg>Pj+j+G^jo~ zM6P(!l41(Cq^h~v-fR_P7&dSc>>^tMWkkJ!9+5`Bmiw5A&McZhz7V8b)%za&+g_+_ z-G4yB8lH^d#y95#8kqahpCxYv$8N=YB3#+f56!{&r{heGvNQP0Xha)8pB`PS$I&Hz z3U61pMBGA+%XYFk9rU9A0&f;_D4s--tyYol?o)~RuNLcy6H)Ma-z(ApbanIvrO_Eq zObqQcE1z8sPPA13Jd2>&yuRE|w158v9~03L($3^c^|WBW=aHEB$$JeYo!M0cM_b>6yyapKsY5Ps51-5} z6>Y}IjBYm2X5Efkxg zbY2v+K1|QY%irASx(sdR{;>2H?Z`PElHCHM+dR@{ZhXZ|NbBOgm_ACc{H%^<`ILF;I?+n@futq|yL7$i# z2{)an5;!m$4uG-w(3Q);9N-+`C=5We{P}_6Fmlmv)m5?KYB>c=O+)U^BE?%#-To0v z_J0v=6$1~+Bt4p`-U@+x!zMRe$6K#7w!HB`DjB9;2go}ej9D;e?AW>+Ap2S@CBbr2 z9JjH~CVwQ)vaRIuoh6)%x{o+qa(~4AYa9@IRps>QQcld_zsd~}s zmKkHm^fCvRP9!Kwd5L(o)dSqyu}hn%4!fmCwmBJnx0jqfSl@TzZ}J(#dw-wi%-!$U zCkzV-Wu~F-F!xqDK}Zsy0JsSkj=}^xov;-KS#axuM1NDKCW@6L_lIQzB- z``8JbnRhih<5T{y>ce2mhBfvrar37&d>xWeN>t6@zs&)v-1nzB62ry{0^d(5F53ZJ z-~+BQ`LGeyl*Ft%$)r!lTdMuqL;z8N$Z?`uKFwSmYOtPW6u8&htQS!9)~ks2lM6xL zBgtFGgWIA)&w`o^NB#44zXyw7Dx^uK=+UF+DJI%Qbb}r~G-6XIe0ZdbWs;)R@=vK?)hvJ%#xn04S|YkxA=J?p?p5#dGZIr?t3So(LQhv3d&Qr^smF{mxDUVr-QK zc<58(dexg9Y4>vvge7?9X$=FkMp2b1&s7G^UD0nB9k_e`a%AH@zbLxl;{dD_6gWPh z;H@mFVRIl`^Y%q;+&01-VoodCU9xgm6H6D-^I-UkL9SebDx}|vYjxs#wib8`Z#pB4 z-5f~XdXQvzNs`a}$X?2{jGR1wbK!7JWPJ|fxe!!6u@QhES6Oh(5$Q7!&XY6VqEeMoBT?c#T* zS~eh2=Lc;Ekfi8umVONhq8kfgv4P&+LLK+w&7+$)&Kbd-HX0Bk^EN7d351Wb^pv0di{!dD$pZk|}%nSs8jFVV}jyvcTfv;x$J9>Wa?z+KeuS?6LI~ zM7{&jd%UtCRA zuTTb+Mqkmas>yDUZaF(`wk%wgA5K8MD|`x^5kqM9{tG$9*qwBa~AV{00qAW06!E$Xf(xFkibk<2V z#F|2i{KUbb=n{^LGf9jN-OGO=QqUG(pg9ZU5Pip}Mbj}oYjFhpXyFOK7eh`S?eKrJ zUYUk~alum}GvL8xMz8kA#~GR&CNu)8(%UWT?NN3bYjBRS_g0v(-YtWhrBG?~H$R&N zRvxk`%gs#<_|v1sZmSmycv3{x=Z~`PPpLmKKJHbt1{JoN3vTN^>qe}-;pO;ZfiRlT z{zp8}^z;EIkmt#hAxg^gt#J!U)BP2~^XxeFtxR5y&~R&GHie=ezQ;Jik$0bsA3)BM zY_DW%uGZg@aGbRfSZtb{9F?ek9UKA=xGfd*1l@f)u{(H?*_J+1MoBYNB0thdMc?d` z(qS~-leUJeReSJEp644*ol6G^StTNP*uS4DruH=bc3Dhmo=bI}O z3aQ~w*`u8wPkV)zmxZWnkpk0l{5L^i0Hq)O5;vBN*!4+a{8<0n%kRzpmf4p=NydGm 
zR?@sz>JF{&&-;A0wc~FeL%$tXU?8PE+WfK}D1`G$jmlO$2WE;TxVAJIKGWCR9_C%T z@F4W>juZKe6rXn$O7P=m3I|sZVqQyKU*p>+hrtx1rnNte39vDbC&WToV2Lxp0w`U>F0LV%7gTOZIgo^$29#n5)vzYI5WLT`*KF&`Jh{D{aC z;l=*-n-~*qM}5D#HLpN*fUXM#?N?hINU06b-^#=u?&B#G74WNkZF?&mHxaq0Xr(dj zZ(aV(lI&Y*H7GpawdJv7z8?wTW18^!Cb8|}M5|%%lLh&i4E;MR&YSn@V-H?k$7WIS zlFkI$JfArKr+%Q)AyPPt6U7rgbEK14`w)?CdKc^V;|2($NJCKHJ{{CoU={@S$DTz+ zMI}tzB+D2%pT0NKH_|q?wC1-vr$;s&INg8EdY=P~qkl{~Irt+Mb~;f!S*>mV=x&A9 z8%ARXaW?*|9#AdF=R;VC-8YsOvq@nWxQ*CNor@;cYr?W4txF=hcwu~hlInr)32c1e zCFewQM=vPjqT1VexAW_+Dx^Q{TWx+afchiSfo)|s)lWBoP8}KI8MBH3;lLu09=&7(s%MKhR!PDM@c<7>WrPmb0!dRmB*tnQ{sa;%kYVg znW62->`tK~j6)8xvMw!Bf)eMx$*s9sSxc#gBHf4e1P~mJ>8W>&MJ6KX9g3xh>p->d z=pE>uR^kZ)ht)o!AezX?Rzqrle%(AcVnj7a2Ym&1$|;67blCkRM9@XP5;5zdobb6l z6AN>M=DE1dxZd>l?q^Lnr;EP1p4O0QbiSi|mB`T1ZGdiK%WJy3c_&|LGcG}(-J~8xcP90G^B_u{u&oAzq>^F!71&Gg}S7VvSl)67!3ef)Y`M56a za0%cD;=R=SnfS0Bro*Yk#+8o7L97ukb}V4}?N-h(_M}gwD}iP(w4hd|c}VgJm!mx{ zBkBg|StUmp_r8*gDHN3(0#9tTLQn=MyC~9qBL!~Mc|hsL?5jj$AdwX<4lq@&i|q^7 zQ$Oeu6H!m3!2TCO(G#)l%z{vF$IxM1V0WH(VAuR>w)?D>!}J6X`7ZVvjAOO*{oqO) z>j>Rh0ou~Db{0<=%;wU8a=;UvutN8kW(>m~Q{#v>hVNvzAn zAwAvj1>JBXVyXnMC~}Jkci6!uhfWta(aoBf@u7O+~!B%oGA@l9f?!N?1N`BsR_ zWY#A7uIZ6+J9z>br#$-bYE{YGF5CK+6nc-44D+S1H7Bapqb@u_&ql3pSGHv7^MOF&*c zmCDVBP(F2!!%U)q&W_+Oi6r%7Y987a@aSm>s6WmnId0TfE@t{Um^BP@9dwDTBatrcywy$R)cIv{yCeBd6l2D) zrvY)bq-&*BQ2!2zmz4{lPW12myifX)vWd?n|1^}bRPv<@Z++CVOtklMTST{dQXr+g z(+d-N-SG~3o;@=kEP8WGKMERt*%XGx+9fvB%P!xGDStb!@+Z^q^9C;gIGel#9Y-^4 z2 zkzd)?Pjjc?qvlY)q0eH_WJxGgOlCx;(-je<^k20(BmfFmu*#*Kc8Ji%A>gL`$3$L! 
z(P2DTOfgP6W*;+{Zr-Dqt~U}BVL}u^BB}TtTrT7h_A&Ha)Z>2fxff(*I_Oee?9$b^ z57Zhk3l z_M{#6X9^D)aIlxq=ezp+KWu4Q9)$1xkEw~ZVMHwWDoG5Ore1C4>5+t z)bV@1lzDkDV!KG@d&Cu>#34+_sQ=XS!SL%?n}HFBdvSJ3AFApFF{yEJ^;tg4x0%uR z%mXQ9zzH=SQ>#Hgvs(MI?G_}m(h=nCF?|5gas`?x^vDH19@}aj?}=8)JW)n5>*Mla z&b}t;GT!a}uU+ZCfc6e9ht76jh2>oKg7j&p<4w^Yu2&u@f?`VQYZeEQPSLv@Uz5j|>%}A_!b_H(Mpk?dNCa~0$%7Vwy)<3yQ#Xy2 zSnVq__sGa*?MG(L>~Tsz1!76e1Jf-gQ-!nE^ElutTZ`VQ@mXky=`q|*E-~cJhFyBf4=ax^BE#d-llMM znO(m`Zk-XZp>ZWZ#ccKUXun)os-)BTJDEm#T8a$^b=5%#Q>*oBgvsm?wK<{+ko$T|sQT3LqBlSVrZp`-a2h^HxJ1R8b2rPBtNm3FbrVM;||Rb4od{b;ZK!_Wk>BAc{mg5oG6mRjBVB8`tSD|NSF#_>}2v z{Ri#m%%bFG^9BWGX(wJlpSTpuAY7RXo^^r#Z5#4dNxztw*l@+u{^_D~(aqgy>{b!e zF@bmp6tTmUytxERU8-u%Stzudho?Hc*?v-f-s96ry)S0a7qHCoPHOYTqd^+<)SE)= z|0O3DdK_GYUH4vb>UWKD&z$M*OlU0Efw_`&YhqycPofwdnztrRig|}}ndx}6{9O>u zYhSVfYB70s5=Y9#l?k~k&)3{&$n=LsH?HkjKs=a-6V^3XU;c_4lVeATMK2%XSKRcbl0K%2I3|CEsyd%!ZxZYdQeZ&_je0P`RXzzKg836t6^%KGeB>anRZ>;1|$CQ=s7}EjW*rGy1iS&ZWEtd!wzdmmq{~ zv^)r}@$11L4Z$oN&N7rnOCO%*5zW0nr^NLlyVt^@;4r5(x1_*FUJ*<0)jH?P5x*Y! 
z^B2&8H5x9l(Ii0Iu1iQ20fJB6;!Nyb&cqieadZljiHsWl4xQxEei_;DnOYVpqxZ+I z;vL-w(j597^ApbDFP>EvvSFY_Xy0LGadSaz=bAS<+%^Y1 zHI+u08rV30E}_0gZhZeaGz~QMNuMV{UjY=+6KUL75EA&9BFs|RT(+l&xrg5DZ#D`d zkkk$2#nOI*9&37isq>s~U_%*xd;l|3F(XkWOLZJsToT6)ouc6ov_VR5j}$)cQBR-n ztLuEg(8aRe&L_4TZ0xS`#~^_n}uI+Pe(Eq6@G=q}ewSmVXaY(p6^TyVoe`4#Xx)DR;G`LsNK zaPjF*<6v(uY0PB*^iwh_Iv+;EO^d4>Tm`UIBK~uhzL-#Fq*`y-x$3SAR(sf=0qF z_c?$p!uO$s)@*NarI&nFw6GtJ0>EQRbO=mK#})j@$3sM7joG$IW;GO!&93(Lm~f@NOM)l0BxL1h8CUTT3{xpic*d2ZMGVR1>85`0>9k<1 ztiQmcW5yc`e%Xm5$FBS;L!z1?pgixOg~5gu6=HVk#YQ)=^|z{7%WMah`Dr zyJUI`RZ$RFadJQu^*!K34=Wy&038Y= zket2kkDB+Mks1Lb4XV&hqeqsM>$a9v5Q!}rS0Yj=!sSUR^iVvXU@O@7E$&5<)awh@ z(S!$#`akgOEch)yHR%$Yt;ll6Mtrc(3o+)C|E zi+qYG7|bLXBzN}930HzfN-2m0{)kopZXg{94H<~TU~ikpy-LCcgHrAkGk|eatpRM` zkx%{V0`*w~x{5y7K4$vaBly|!*Bp8c=aN}_so5T-r5pDTir(oX5~s9|9}K+RX?<2f zD@lQvR^cW}z~EhJR^aiIov=st>Wn-rlK!NnK4x_}+G9Oa682npIw)>_X>4zw0fmG@ zwAS)-Ax<>1@!%jQR9_6@v%ut|@_@ZBp)(vQBIhQZpQr__2{4&3+M<4Ax4$}Evr z_)Fe(O{|uBGfS)B)ZWD1J3i{{!}Q&A4^uNv&5R#-YlK6Yc0p4ED z79^<*|K(y#uXBKkolW5DK_wq-1T*FAkF!OQkMNat_FNDDgLol0<)B0@BvjChyZr|HMtf4*YUX+3-^*66u-b3iU zQX~9zWGWqnDX4MbMFUsySy%fk*)+Wm&ND7IH?x5Yx4*LnAe;}`Ga%c9)8Its9lBZ! zrZ?{BgNq{tiz*RuQ6r*%vsq9XtUQcxRJt|$F8`E!*sMI56=E@gO~S#kOMVojmy@27 zPLGyU6mA_Z{>uOfP=_N&9=odC!`x^GaDF~Q{F;nxvKshTMy5`w6yqucJ=^iZH-|9? zBGw&!miAXZNT?I}sJ9L^zao!n7+=otdO?MJ2R!_9X8!6>Y96vaA^I-zQ-klj;BVu1#)btenXY@Jn%*xJ`p}0v^ff?A7?QEovG9v8?81lS;I& z{_2S{ps$1qfamB1L)DRIGGD-}jNV-apQ1z0WUnlsuSPnpzJX*+qpw5^+bvIgy7@zY z`>0>v%v-Y=H(bOC7pWCD`uop@cksgx29n-PAP&sU4fN%Dk^a!OxB9qcf>MZ%_YcM| zlt6n$oO~I$K$Q*5uF$kyi1XzDUufm6(&5L1d>o?N(S)Fx)d?Lr_4-`gSFnKdzb1x% zitia^2TFeLWf=wQ5$=>`SCGETV%r43OIPIdSAf+}s{>j~vZS^{z1ZAH;AIYq`so5g zL{oUk90C=;XrZIW%`dGB6}st$q|DnXyAqOInQ~fF^@dXx&tE!42awkx>gUN+ z1SE?KF0GW@d(Zjc2spuYFl&)&W6#<{9u~A2qhY4Ny4R!D?9|up71Z`AkC3u$xV3k5 zWhInDL8g^W=5>7)h0FmCvArDYIc*|yi%VN;;(@ESJY;OS;9D>f{oG$h1(Qr011-of z?e4(OX;&LQaG_Zgd5O(YQ4)&!uRQt-vINf{&k&#w&^fe{3rN)CNG1c#_$NVIzMr1D zLwiIZBY*JP
  • %NlNGZY1Iar$sX>Qa28jU7H+xA3+`krcXHf;o> z;9JPrK9I9$UAYZz@*!y%mlZvyZEuHCv(sk}X7=&i1rwULzk|3zAebq*JZE-0mqmE( z->QEjLbOnp4lx~8=5g$Ou9WpwkMEZ$enO_9d~8W#`wyXG{eThSDpbE`rd$EJ?wUBX zv5#(E@=5|4H?cTmg7|vQ8dKiRATBw)emU{6SeEDlK(1J70PXqLwx&NsxD-9#6=iLTk}p=#_bo04QM~MHyB%j3UGf=W8i|x(bN4) z1Xy4uD-C-Mgq>IO+RYf~^Jzk*4*`If+gmrZqrt1J5Z=_(1S}bH-BO4*UwodL(rh1K zXu2-|1bNM@Q@O=V3N5oTQ(AmLL9yUP7Vmf3*mI$VijO52MF~nNK3HzlovD)5l_ukO z@W^~m5`|C6rMws*j|Zcu5cy|yg|QO(B1i#Owq_DCncN4%CV6@oh?vCtg^JU z-6ribuH6q@g{9^eNi9AX@`0uV35RC@dNs6kMYc? zar0BitqnHKZM)NV52mQ}M!Lt=7wNr>pZM*!s9FCgMm7Q8UAT4{la@YT9}>#n{H|LX z{?E=AXdi_R)o(u+$UaYQ)>FzrFL{xZzNa_wka&_VK~wX@)B9~iYK?4->x)+b>%m$Q zZ7@N+5o&3zJXcDgyVyA3Pr9S@u|}YnnK=3L7V+y68Wz%fq$ex&3WFK1WFj!gf z7&X$VdL|zxuf;tm$bZ9hfb8ixaF(&DOd8jAHrnQM_;Pdj@5ZWM3%B*?^-GW1ChOa+ z>|oPuOTvQuIG2Vo)8^<>H48T>| zvUE4pt0M!VNcfc7ENpTiPiWsTY5D}07s%9Om<@WKxP4MyMi%{4yxt+s_;D-O#=qF; zdC}_g`6Fa4wEgIg%1=;F+V&PHZKe^e+g#r96rYPD<8rnTU#f|ix9bqOEFBYpqTUJ0 z!u_tr9`iUUwm?kna(b`~f;6F0bcsVIA?NgqS49ldi{Zhnze$0sS0yc$$rEN{qfAK#N3D8`yo@O?EB~S${RT7 zHzs*2P7|csj+jetu~$^*t-SAscn25Cwyu=-g(COHB-_|`Kz~7_O8TRL5K)29_7yqJ z)deKmoykW(PWikfVccvaw}DZIu~zwfP=x~l{@tX!Bgx5h+VwA0VSH3i^z&lb{-;YMFOI-n}`=;eFB>6Z&jortSCjEA|KFF}J?x(jYA+9JG@! 
zIz`Jgv*tHRnsHMVhKo;VU#i(wt+dKk5d5`@j~SRh_cqf8P^g9tNm=`HVPDmE*Pk^T z>8=dCGLeH`m}fRam9^VKuj)5MCQxLLBfK_)JF2O+v&a8>gnROo7-IQjqO;J;^Y~U* zb0E--vuJpQuMO`e*U7G4AB5jPh15ixC(A`^B*(Wsw*BP79!i(hp1UUz;!rNlJT1{w zcuS$~GR}rnFqWZ{vz+qRa}~^|a|;Buf$klS_j%(ltw$-ku}&lqoRu zEH;)Sq5m2>KXeXPbo}lys5Qgpy~*e5s^-!Md)}nXnOEgjJj_HV^D#Ybw%Jviu(3$B zFi=3pR97KfK9Q;flAkJr7PWKxv5BCefUr0!%X3Uh&zcBMaII z)_cxd6SlmJorvECDGzX+pCf`Vty*R5NIHrryEk z_z7c3cUV4BoMiBWajS^+yTvYG33vW8CF3ci7eNhUUVEC9=y$+_PnJj3|DRNbDOV+VggH|@gmh( z9m)Hrl+USE$aH&#Ug@}oGFTw0sECgQ@e{xhDl01!D^1!+U>kkG$&^9w2qr){`Qs%o zw9jf|jctV3FhNbk+!Tzv?m%K{$#h zOJv6CbsYt$FPd3(GPb{aqIev%QSJOakF6iiRmY~|A?Rqg>@klZ=}?{={S$L5mm*>w zF}Koj2VwvY(r~0qTK;t7F|^$MMB!>FdeGxlEM&ws#v>Oe(BAWWq2;~Fw^@sC4O2Bk zs88%M4=pHClNGM2?|y2JXg@tGKa2vuW%RKs+AeLJ)Jf@ECx-o)^ux1Fa<$G~v7FS< z7g`cA$bm@>GZ;T|-y=l1JKWuGs zj*{m3*9*}H62P}pEeD0suruwY&G+hD-IXE z**3a&eiZ+(&2j8t0SR4EaWij|>Ig-u@EHl#iZUk%~)Awh&mVMUPv@ z)o>&tVTWaxIp=+<*xA`=+W9o*o2%>mmRnb@nyP7D-}80N6i`2-WH81YtTYy@of=^j zzBj_^<+fuEEZ?WMyqzjSt=n#1+}mo8LEz5V&bhfBP>$sZ^Zl9noL@GIo1HI$r$w+h zZ~j@zE?*WjD|Xk)Sft)5+v2woQz(f$Vl$N_zBK8y275xm&)fG>bD!;2PxETD9h2|#HwHEJGN?<7l=w!9%#i)zk4M`31B@3HuWR(_ZM3A$V{#7ELEqZ=sIJq z+M!|t9ul#jBSun%5cA_Ur7-*Lr4llQ3$4#HB)6r!pX7QD5TyD=hhdgJY zH7(kRZPD?RXm`P-yHc}%rKu)s9z44(r@dF4nxRFySc~KB#`vO|H0IVrpr5#MVe{}q3dGX9%9LHV4draqPVu+D=wYrJ>j?G52EUAO3eA+78(b)0b_1mk(~lyPUFjNm|B_}h zH0^@axX!XvRYznMN9I=lJmI+U1mq%x$hG$vg*?0>vt=70)4K|1WVX5|UFDRdXlc5o zl$frKMNy*4MdVRd>@A|mDkSj|NhvHQ(HRd)Z6}?uI4oT zz|`%C-D>r*Nc#P?M24Ke+$CFp#lb|BiL%e2;(q}pLVR@D^IdLCavAm0tM+p=jFa^B z#lOtps)}I%Po7km`q?zMhZJLs^UrZ@wV=`yLlt5!VwBb92A0U?nL4Wk>S~iQbGvHx zz3`Z~wBA(a3PUo|LCiQYSOs2$JI~&V3MchsQ#_Y+C7s_{4j^PycSzmCtMkBt5vya4 zEOrC8wM=5Ks{|t1gbYGduRA)G^(jw(svzCs;+ijK!YHj3g6JEbMcUCHCw;t`7<@nlCEZRtnfRU`uvmI|_A{7G?0Sm3wCVaT_dip+9T|HHh`%a;+LWy`eY7 zqSLhZ%%55a3RjL{%jEwd>n+2&Ub=8$L6lOuOX-sC?(PQZZjkO$knWal_|qUQAxKDr zbPCelNT0#&e$Ia1^T97(T$l2jnKk#iV@HEo#d(fWR1MS5i~X)Aa>RpWW7R zs#(#f>5=#Y?I~=M313k&pV%^MuoG{*!uNVDQ-~q{MkQXZfW}%{m{KnO6Rz{oj0@|2 
zqv$vRIp;!aVVYu@3PTpbn0z?>fSdzz3A>-PifVa^w(EAnXms4(t6s3!m%n`C!4|lz>CI>o}97_nNqd?(?+$Ql92yNoTdA zw2AC0RG3KLL!J5JWUKeNyoS={AB6Me>Kg9?6Uh5Y*-U>GY*3#R|}Ye*X2KkZ~S^U z@Lw;0Da&#{6B)hnM^%y7+f>z@HqSdItGaZCD_kk;((U(kO;g5wc(Y-SY^7`6>7kLB zcR5eqO_~LKdioW^KsK2IeqljJ2%CKk5=O&)`Y|N%PN;$bdHke$Sf7U#tQ#1;R$t`G8gw$kCKPsz<5i#3QL>9xGvpNi zQxwq8`gvurSx?1;guu=2slAzS%DnuzCH?2#K|K$?;i3pwHtFf#G59Y#rmhTx3@k|RkDVks(-(jKDpJm2*Bf^CU34*7b^$yFBs~| zRa5%mlL8UY^U$Uk9z5z$uZs-70wq&y|z) zcEn;>?8mJlq&Rpkr|{mIBTIBSc9*cHBq!ZtiRF!dh>CDoCSj{^y+u?QWBACS` z3-3;c`tTioV9UqBbNMwf$M%aG)^yUQ;Lm2#nj2d~xIv!D3IzVDvP_Qc8OJSJ zEZzV--vBdoXz{3;#2dSd`e?_j>I67%6F+?~ZM2(#M{F2^2_5xjf;77eV#+q4w$qlT zEY_-kT0Ai~K+dYp9!_ccNOx$4`pdzezaJJ46n#>y-#;LIG$hw0+&UAW@U;?2Xn%44GI<z))V*evis97~j)zNpA59)+BD8Xw8pCZ$!k1)Tlao-OzAu_! z8tH5Fi$%M<((^1?1_Gc3A6)HA3IhR{gc>jLihQ;B+~2L{Y^&}Gbas(O5Q2whkCQzQySCAaT*3W?pKDj=9VYUA zny~k$JqPuL%6!18Qv2=RrTw@pIjYogXQ~ipvrWNEyi*?avIp&PLeHKrY@8kM+{9>1 zl@`p;bL%A&h1<;+KQatHr_APz^SgXq`~CNp&(+R3PrTyPmU!;s;YW3exd|y&g#uO7 zXd{DKUmG&(cAqSRy(KrLvEYdkw7gLbT)F!tz-M4w=97K0db`zZSLja87R#bv{h2ap z(1w^x>=1R=MC?7udl>TSS8{eT$7V+QUv^m8X4cW9AV&px;vr)o*UCVjX=Uc+dnamU)o=-_RW+56zP`jA2o;~K zURwfZhLZydYsaVQ0Tj}M%cqr@@QeW?FfY^^-QC?I3e82(s(J&1gMaKku!FW*IpAyT zkHXj4w@=Y&D~in*8yNURi(u>ES6fnR{CXmdBdEam*=yeUR#*mEqloOfURmTUDMpb_ zTy4v1kKb~GJC%gH3U7J*I?0hwz9;>*thdgEV!MB3CTTQg@6 zizOFgj$O&yv3=_1aM0=V#p8wuOhI9U$5ttleSu(sJ(xU}P^U5?PO<@db5g(@tc}DI zi>t3~IRzcG!}OcjRM2HDqOD7tV!Iw5+-)8mVwpII@vguRIp@n@ol>wt`5B;F_79YHFus89OU_+ zlpikp_4Ms_K%y-pR14WpU8m+zbIUZd?xV#~JlCe48OLVVfoIuab;|oc>zA5Ud7{hD zxyHXp3-b;4?MSu^SyxK{0_w(oS~SpeWo2dRrfZN0IRX8;OziQ^8uZ+CatUH^HjSg6 zBAzESWpaDL@&%XIg=I?&2J}gCDY0} z87|&sS`DVttluf#%a#fLh)ZOhYyQ*acKsoVlCp$EYcxGcGH%8-ju}E~=WP$U1txab z_}^59&<%d*{rsLM2-EibOx^d!GPC*dGt?hp*wT1?p0B6irg3c*#^V% z+Z68lwz9jw6mX5t7Sgw7{W)T{sL$fYXf*=&n~N!T15i0_;;zb&87dC95B2RSaiTJH z@cPrwLtm}U#bE|BS>3YF@(0&n3N^FNbyta>{P1JILGQZgY}kx*=K)4- zx*vld`cCn%c5v&T*~~nL2k)vrX_MRN_Y`+C5uS3b6*{z?Cclo#1WbDzQW%+7)#7Be 
zIXFa&P`hY~rx|JxgbWuxbq;)u36Bpj94H<+kAT*rl!u^4c97F=S-mL9DvI3O5R_iqOzRw*M(;fTHt)4T zQ!`5tOvLBpCcYo9(CBsa1$`WDV{r*92Sr9 zIEnEA+WVFwOkVJB^Rw{S{0!d9=+cLB!4Z5n{!Wnnlssm5olnn^Tzb-v@Cg({z$0^zjSt0k%ic%)LPW9E~C^uro0vbsg_LrVJ)7U0HAeLf>gRfxDr6cYbaB#rq1&!?Ey&gC)Pb zEDYK--fW-gOF#BAN$#s;(OS=-7>c^K$uDOnSF7vVHi%sIXpG_rf&v;nGInUc#@bcM zM^c}bd7ikcYbOkObPg&icJt@qhf?0+V^^oj=89KxjPsi+ry5$O>*7ZI7K>x+J3l8vDGS!Cc~8j<=2$Hrwp(Zi5F-E1g*Qp>PrW=@ z+XvhFJibptYEKxmshBx=Ozty;=t?O0JrH(YUcT5tzZcVo4fU1JYWYx$eei3161LI# zoY-#B<0F`_`<2#>+nX<&rp7Cu+4HLNV*5_Lblp1*WjT~n&H+lAekYNqRnl$JXDt)T zN8k&^TQ+Kl#p!AyZ1K2e8$xt0Dhr_`87}MmtDw^s8KlZZ<)%23zE2fLtnv9tXmjfJ zSX8@ne^JCNe=g>ZB#vAbY&Sw`y>_Ya=jHnGi9a5&1?aFyx@>CQ7?u*QXLZ{=Pf4s% z_Pd1nPV+fS0=M8-eWL{Z58lXT8uJs|O`H=hjrEqn&7tDpRDaxLSR(NYl8P%V*?_!(F^kA0}dgnkUbuSn3gOZ3Dgfex~U~1la0#a5D*tBRW+|?cje% zj{(OxlZ+XzS6M8u2yUfYVG9-q6CSC})(zmB{;^O8#5ibGVJ_+^F}jyR zjU`OaZ?C3^q`8eUt=f;GqN2b12%lCIZwA_<1(92$f>+YJ)-uurM`uG~;C_aWP?l&a zlTk~!PZ^w5)G8|AuK?rx&&>{jqstQQxfjI*Vj~tM&~6`{rPLx{=(!PmXkW~?df`&o zPTBhc5=G&HC`?~@nWSqz<}zv}b#s=^Wh28dd5X{E&xwNEG}R^sA3>x#6tZnR!|Qv3 zwbP(Q#*3{cvpYUF|Cz&T4uk{b-Y1U=F#NhCEHV|dCu7Cus7b;*{di$J=KzmH)MB?q zkc3wxZD;!lQ{oO!zq^>*vSfej7te(*)b$?I^oG1u1YY7Pf?UalRz`>rTH3P_>t{y9 z)5GcQ;dn!v0~fd~jd)a+`Ay=Oyd`l(s^}#&l8Nx8H_!U#Lao|e)fiqJXh|tO!C5Tg zuow8*x6C2S1v_3;@Kp@Nsh)>W3hxMJ2*5RCGz zkPPQWuLmw^^#`qS?Q1U^Kv?r=MLIQlfT)A4{t+w^VY}V6uP75c1SX4K4sG*EN2v)k zyR6_DPiZTEL!u`UUE-Qbi2&lc&2Pq9Yk(-W#5+dQHi1xIt%wXZxt)31OQrx#Qb8?M z>^o1m;HaR1=dW{lH1h>15~yP_$-~|K@7+#J;Eilw@-m@UaX?=Zed#!vl&H471ua4sZnRi zbqLHzE_C&DFr(7+Yd4{zxiM7vx`!{C6ev>Jmmw~i?du6BF>Sj2AKve5%(Z}`TX`-# zyKH}2#40u!))chi7$~0rzg+tpIZnoKCE0erCQdP$zMgP#cDg>@%s#~*9Tnvz0UDau z;L%mLI{2)i^7ZFQkUwDdT>Tr%ED2~!aBFFS<{Ck(=JB@f80P&H0gEWFkRK&bxY07+ zCW_v=JFs!J7|vUs;Zdk>XHsPM%m0#?f)3qFz$3S}Su(}#MADDn^Sq80l1!J(Xq2_o zqpS40`^EkSiT@1`cvzNZYoSLw+(XWIgC401`G+ffbiwIpT_gvByQ!%B$b?yG{ulQv zEHda9JZMp^u=J&9@yg?afOrYKW2zx7|5bW2t|k|eu06~SPwvydCb+$-B`0-rvLS2E zronC}5Sb}5)Sle%9M&H#D~uH~%6@x87uKnUs(t1mv?}iwFoIaaVL3%4oAnT!zkRYa 
zTWzB*Q(GDR$ExruhCn7u>xSc875HAEo5lCB!`~b(^Jm=32A?alOGED{baZw87`H!(&$D9UA9zW_5v$;4mFgC z7Nzwr!qR37$XQVX!&FVt#%vG^{CVSbZJqq<5o1J{{^<;ARR`7l@iMxyUUlzAw0k)I zmu51}U6|)KrbYZWLPlz6<>K~dKJJs9?2lTeE!W@@BNQd+>+6h3y~7Y{_`_WL24z32 z9j;mYz~lj#^aRk>+t~an@Z}Qo=~$hf_5KtcFMS$&+V32Maz?yBt|RugfVL z!Q&+*d*11d^pf$+)g^B=u0Ko3&KjUoyfZd?JQ#xh^isU;XV4sWd~AGWtuC8Rg-V`> z(j38rluMGemuUsr?P^Dk?S9#~ufsFZ`8kG*o-4SJ=SZOut%&D3{-a4Ax-?Sz z%o-C;ZKsvJVSgCMu`Z2z!}dRrbqV16R}~35X47qmg;@gk%`_lOx3EX1TB1r%*`M4W zrITrCN;?Oe&WZMXp@0rt2 ze%Z`gjrf~0U);%M?3Y#>8t1s@7mKjJpS|g-_Ottxp>Z$^2ft$V`&s{W%Xx$RQGbS> zgIuj2m;bTAq52->WB_ERvB^h%N%ELOM$E#>7?wf5wX4UrNRGsTl(t1VNCbjoxt)sb zQBms3vQ<{5H>0kZ4Vh{yztBK;t98rK^K6(ej`Pk_6ys|nH7lh|x*JdP|8yvE<98T3 zc<}w)^{e52254&Q-qOeDWYPI3xX#EaRx>uOA^-8;)Lm)ThEX@tg60AX1?6tIU%1s^ zjcSVyzp~Z<-<=BlwQNTr91hJyJObxKbVth3(b1TaB@kY=!#qKWDY<;f)AdL*{&7t) zLTN#3h5Itd_UXOq6xDsLF@e5hcd#W2L&KI%r=#lu^ySe;d8k@vJU7h<;A*FeqdiOH znR;VY$~92C3d26JG8V5v1>7_F5|iGJr0*P6@K0u%bd}I{P%~(QgSEY{zBf^y?xgy= zK6docgox*#vq;T}O8yi-7M6{=r+~oOzyOK{PKDb6UoW7P->n^{ae5pe9W44qfd{g_ zp(rVlJEJA8eUHVuYGuw=RYDdCGD?tj0F7Lu$0ja-jtJP;*b^m5$ZE73HB_M_p>}Pl zReT=}@`3#N+ohtNDMsxrd9_jZbuF1hY}Q(3-F#ZZE6`F{0cLA#PNg?xWQzW^4Ea7A zG)Aazu`;omY~E#o`P4+;Ml6DA^?-70T;{!U;vS z5D#z@N(172AbS$YUu<(qFn7exxoW~F)DU<9&ttdb8+N3vaDV3`vUznL`scUBqg9{ofY*!@AuyJS4%)XWU>M@9bv?ww<5{F&Mu2v;o_UVOCL6{R8-K zUhJyOE{#!N`(55UC6wLQnzWmpBO0!9xM1wRgWW|zLEXui;}{*4X`0fa9_fVHi}x>9 z9MO9ktmGC%yED6qTjx^wk09@Yw3k)=Sf*a+qvj9^{1}bCySaRI{CnH_n^X9QK=?%z z0X?0&h~64<$Y`C@p-23a&7ak~*t z?T&*meq45sn4m__E7y1U7Jk{(oHivsNmv;q0L>)ea)3l6;PDb|V!bc(xur6NaL{lzPV^@t{Z6SCz&eGH;+^V@imH7!CN7UeNw-dv$u{=DzTxaEdeEljk&9pxeho|6h{?6B+t&pv*f{dfIAFirVQ z&y1Rup+3~%qsdGk!_#rp`0IcXs8ZkIajxN{<-~xa!iPT;u(Bs^WDHr4t^6O4%sV(} zt3Xd>WV$_g7>$YAqD3S9a-pA#Ezy0T^>>Fv4sd51@SkFLdZ!IW#+qsmA06?ljTV$Q z94%bT90P5DO56Nn{_bbr)s7){xDdGWi?TyIm1IrZ4+LIcVah~0QUg6jCj=_b5@i9w zo58YaQETzoTl}(EaOc~Bd-IQLlwya9JT|pe@3*3axYywTWhV;fsKOYy;W1Dhf-PXS zJOmdSE7;XS-;8$y!N~Anyvi>g;r!8g>8bjj?HTO`(hGRPX*l)U4+KVv(n 
z-#MbEeHr942K6AJBpWLcDpbVDL?LpIs3eBV7{TOw0(7cmNHTxwHm{@83urRIo6R%Z zJpp%2@G=SOzC$R9){$D~i%&4&z*PYT&Eu=_xlMc^gGb+#w|Y-7tGsJCykR@qHfh<^ zLH1m2UQ%*CR|pm6GkwsDZJ>Hu+0GmLjWyyUp6@xh&6-oZRxmhfNJIH@(1=pl#dH1I zc85&ZrwRQL@H@Y%5l4G=H^kUzPA+ar`q-~I9-HB7LAQK`A^7y?{2hkuzvaf{NtO?- z4h9RjWjquPtTG+Kj|M`N?4>TT#|6yhG$_8btK$1-S^wNS7NkF?Jsv?TI-X7CBhv;r zd|~aOjP#I>46hL-$VcbyhAk>2o9$!0`xXwHxNA3Ci*&ZsT_@!8#J z#qq0*#=H++u=(I0qG-Q8i1p|i7&mNgyt?hi2h;SvJFl?mEpST4jJ|ZxdFfMTivMI5 zk-~rxK<+!(lpxRNuh?PP4p&x$wDj9i6z}Xyj61u!hRgy@b22{=J%5^9AXdByH-h&< z6aip#sbt%EW+=##%fYITiEP%y#y3mkSFPU?nr^C`*HODr)hiQVL8B*`-o!B z8~9;J!a%5FK~0|Bd1VY$PMV>0jaV1p`UM3a5*xkuV`IG5TWj+S%b|<4$SZ$T$2~PE zQ9NAoa@+HMD`A0bRIBn2z91E)_dF%6o>`#Iod5I)b3>2P6aou*n@e9Q7pYcIi6?rF zq_-o6>F3KO;uM6Ly7V8alKGPX?izotg}J$omhM|753YisRB;i8HOrDnX27F>5_@Pb z+NYN_589Zf5spU<1n+ zsy{}ji$T#Hf@mm0&Z7ULplpr~8*V-~iYQgtV7Z3~(leuCPM))bOlsBH?x5uLHcsS* zhGe32BObr&MqQ|RLh9Y=u!;`6pGp6g@BN8SI=`C+dkc^5g>(`bl=8jSmHfY605TbW zqabEu;H3f57Z=Lm=%|N-tl%>!bMzgbFu{T;J^Gvt6c2ZI(Y_of2M738*L}RA7ETr5c5NcVpx4%Y^s3$f zc$>|}luTuT=|(ZnQ7>PnD=~Xe$S6lomWIGj<(h1DnlJ#oO7|*e{zhV@;I`KMNgTP{ z>sdeTJ}#9>qc8zc+okSWql0fflCiuwM>5pmyR<8KHw(ZYF4I!r0XwY7ZFRCL3 z4d%-y;z+n%B<|nz>T8U8{G?G!EsqX9laPjeNks#H&WS?qd{#CkC0blm!~!TF!^!vT z*_T)xV9zALhxppN1Rj=&ehBjHH-GO{ObX~pmoJSC>|iJT=#jI-Ks-M)@5f5@UxF#V zeg*$l-R>4Y0cCrAz)e_Xboo7yv_wjBi6{*9=tn-wGoYYB_hzI#xobV6hW0|%VOOi? zbv?l<9xt#srA~WV5N<9O+Nl+@O#5`TH=LGxqU1W(;Bca;C3_RRQqCeWmtQynd9fYG zjR{K^jwu&2TPgDl1c>;1Fm#j@5M7A6V2rXl3QCccn#KqWC58!c-oqRZJs{0B-m!Wj z>C#%3N(RVkqRz~f#BmR5RxOk1Bv{o_vI^7bWBJ0r68yjez6C{OMR?&k0^Z)UG;{FT zqx>Zu;k_aor1%uh->gOYhkj+M>VrRDGjj3n`~%vfL|;z@dXBN60Jq_3bsP;c{<&e? 
z1cTFX(wd~boZh46@Xs*RO zPm0do$!Ek$(_}7V#5K`*-gfwP(ImorkYW_x%0wmlDqXM*RME0GNr2Gy6-XASzNhd@ z?GJDbjSex4IcZWqg$1^MPzZfemHKJ4^%5>I%E^twpckfid}#>i8;7k&Y0<~G-8_xm zE@HR89b(yCj6kg(RK26~Y0eM9@12a&=&zW63MB)S$}cp}udC0KmiE~pWXo`|VQ;)8 z2B+NO?|ys`oLRBXzHGOM_tYOf{LEZuP8Bvf-#`^G_5l(qZ+>}~5U4NYbSpRM&@}rT zs^IS0&f?`~OS7ByU7HUS0A54@fwJ|X5md<(mL7HBz^Y<^{>WcAoJ5y4Qlya821g{m zYM7IG^`TRk@5b|Kb;G|$3#>7tgdmwe7S}K;{YbT9^fIjMPZ?;yFN>Vg8wGT6tF!$U8n3=$WqC+4rNT-Zu1KL5DcGxaMRm!)NH??pGx(5TMCS%)d^U9hJw(6j!#o zT^|%>q_pdB3vCrFBry-NEH6CxH6yPJ3B;_yfQQ!qNI^ zd*LX>jzDNitNB#K%PP-b@V7oYSdvs?P<;SYb+K4mO_XOMgKMgo&649T5gOvf;VdyV zNWE{ycdq5mERdxafnUh?mLzSJZEf}?*wg)#7dW!?wB5AwWbF%AFk_K5S} zIz{xt-ocb~hB?c_FbBV3Y*D1`JX(NhX_V17WbHs55+Pjpfx$ZcX@V~B_*2jxnLGjV z{0AP=S}(_(`-=t3iQna#HLD?OXcxA7E2GEaUh_QUIeO7ZrjJn1OB)LqkqE*7p!#cJ znO|%!$Sm^2IuY;q_7i6p)RL=B7bcw+BRBt9L$KZiasDf>Tyuq6vuja_QT?Mul+~-L zuy#uPyLJDV5C0aVJeUM%3#G-4Fi-a8&^;{lD*q>HExWu1i7`Qc?09(Rq%p*?x}UiL7sa^k!!WKbGli=e&2OdXtxZpYthXtGNF7Mu(2+ zz+MhI7khD!?Hx`=0V2vAb7lDKJOU^{8{eJMo4_r`>18=U3y$4=3lL}i-@STv^$Zwt z7M|4)!IUW&%zg1m;We)u{igaX8nTjVJufmsn*AV6%rg-hXISxe<~b-rm_)|Wgs^@s z@LzYMWSV_RoS(!|${0k;I7wzD271cvVt&gXb0Kux|3XD!zyKo(5uA7WA}8;S?Gp|9 zQ--3XI}@-SCBenU5?AqjUis_S;@Oyd)k|PoA7vy(HvHd(`&bp;`%4y*eVWb$JdGnF z!CLTuUvO%7ALe-kr>@P9DR^Kb!E60kj_G(>S>75;oM8-m9Vl45_1Uot{?R1r_^m*s z3xy-96d^pxEi&XuI`!fFI=Hw>#s~(!YFXrfi>mJ+7J(>L<+W1}29-J^I`DwikM3-) z0O6P;^z_kNuKIO6P8`?$VQKM|ot?uQrY_@De_|L>bXm?$H+ASfVLp+lSQUIw3smge zC$UDrCsvXnABx~sX{W~=ZIJP}qcs2TMmB(kfRM_ej0w0)&rw&QT%`@x8p}1Cm77dhr0I9BI9JoYLT@?!%y30L)7pUB%8gjj^2H9V7v9K z<|7VQ(+8g(m`8g*7VMk)cwZin>b3a>D`b(#R!_jduafwFX{>Ih&rFBsM0;!k8qz@D zk{>^04ZgM@EmT*#6cFlNw7~`epo8OowC4#Px|L_GMch3WB^u^H>(c;90b-%nb z0}S7kqz1wG#p=(l-`Bmian>x%p)a3R-19nPwp{A6hBnwOc)kbdTei*JOfCreikSph z2l~=j2&FMk1v_Evs_S9FST`YIA08TplM-Ox(t-h1$y?HQ#Jbbenqr)K-+C%2O; zVuHLpb*EV1q93f;f*KBL5vuAe#eY6W1R%g~g=r}2b}e-me6q$kVO?r;UXP)UMvr@P zl?2Ln%tj2Mg9IL1{T?^%n)yQo=4er^b4}ub$dtj4C>G&*5}s%Bc41chO=^+VMZ=E5 
zeA0sMz((oyHmd6eLN-DAI0icvK|4lg7)B0srdTx`QFt1VY9b~GXJ>)Y-0b=^BUj)8sm2! z{7W`GLS9m+SXf4rIaEkIU`r!|1B1EV{995nrZ0aL&si*0RT&8!9FYfXpFTW!mN!@z zu^mqg^uop1R*xH~Qa^!!tATqotaq`m?M&KfiuGCAiQhAqi0J`u4Z8V9BLLhDkjQB8(4TmsO?$so#oEzwj7xVb!^$BVi{rKj#;K3J zHtB8+dtU1RIJ-T7K88V|zUP0u93xa7dc#CR<0&j4pBD?2on#vKJqDNXBRCwlVgQi4t#PobQC|2U_=i> zBj8@L6aY^(W-gy!Y|7UMdWIug8 z4XlxQi01@+KeCY|g5+kXpdb}4n@dg$UPsWtZ75oke;f-8v?l#21SQI3ap#^JodQgT zU=~W&Uzp0?>}lBmEbh?&Zmt4KuF>I!x?vYxKASLzTPL=@G$GMi$Dr9Ln1GcOah&*9P zV_-^M9tn;>O;TYsZwNe&oZwk*8XqiD9?LC8fOL7Qj&Rbli{ z?zOB70L1@qVatRBcPdmTngB5#neme=#s3@*1WsTQWR>#wHD_^~oboZZ!yXfy?spE} z0xsBkZQhhzA8GoPOt~a%fv^4}0HA~PBMrjzZ&Kz4m6Gk8c1ehdx*=3{E)Oe+DiPNg zi<^ah_6wf5T&cz6h>~U7tcE+>Uc`l?zQsNg&$~ofPp}PiuE$OAh7=- zQjE<=><$|H0SkVnKM>6PwLzmIgY{GyF)Au5{}^D&8UyuOT+5Z${6Kv80W}VN=euqu z`2ASEn0iG14|DprqNH%Uy#jH^(k=Qw%qJ{`0UI>guy*ymz0HCz9yYx|OnNfJSSgVe zO_vR%t38|wZ*m!0FSl6)E9f$p2>4B%G;M8@jDZRat8T}c^IetlL3wMoq>Q@Q z$TEK5NNb3a1DcYf&KZH%<9aS36&SwpvF5{A1#`QXlU@A5#n(JbCi0mQ7|tdp&6erc zg@mn3Tu(1lSP!aT&N9dH9%l%Yr(yl*N~n5?;ClV+4QBr*TAK-3iR&`Wx~iApi8cFB zT%rM(7l2V{9kHl29oRVC{z2)H$j3;*%bWfo_+y6n4eKCi$QC%f&bYU1L?vO4?91e6 zqA;4+|6Yb(IuJE)EYuw--IMyT54`aIIo&COFQDHBd0sBH7;paIb`?_noJ-`VQ5U{% zLh15avGSY*v=d1&1%4MV7|9b7k&cF9tkikQ%PkomidI}d9{z)SlhbWLGwE`PeMWDd9 zZrZccC*}+bU8#s!%fU+cp{BdjgVFT1Ut(F|d*@!vqOe2oA9u)`31CRKgN=F#IwOPA zN4?~OHdz685iH%rEp$@%(c2LS6njXZE+FdvvOX#Pi>N5AXW#?1%)fP=fr5Xu3^*kR zt^ltMYk3rm7vb9^(M7APpR3S+>vGIzG2wP};-@Ga6{Uq&Wn5cRi{>d!1_B@*X?6UW zOcil{0r7aXySh)k^gw6^+qJvwB_DJ!nQ$H#y}p-8Y}Ur}cnj;rxC==XfeQNi&3~*5 z+eWm_v4%c0dDipearIsH{6e+zGUVN!pQkonBBs5cBdbUlkmdnu7VBrDjx=V+sUyS$ z%rzPz@*LZ?g>AW6XO=<*GYFZK@C49;VqUkUZKwA}8s8i`6u6x1Mkze{R36K~VQ$l- zUg~sPYd3)a4mO5XJ${D&+nh{NUTemaN5xXI*(U9OVub;8&!51K4*1bQT;tMuiH?Tv zIysxa&(bp?%ML0M*~(qBan)7+Qw5AQS(V%$7PQ%*qzUjw{R-Z#OHQI&5d2JK!vy93 zQ37Ur8^X0WG>`oI^rxE5ORnH$xAde){7-2K-)a+ZSO6ol=xrqO55~EXJv#|ygh4tB z516iv;b_dwTq=rsL^=ioFgC(qrn_ulbSYv2GuAw_gsu-*@(Rvbod4`vqagpS0&|H* 
zUP`yGku=OorWAIv9qRDuE$*+t2%RUPQEzO?(C|5Ov3lfZgYz$L&ke573MTMBbiD0qbC&EEP$gfE9{v4W{_bX zCFb;pyEqXud=Gi@UOvy4-(|0)yl8U5-0`&k;@ zf&v$#;Jfneq1{->L9g!)V}vc&*}Z*X9Bdi=Qp-uMo>c>+hrsN#0t39T* z%9b7bS6zHLK6B5p(%MxMvUk8lkN{h8?WKF#1dG}~WoX&28VGOP@^BT8p`eLkK7*D& zhg6%y;J3#7zoxkqZD)7)MoV9yu1Nt~e*GDiq5dBV@t*^o9qHd!U|s``v#qxW2KwlS zj`|pWuD|<6L1c%L0iA`!!h)fv$c0`@40%Y^>ttU_chXn88VU zYYM-{0?AJ;mLuX)Eovq2oAnGDa1@{)p)v(oJ zK-ZSn3Upxo;O34Z%=g-cY5>?kC-bV|fVT_iU9n^iykOkN^DwWt#&XGf z*h>6P$EImF@(b?&+;qvlyB-vbpZ5w3V}1-axA#ybyaO zaGb_%q23-7qIJ*(vlsW+zS}?Qo+rQO0)K$z54C-VK$(;kit`fN8@*}9aIyvv!>E~T zI8D`nk8ZoQSwpI_{m(Zk24VBohm>PCwe!tji5W_$So15g1n$Q22AY5bpYhRY>#*>3 zDkk~yt%>UMu_|?6AIfKf$GITb1)k7=H=JMV^|!-}C09XMVM=oKez+&~)%7A!+1!sW zQw|m`y>h!N%t<5U2i1(3=M$zK1;Yw{a%RehefwgmZkWEzJKUXRW!!i zrwaMGdS_4a`q?;j+)DJ7$$ECD3wQy>x9jx~`oA}Q9vU!QJoa)2$qb1vL$P|`$@hG{ z2pyqkz|YjngjBX8g$-Xuwj}Pv6k`xGZDYf#dE^z=cN!<3fM7P4Q?=9R-q<* z>P3-bR(ZnDLi^+s$fYYtIN)&DHmo*+&Y$VgM*fMc&1RX}>GD0%H<`M4FIbv6Mh^9f zk-lm&INPx*I7btiyPzgO34+{J)NjM^J#NXL4R#JZV)W94|4!)GV+ZpLKHI&O{T^Xg z8Gh>FE+EUJdaw6sR0-pjf=J`+EqWUwWSlG|B^hK2cj z+|mHmdSdeENd1=-r~=e}_{f^bpBXzh%q86wMJ^7PKH}Dm7RLn7&-@km;(HP(8uvtmq1TSLe}O`$ouNhuYK_0LceXQ$tjyO$}vUdYhNcLlR%$@NQH z^G(o?-Hwfj$Rb|J18!`YrZXy?3mQdM#6EK;HA;x$*7wCXJpqofj$oT+q_XD}T}iMtfxOQ^vh( zH0xrz*q|6>53z$w;2jL$M{2+7CS02W>KuRJjce%terp40W*E^LFOU_82H%c801S}M zwW{^_XmX6M8%i$Rn%kx_WM-Ey8wXmg&}NjpWsrdzt=V-12_nH6aS%mR`l5&h0-(Te z5JzGf7AToW61%-j7Ak-^+je6~xxv8k%9)gSdFtnrm8;@kfA!7pOdq_|g#iPtI*T!+ zDW!!6@&a@TmM7OklvEmSh54@%7TJ8}@*%eien$|?Xwgp~)R`hsfC~yN?Ys{dE3?IB zQ*Hvp_$e@5RVW~A9BNrzE^PH?g-tA}P$tXJn1j?}h{;f!4iA&bR7fs!keR9MC~zSb zUNhETc{E1n*meMgLz>G)2$`Wt3(D(Q}lGePNQ-{I3d+c+NWZOQ>G*C%|)!o_DB3yxyS_ z3VNiOX|&1(vJuXfaj#WAn!IxgC*8gPCgnc8Ti9;Xf4auLUSW(#&8Da2`}bYgjV_zG z70aFoVbr=U$2gVx?IN9l$1yZw!uj&4IF`zk;yVeUG3;_^`BI5}XUn2Ab)j#9R|`z7 zMq>8x2xjn>x_L!qsV}(P_qYL#4jb01H3!k?|G2TlTsZLPIJ73Ln#5k~MSC{uvkdD;6h7)SCo3ADxh zsm31HJ1zI5YqoY_yOGa@dl7(ZS^fTl!~Nup4NR#Gl8Dw{S^1^W=yTYlb(}syCJuN0 
zIb@0C&g=N3yreT*=e(uZ8h5D(;nM`2|3pbH=0)?rUH~*&N1I7LNp`2zjqIr3H1^NM zcwz}dd~&4Y$#0_f4#WM`cbKW;6eD4*9V2TL9YpVLASMwk-$;1!Zrcd-p zwWt&ZAD@Jdg4s>}U=YgcC847#29U-E=9?~+Dc4S?Ed+Y!T zn`>D{o5c5`;!SJaX%o`%|rjPjpt0d2w;{HYSs@E6NY zB$x;wyu&Kt6+1bf10pX#O;{s&Jsn`U$SA8p2~9Q8Y*EJp#|KEswT`PPVY9T>t z>&nNC!*utTvLe<073Nv(a6(`#4@kcr{<$uiM|ew54Llbcg$(xpANrRMF67%nF5>yU zhuVuXV}{5e&x;zBy&{Z&_Nb4)Zm3&WLo=u<#=iI(I$NnB9N@+YQA54HIvVPLY>n3B z583wtdUU3F_YK7mLif6L^qY&zmK?Ux!c@~NTM6)KfstS4I#*)7rYXGZsOz&XR^@a$ zl=Qu%)a!E#46n1pdY$Cf&;#Vqb9TDJ*J-!TkF5l|eb$;XNOgtPkk_tkg{HyjTqK(a z01`)rfiDJbe&+M97tP38qzeL`5@Ef`RI5k-<4At)hgIhm!U)&->+ zl^<&q88({W{!!Pu-E;Jl5mK1l>?0nAg+$W4?5joocLtxJ`%8|-1bKQZrue(Seet?P z4JwtzLR$y7|A*o7RyBd&*qAC~VG4mrf*`A+%AhA*Eb*~|2p0-F(Qkc3v+5PP`F*KM z%vsT6G-skV4kB#?><&U8_i8#VnOoO2f73CkHh2sMIAlxzYKmj`BT&O1!pFi|l_u#1 z6C@hnT?LrKSHmh7NRpYvi+T?6FvP#yxMHJ<06saa8i@=*G1;T}gZIDWW64c9 zhhpeUs`mS$j42OqNg5@fs`WDZbCmX>A-1Xb?*Pq+QOPYgu6{13%fq%O`=WP%c2SYM_zEF>6Tihzq#tZ8=8F!6eybXTfIRI%=&t-aRT`*-Fm|JlGAjDpZxT66~gAm;W-%*BBu6f8yGBA3-7F9ritzPHrmIC`CKq`o5iJ2j*>MZ_@skEOsxC9)6sx8^MtYkqq@u}2M&LmwRHl@|`I-CSJ&W!m zK|ra^bfo2Vs#q2t@#AP@KC^}PE&kH&xlSA6(V;e)&dKAnK_@vlKDXNY>lIJnTtg(; zn61$J*&pK!hx+9?EWrd_1rthn*KdpndmVi|+L^PcqT=5E-HkBl-X8o%qn1axOWsXs za-ukwvVNM(!cwRRVcWm+B(Q;R*!*Fi;b%XEtMtNWWj0X6w@p`h>?tR1LqI>8eTT@k z>QnP>`ThHmbQ!D?5{=})*L=ZPk_PPqK}5(N6{r56MfHpoEfT00sk480cfJ2(m6s8C z4X90(S(~wzaDHKbA~r-?`#>e-ulN+}=?*Po9+&chPB7Kq*=8+qmWlx=L=E7e65nNL zB{x){U^7aZZZQJaXD?_4_m|J6c1$w}4Xe_Vg(ZpR;!Ty#oWG0KkaE@HqLZ6{@p-aP z_aRu8OcaFI`n-dJsMKooW$NeNPT2KGE*Y;kCW8VgKqX_TW93!NAmUq&B=e4>b2iY_ zM>L%KfnlYtq9B6|m7RjQ_nd|Y->VXU3v~q2a6b_&FWiW4?}HU4p;L`|A4$g{ZjW4z z&<`au0ku!RIiT~(Fxbg;p7adQ zYZ|o7z^7ciSNR1rkYBi<9{9S+_Uy11dB0JA5U8?P$wWSWV!H^DH^*Wk~OM`<&?3Uo)Y zgr?$u-SI4~3=%Asyow*+V>tZSOzg`R?;GIn&|gCR3p~X%!=;h{;3Af2TSjArJXq5e zzW<@w-=|EN|9d$mH7}ThZjXnCW%O75PZR}cUoU!C>Xg}5#7wQDTZcecr4`RDMU4{; zkMJdgM<~BF*PiagQ?GXdF-G40?n~-ra0PJR+d_A+o#jl9mQ(XrMtn039HhC0rIB*q z3uuS6KeyOekQVxOxF*sBf=LBj8N7U=Z+|8%?mJ|I)iK0V^<|oGNRo3K(|P>x#ALZE 
zgV@tS?a5NO9+x*oF{e{xc6*gI^U@4%pk%R&n%4pKGuQLw?3)=xk6u=eqW(4={hOVT}%ClSI$BzR`S$nT^}^dPWsEh z*O*Pg);z9W(b4^`u`jud^N(*hD!#j?oV;58&z6|M0w{iC5jM!8y&t2DTKtrk1PuN> zhJY0?B>S>v$cRc`usln=W6DO{JNAr_Ks)7`H>tBW%Y%eh{fZVXF_C+5#R9K*;EXq_ zJBkvu9|`G>j*bnR+A3c)(TO1PY&cmESJ#)@Ix!i7dirxcOgv4F;dH-o6}r;3o9WzV~R! z17okBw3&R>N7p4v$>eT(9Ua}Cxmo-TsG@NRLxOtw%1Vr5Tz|~iNoa?^1!EV3LL5kF zEOaL;abG4%d}J?!YgJ;8oLYWT8`2M zOEK79HeN8xcna~yeEXRa8-v^Cb`F}GoGGUiJDvC_I~PE@Mwu2b-rlD5zgq6N#EKmq zIwPgn_=hi677yO+phaa&?B&7&RB5p)TT0X26 zeq%9zh_ce!WWH^pupwbMSu>b!TxcjG1t&qQBs#2Ybs`+1s~WQO7(4vZo!8{jL6kcL zY`aSXYijb^urFNj!PX*Pfvi7UGd{rxb>jlc+-14*$g9D0-4(cff~_QjnJu!WGnW@` z8%3kKpEx~9S}$l+XiC#msozi;3qdRZ(ljS1Tt0CgJ~#lIA_hB~Vi*(uZ16R1*N9?Q zjpZbCqGPw5<-n(5(Gcj)o>qBM!tW|WOxJU(^mXgCAe>w+BajRxoc#4ozL^B$jGryK z*JJY*P=_Ds{vLU}fVF-T7qU#^YEJ^;$$@7%>JmFC_*37=La2Mt;t~WWN@xKqBrZp+|EV+?xf-sRmj)G1n4`xF~vHwK+@g zMGCY3k}->>Juv`sd~zknv4W)W2h@8VG3Ycsa}cDeu){9f2CxwF)dY((duP9h`t9O!g?@YDOK-&%Fs2E zLM2FW%Op!TEddVh-~bhiP+Em(_#mhXaLa zc{9=4A%C(QW)U2cgoTgE2B@!VNxX~1hyF7Ysk~2Z5eVQYK|(<`jHBFzyhOGP{zq`t zQ?&ZcDf%LobyoAIGqe$iz zMCD`I_zsIwB`ffNB*9655PVMg;$0U2M0e}01S~-))5jCsrgM$5|CQA>qJc)jVj;Np zkFPWaFbynq`KcQU<(`5!@+UO?0oeEPlIUchl74z3CP*VsOk|7jX#r()d!^G-(Bte5 z4n-5!xhLuyR;q$D3KMrxd&Rh&-Ksb%Wzt!o-COZ9&TokY2Z33k2(QQ z?u`bNv121eXY|=t55~rOziAXi`bB81<*Mq24#{3o(4P(Q=7||8u)4ZBwK|UjiyYVn z?G>d4QRx`JU*a&U$+Bawll@LBlHCp*T@b|ZA&o_U|5Dog_{Wc-A3x^*2hFdh2A=48 zxjaZsnRZpL#MzC_b-i4( zk5{x!pDJfm!ysKP?vI`P46>B)c#dx{$0oT>O`Mp0rF4lVk$+nNn%VgY5nf1PNZGqB zO@uDHz;YIu>40|!i45E@a8z_XYd4-P=IwjcnN8Uv!;luumtAMV@5A@c{(PL`dIQIj=bCoB11TDnvlyKFuMvM*dX^N6&4df%`&?td|#9<4T{e}6hGk4v&jS8082 z%d<3C!Q;d%mjz^+WIwf&D){9Ej#{u{+UCFIFz+wR|f644Sgcl#jw*>KC3)X$YG zO05_5<4%(Iu!HU{zVnEd8kZdQxIwTSKKthKTfW((bL~J)Ct~b*`qYhg^HSaF6s_CH z>W(R4a**wfrhkwC<;Nzie<)@dE(7MH5fLA+CAeySDETpNw5(rULl*o1mni@I(a}wx z1eo;DQj6`{8HJ}UI;~{nfi1$OMb}=exv*Z6?Ar|qQBtx87~j0VWyZF`z7c#Qz?2qy zsY?9q8#XbKl?40-(IWy3F=cZCRXd8xO&bCXd$AkFrKdvGUUDH>9Gqp0O!*vposNP? 
z)Ok>i{d~}qk3pnOo{JUuTBQB%i(TA97aanjEnjU6Gn>Z8WX7YB3McggzEC;@ygRxk z_SNWj^N3j45TOKOPMPY|3%#4zzQS=wLz~&LcTNxaUQ|sR;8*LKM7bP zCk0%?@S^oo?pL4Q!gRdzfKH3rY$cqfkm?%REzxV}xPE^67-_rP4^vb01%mBWsO1a-71N-o@mCYWf$!V}XXg1u{nVlNo27P#ZxAJ|s)r+&cD36Z_o{eO zytagD&+1HB;0>rQg0h`9(#}_x-fVF#!I|(9}iMKKq61tCq|7LkF|KrJ{{5 zTxyZa3-pM{pv66f1bS;njld;`-ppiUC}FQkm6-G`Hw8+@<kihk*GI1YDh+=eW73t zuJZKL4YcfcQ9CxLRc3V6x~`$?^>(}ld2)RM5=C`2(d(6E_-p6$Wvpp);=BlyNKsT# zcBrfTwOko2ibvXWS}!u~dFiQQwEa_GMZMx>C(WM7ENeJ>`&{z9W)|X|jBUp@+;oD+ z*LkhAY`qWuIU(j!@a#j*K<}#Ky5;ojG6d0?yBvRBi^Fv|y83fw@q{U2n>pO|+3;<>HA)EzvFqLV+Ui2$mxWXghhE~jHt)YSFic& z(7bll!+!o4)g?*5@vAEedw|zgmpPNV+wc;pI?~;m9HfNnC(3in2$Dnp_=in`nvj5q zQk2?sH|8fDlHoOq3M1jiBi)&?KZyPw-!_py`}|VZioGLqDNKuEGRJP4|GbEF_GLa2Pe%%%`EDdTZyM&0g=fIoX)5c|t{-NG1k zU3wy1duo}o*5&JY0x4h0FR6BvyC|Vg$%I2p-i_y>(9qdW3De``96v*eaNDD z5*_N9^L^LVh@&J}_B2IvLk1HPy<>JAO#uBRU-6i!vNnD4X+=*6!&YuiyDbd_{nmjB zt#p&>Ey1&wGs>OO>k0Sc5*X49xa*a?z0X8UW#>1m?G}-B2+K@JwlANW|GHDh z7Ki=DYFm{Yo3Bn*|Aqe0VZz7V&4<@Rt<1S$>55QF3h%m}D%gqZ#(mFj1P&t}0kf8O zciiP;l6I%L+|nA;?mqTi_Su}HoYJ%%vU;Ux+gXvU7Dxl*V9M^1^?{c8M2WxE2qhxc zmo6-D`s}T|$DDO{)+_pCrozL+ri6mvgDv-;2O8Wxs=_ml>N;*YNe((plSYOZp1Dsm zl(uk1a{`swTwSPBKw+B%6_5?i^qR9#ri6QzELHCw8%>eeRlx&~7}p$~04tBBT#xZ{ ziC{{4nqEe=d!N!p2j2R%K|dcd$xBpBBC^o6Ddtt5cSot*w?24BNq#;2&dH(GtcHo; zVE>?6gTINPU_(1BZd2iH)H1f4yh+b7S1vYo0&9Zz`}LoL@5*?Mj?9*sV-)evS1R+3 z_j*aRcoO+gulP`9?aY@A{khi0M?a~p17D^>?_5NAv!JXVv&F^JotHy>E?CrbmGZws ze2)WEy^mo9I+apR*xDPDT!+4{Y}0x?vo-$1T)OAAGsZsC10AXDUFPl8F2>`?HkVUH z??LZzvvq~AnW4N-T4WrP*%8;sn~?H`+POSuCX@YXp8Dk_S-ZJU4-ZMhUCBNx88e@V zHiusm7kcM$Cayvc4$awR-%06OYKo_cnh>ao?Rav(&+G~>nw8MA#9=74N`iB?j}+oC z$|5Z%uN2N=l@pz_kDKodAHzzsvtv5!4icP)eys?QF%8r5Z+CGL$qW|3>$EV!BF%dq zPcEx%^PE=pFBZsARFdgSzUWurS9h^6VNovUD0%%$eP})@NNUg1w=cGAG%k_2a?^Xz z=!^D_UiU-S86C=h-P{jb#*1M%rnTcyS;9?4Y``bRyGy zNS=UpP3wzKNc;s2SvZL$T*ok@99pNGz%@HGQh?kns&`J1XNKDHZ4KijvsfR_Ou9DB zUH3VgZwQO$Fcu=z0sno?>soW+d4wa z24#r+H&SYAdD9nSbtQ!gC9)>j6`F8_FQgP%SR8wFau2)OQE}I`HXnNuA`C1qI!^&E 
zpCSzwzDF?m#EvD~R$JHq%tWjD4wbtpLffx+Ixy6hT%API?hJ=_Q1mXi!;{O6rd#g7 z4@su$E`q5S)<`cb;Wir|qZf|0#ti#?&K>#I@z(va4;i=+_Dkm0i&Ne8OIBGjx~IF@h)RhpbQuEK1fT2_)Tf5yzId5LOUWPi?~OiG zXlFOiey9g2Sl$1|8Nv@{Eu&yZJTQIy8MV2Y7{LZ+eu@>v3)e6+b39DCw&$PW&6RTS zYdSx4GudlP;>+chdevsOpj}*rwKd_*0$Z~dGPZ6Q{_?|P$9`*}g_R3H;~JYmEk8Re86C@oR<)HoRx5_u*|k4*ybBn&Whspzf$y zU8L3~qEIqsSMRGoTK)uH__N`z0=Lf4lQ`+wj*(H>!4vmW&&KE z@1m6J;;{R}d6K&eCr_JuS{^06eZRwgf6e%U^(=VV?WP^z5P$6hAz@5@_Xn%N;jE{I*hKY33bgfCo;hM?2-{zN8FC6NWQONZ44e8su0A_8&Mml@9dzxra_{h$ z96H&CTOacoGdg$mOqROwesOYD()MkeMJ0sRSyma75;V*_>u6+z&c73~`ifpcdTFdy zsC-?08O5*Mw=yVIWnJ+lP%p#QVnMfjvF}u`oGU{Nf?bs0MY3QydaNJkDKoA#K8UnE zc4abr3-cuB!b)gAA9f!8nF}vy^68J*`7YPI)j(oVDxG|HeU!sCr>G!og0XS?hlPcO zf%P%vrx({!$xpv2p&L_-I(PL)YsQ8!8hwu)!x<7kcdeFg-i*Rt#ho^E!yw`OucR4X zm9#59Cs^jYq?(hxJUTePL_-!+p66@%?FA{`vofQ%Ui+&Ya$Jg?#G9Xe$~R2;&K5kw z4D5nj?yV2#T=5R|?t{f~kuF552lU|IEC6^tW&LOItkGmuu0?rS_Rkd<%~0lVTZd+s zZa>Gj_J$d~f8^DZ+~z3oE5~~Q!JBM0a`8C@rKu*wNw5Y{^QKSsoVMgjQ4e#ms;g|+ zUO6r@eEnQnG5#jGd-qV7w>DSwrJ`^@6ECMpJ44%PM~$9?-Dz5A7_x1f9_q~$uaC0o z8a;4gCg{_1I{Y@YJW@3TU}5I2lJ+g5Fuz!3HhI^oTL)!TFk)hK+*`zG%kdx06+3`# z1twgZ7174sY~Oj9+o;w({{xxV{O{s@6mO%cceFfyH>)x|ro|N_w*UEqXsBPb@)lq{ zv@j*U=BP2-{PE5e&t+oMf{|-uBQncfCW$2WddD8 z>*6NsI>DnHXG(c^gTT5&_cgcx(wik=ad?gMHo5R&k9e)cgkx2sWtA%S#lm+KXdXI! 
z0?)fEnMf+uJXTVMd@glgrF5@xuNy7qahEn9scG9hhG$S+3_Fh>RG)mn5#}H?<2WM32 z)v#@rYgN}NWgZcMcB_UK7m})%0z?XyU4ACEcOEvb*qig<6_*m7(SW8*U<#A~!=V3Z zFX+((QuWSeM#?G$IztKCPgdiy4Y{prBxXJL*D}O~6b7<`PL3!)VWX4KPt?!9=uXw% znv_WKUGPe8`P>=$taW!|>Fgqj$&mf-uU>(-^x_~Xwlbd=`<)bt|^52FWp$&ph<2r{m%3>nqJ#@p3U#u@(e-b!tf zaS!`3`rWSe*fN(HQ%$WU)~27ze5d<|w+@HW zUYLs9q8uqNs!@OSD%!p`n8LD$`0BaJ-!?j969T8Br#HKis9)gmKGOQDeD8H*V z%9|&aw%pYhyCey}JeF2O6R~z%lnm_5T)vz!6H^p+P3!SRF?ClBlQUIftDK*}wah(R zh|>!)7|9tYjI|l2jkS+|M%!AoD=dA&G=VY*Nb^h{;p{|44=BR#Hg4F}{w%mBvy7S~ z>dx%$oVVihz8HHwnIqO{oOlDPDT;hqzM|6J3k5UoQJ|N-9v~Yc6%uDcO^}(p%`{qk zAS}?yD!?5aBvy@Zag1ybJU$rbE^`>w(yXu{ZtYuj-&9=p+%h6SOlD>dTMi) z<{h!cA}7r$x-tWxWg8xAJB3Cg1M`m*1dPofB8X)7TB`>8>(bgMr^3H_ZjM=IqxE#3 zxIE}hN!Y61heH>OPW9tKpByHy>*jLvs#Q;~;(F^B{~NgX97H603B_SKg>Ej(A>i&_ z{GYpf)!S*QC4^ud%xaIS?Kge(xpnZi&fymd&LBJ=^I$l9z*qm|1#6v&Y!@!Fu`=+H zqlBFL`Q{ud%k@AJB0O5GAy4ZXC z+cL}2^uQU}5O0~98p{%fP;AUz_SjLUzr?#dtJBV&Y_sTNG^gj)8J-F2ET~8pTqJua zc;?<++J5y{a%fyiT(=xMW0gjo^<$Q9tNN(BDCaeviIID+k-4wz+>m0oVA8&ISXOpG zJaWVT$*luA9mwNYEGC|>D*Smm3Dt~*`N5vI4s@(&62z~er&4~hPfbx_eDWkmP6{hE zWsT2Y)ge|VX*`B)5=)ph5q25qx`=GQZp!Y!Biuhfv?xC7)mEz`os;kImonq3P# zrDmL!79==0vd)mbma3PxuoBPcA1G3k z?rE z*SyjPWUBZ!E&|RCBVOm}`F!Svy4Q8Fviof=rcfcrmv#jhZldGtX|WSg-aFL=pu3eP zcUxd#@IBF`k7vHr!56yN+oc}L%gBPPRFEW(5&>m6I>#Bbb&~UmHjTbYZjcM-ekM|< zK{wPee!q)hS5Dy;a@o|28NZ8B0wv7|ZHsb_NOsqPi$u3(83YRBc3}LjUL2*xCAZOT zVMP0}_4negWmfZnKiVT^|FRZJICzpTU$VhHpRUa}0SfN0=8j|hAOI7CJaM4C?9TWp zg)epcenbD*&0*$vUIhEAHihY@&o!ryvIznn-SB1dIZ@z$#K}|GoIPWtx_z{AG+VY8 z9>!zFbCG(d*1Shvo6rSvuz~Jt;dIudc)xeZ92HhGZ(Qq|<}AuOi<)!9iOXkOO&Ww5 zTIiq#%rlK2YID94;cCwI7p_Kp$06+A@g6JIQ%8J&w9O#YilZlsz3F(vQdWsS`#5U( zn%4!ulp*cTchW3LvCS?mC+y2!l{Yb3*Co3eoSxD*G5nB%8ClLPQAx`bN8`LOB|Z_S zq(y&u>k-{`e6Wv#kgzW=KY!imx~+zSLhnG#&mo2pa2)SF+#MYpKuyvleCIK1*K+6I z{sDJ8(Jx5Zc6YbYr+)F+tvh8Sy261{W?D!(OpFLa&wCzg)%llEH+k_TDk_7Ws-9|p z>oVzLarZQgK=o5D9xMKZgKRFj2Mw`qj)F8Y^eSg-4b zo(J)s*|!!u%r>_hUDWH>ZjmBiml5hJJv2=$Q3Xkgjp4P^!+QOUnfqfh?AeO^>ROxU 
z@n%}{QJ`<5R2$tI?ph3Y99{^Y5ja=yfI9eX45xiD^vuRoGpKMANnQ`7Kj~CdfCb&; zJC0t13edJb0_Axy_>)cJTzXqqRHg{+rV7C7w9&hqJD>yY+y~xqPQ5<+6n>|d^2_Z)uuE$+T}~G48k4-#%k?ORAwbBgZ8im+j`_{ z%e6GitdA!A6(@(*gYp*w$AZ$X6&z5t6}dxmN|^vtI)q_AYfSzJVbh@z2x)TrbJrBs zF51cE2Mxo*Q|dR=Nx3@^&F}D~dkgq1R_v>U1#T$g3+UUa-CWf_ah)`9T=&ahpzOsv z-tr7tEYDlqog_kZ5{)8S^LW9RN)`Dl*rI*~gjWz~Htx-i53Q$lY7(qt*ViQ`d3$&c zGh9vr)vFC9DO$&|HToyL473r&>M`9GIZ6&e=)RBNAke9YxfysW+9cGt5a}YBNfwTu^jVZIL=AI0-rRXZT*YkGiUyFXE`|9pI#urWeN-e6l97 zremxd(#XFU&)A7GXU{>nf6oBg*CwZxb=9mXeXVUzZrP8++3?Lp2}180iSTu681f_; z!hBg%5VG@0uIXI!-o}M(_u=Bar<{HLf~{!iO?$JPw?gNq#``cwm7)?}lcU=XLxZOt zi+C=Sp9sg^LNQDD@77JdZQa6Jr$&8xZ#e>jIxP# zr@2xD)$;FTixpuSE;H_=+9I28pY~!^M{6qSnsF&%dnMU_`%&euK>hr77?;K#6s{Co zST)^JWjQ&IwJ>$Rv-QJHc)yY`n|WMMHq=+2wf^v-_9;aVOr>i2`)uGTT@wTEGgaOY zoV4%XuCOym8oACmP;j2#bRxC{gRpQe`!EihUiOb6eGy)1Np=guRo0ECKAAeM>s;=A zQv0heGxG3!Tql6zp^Rgr)wyD*KKj{tjr;v0@h6)j)#E8(vJ3Rwk+*wir}BoILZ>XK zdx`VJ8UyY<`NVU&$sATSJ5qyG4^i)!4wkL8-@hvzY1}?HV>C1)6lOh)i>`P7qxjFK z51C%2>iPKiEa&AhB60BaSSVe}qyTV_4M@BE?4%g)J@I-enQat7HmCf3e}(!LmNoqd z^=ur&+Qfev&MN-LU}Ypsx;U;A6>objwA|vv^A=VvzFl69`uQQQ;XDbn?&Oafad-vd zf?i5W5B1fksJGlE%V6;OgJ0>U9Y9oN-puV#Y;{qCs=Xi1b5eu0t(yqD`!A3N6LH(8 zwry}>8liN3zNMCG30Je)7hxZr!z>auF8?d)DW zNo%u0d%0z{FI&UenkoCYl$ndFGW#oA0tjVOTb^uZEtnWdwF4 zPe7X)sY@9%?yr5ZmZS$UJK3IR{~t{hZGCIOz>pl{v@-N^ai{@e&ojO2CC|9#9{wPJ z9Pj9&9pWGky(vg=JK06tveteevTT)(^r5E*5c(DXd9St-ab0UoL*uGfx=x_0*vM-x zrBIDVjD3{s-kHvDLvZa>W9pQ$b}sFQQMm#s|Lzg2N>NeZlIltki?gl_o|Ig+`n2jz%64T`tW6JFL}_ZeMxWH!DQ?a%+E;PsP#0QbgXWZY4y7zkXVF11o`N^ zK-bmXthK!95<4V(^Wky&B&{=>d909#Fv0g&@a>HXYnWWu4`CB>}m*)HBWcda7QBxc92Vv<|iDx^6nz7Qn?fzqKhnaiR=hyp-l+?e?li zrI10WAxCRXu@<)Sd;Ns3BZ}KAsA;t#Eo#q8?~KI~lJ+y?!d~c%$4cpq-G1#TdbQD- zpEn(gcD*m0ogWuBT+~kcARlS7o0rMx0o;7(HTkP;FPjQ@ZIpL*s(`I0lfT3bJU!i$ zyI@|Aj)xG4jsBu|9d*jcC38qJx=f#GcHzaFka~aR{^c8Qp8ab4S2A_q3V z5Q%)cxODTlPCBmd{w@i=C#wEPy-0^z{cbWNThG}9q{(f5>~w?XN6{*s^`UdTt_ss# z@|iu6I5GA@JstoDGTwDZ2@F*Pi{WcS;#*|yW%FLDIc_cH_$ssbB(JT8+~mwBp6XYK 
zCz>cLEOXwT8mlh;W=_y7bnyMO4Na3G-KORW*kVmGD;0BmnqwMgDU2C z?LFKPpUnny6Ou_bT-8d?fCn8{)!%S*-`!X*a}jFEEm#|qhCB^rxJ=Ov$lSj(5R7Kf5dA+7OaYcDzszvJjfh^PLqB>s^GEx-Ho zQ^-g;e=**((1p1ENg2zpKZDnymA!c8%V%G*D`T11)@T6GaV_#$NY=~}Db7;n*d5R& z8=?tBE8Xw~)*Z82Da(%MeCx$fL8lrW9gkUh>||RgwBBEV$xsWXl;2I}Bq}C5Z=T&M z=Zoj9!)~g#=8M2;2t@D0%c=cWK+Ro28=zW|g3LN+DW(#Ttgy-Hm zm^0yieIR+@%HcbxUV2`5iO^@&D=@d0BdJE1**UB1!E16HRz*Cz3P_=(cKSedkXh;O#I;m{h#6aFkT=cdC zBqyGpquw;6p-)j9mXJWAzcn}+_Kh0cM>k7CrwEFR92>JmPl9o!nz9xL##ACr&k``+ zCFu9@_TN|i=?1W&@5Q2>^PBh!J{K2VNHiAd7L&e!6SiIjG7^+`YivB_E-tZ#xm;cX z3b@-cBH5P4MJ6$C&wG#G0k%`IJ8kXSS9b6#pjLe<_{My){-5qjPMhR<{i&Mg4)RZk zf!I3Y2R~Olpi2@YRxxb0jpvz#Kc9a2R!XfHHD8ZBU#3WQUQMss5ez$SI5zN=BRmfj zWa)3%_RmBHoKUb5rObSDu~9S-**xoczT9zfx@FYzWH_wxe2d83wS|lUC;ikD0ZWh_ zrqjN{k|+h<$rodV`LX(Hg>m}6x=bR)IxHfWWm>A&vO9%*HEX4Oz%C!`DD}13I*=DI zUN2M$;dT|=VugQRV(QnSk*v2_f7UPd0~$e#UU@Q26FB-Ad{Ps7VJUypK8V z12*Zi6N7Ql%X?phPw@vA5&^#!m>Pl=k8(SQ{g!NB60R}WdS8#<*y3knvvqNHc3!qB zxNUf)Zgdc$>5~&r2ssh6!}zrwai(iP{|(d)8$NrW&ja!Rxa`OKqB+t(pBiWjXJ_XV zoxIBSeF6*tULZfDl@2RS>nngC{L^%=c-`6=Fqtq#ZAzH*DvN=>n#U5_2L9Bbu}v}xKT9BQx= zFt0>~o8MGh6UZCg6h1!FHXoqWKb*A%HcD@kmfJUCNe{BIex)vIaU->IGMoR)J|;q$t(TxvT4V!Hhx}fa_D?? zwZ&8CX6iZ|UMd9K;ndQgeeIEhO9e- zg+PRFqw95OkTM(b`U5()`)lsy382;b^^xH3wf@O_C8|HbGu-9rL6Wt*9o*iSvCr5M z@<=>)x$5knV2S8hP16g{-j?2^u145-F@K0W0+9yLSWmI`gay=fgF51<({Sm1Va-gH zF**+16b`TL6Y*~+){sw?h&PuGkC(B_B#7kYX-rW<`x9Qr%PhlI@CNR44s!r-?dDUhmfUf?hfDaG?t_QnE=eS2cyM6P z#wND)IGqVBM(y2;%xA4?*m5Na76}y9!bscZkw?ScQu`c(FAl4y!bt{L{|VeFqLLex zC+%U~V9bbS>#x+thprX@{qf*gg~y7NB&@?4)hvI6n&Lk;SyfUwAH_+FJ*=?!tVU|t zFhf{o7+sj5Ib3M?U9k6KM|>g%*XiQAEu`C%f;I9C(|yPTnY`eqCJ++LG~>faS-;mf zZnWX6=M3lGJS2HaD!lSy(rtdfOg3xtqY|dx=i^cxyU}Qdu&i(AXx;){Z*C%4-N|Py z%rm?1yEZ(++BF7WwfY$6(5^t=B}H0zxlKvh_|$~!HsuIdMAe%Ny(qZ9mf4SswRJ1T zTXj5l4Hm2kkh#6k$5J4be5Y!EVu@Ml$xf|zVe5OMu)FFuJ@N)Xr+OWZh&YN_O1FT=G4)F0252uJE;tq{Ch|&Zk~p%NnuEcBn3m z`n_a})x(rt(r%#gOe%I)h$qwKc$v&iUWL4j?nIaBhGFYF{%}m&>pw)bg`B^V#r6D? 
z(_@~otu@7Gs6V|s`Q{W`(nWv7iw#GZU!UA-p9Q`O{o|R1;hwLjs6PT+<)q(m0i+Et zHO+@RN+hXgT(gt)O|*5DP#0;F>Q|IlBkw(WwTbA3+Jiev5)u+pvmTk> z1Q`a(yBLCALq=gg=rmZ~)IHfTe6FF}_Mn<6W_Jg6Zs*Vb$FjewSSpmb@4JcLc`#Vr ztpkA^KIxwU5lxZstpt_?G>&1!WOjAD9({Eu_-x^q6x-=WwNg?coA7CNpQ6D}BrThh z>0yr$-pJ5$kG*6z^KLqka6C-4#Yu*+)i0S$JT%q z(%G_m9BvKYYzmpSb9SA~mgH=&a=})$n%sLEm}%fsS(xnJug@sJ?FgOVW?5dji*`Tg zJN&7x857HOaUmbf-%aadKpjUWe0M7fS$avxWcwPA6dYd-xy7wqD_Mn*I(WHB9YO=o5t=W=u~d``Qb-- z+E!J~!F~#KHE|*AoY-(?RC^F}*c{DBcXxN9uCDG3pWGI^RXDzAPW(L9196o6JCDrt zu1xp*Io!V4dIqYIy>QyNzc;}T`j5Ni*u*~e0mDbLR_k!}86&lsO-)vP5@e{a<8YMs zOuY;0GhSxh8FbohJx(@nw%&ji(Fq6uoi(q`NT(`4w0F;9{5OuW>X8vs8!33 zNb@l9zNuDxY=|QWob#8Miint-eY^6+Yg~lDvXKCUFej&-4IR`>(WGnQFu%2v@5QeE$KW0aX5!)kqOv<(Bb zhd7s=b*q=s^Ud#a!aAE&ydLJc*%(l9r2UGoq_=3u0!d~J)O6d6TZU8yc*3;Q=C*?OS~cO__)b=BuCvLka)p zyD^U46GO6Ih2)D=9gdaUj7>dew%9mN9{Y;>wqL)C5#VN>))or#n(Afl*B;%6zbRy_ z=7TJ}`UbbWgtf3orUhqvBykRoj#PRWwBM`mOsVH4B+G8pu-u$N7_##bh&7(H5$$$0 z$f_SITvngEiWEfEBYkx*_O?ljHnNA0jF&MRwc z>!FI<)LHO&%6t5;i_d`$!~JsO-|*!%{~r90b;`vxx~VY{$Yl3{(5sjipqysng;7Hbko@mXzHefQ zhgIF8!YbeNnB6A^sAYC0Fr&m|sq4FVzqBS8j;!R(&se^JL%->=xu~02ff5U+8$Z`$ zN>*y1Cq(7~8aeh;W23z{d-V7O$DM!ij6T!uL*3ihBYDdR<2DJrL?$fJKFxhIg$p> zkr0Px9o*4HT!>t>-cZK!KhjNfrIQ#;&K5L2)#;ef{cfjtV42N+@2~J~r_{U^nW%+z zer}z7mvkDo?Lq2*fiq*9-?0xk>6gsD+@O02+tUS|ym~Pecg$!mW)EL;1BLI}ef)nK zqN`hjg;B8eG@+U`ZYXk5bNdPBa}YSct6tzCC;I*kBd*-yaGoycOjY>F$~+loE;rKC z^1l6m>i2&L4x9BVZL+FurIV_PhGYy|*j_fI<-fc<2~*uZ08eSZFz4 z+{Z`71D_*q=yQV6E>K;fuPWM7)9I)O&sB1zw!l_euKr)^V7Gr+$TxRm#LAMosqJDJ z^3WN@BxTJx(^06*HVD0P-(3;<%vuart=Iza*)vRbB7E;Wb?)U#O&`rBK(C(kcx66f z#yMHqspcS>ubW|=pKM$#lvqRPs}+`dTHuWmP$7IjR-NxyXS&ciWPr2B7!|}Ld~`(0 zXZv#=;0wiV#qx8qvD8{Yr}2lAE&)5#z-CWb{ElxKvSRzz{5fS8K)f4P2b=Fa_+yKk!YH)<-WO-8)EP_k1nh_=6OhB_b)Qk(_Q*mvz;NR z(V-<&u2)^8;}C8`Wg1%%9mac+UZ>3u-90`UHs1Y4FSnTADiVCB*@ex58E###*)x8Q zn$b+)w%1n}=*tG7{>5Roxb7h!ZxjjGQrFYxX2ehE=iiBoiL<8M9_G0k14v;JXeW64 z$Btgeix>|>HZ0Qc6~Slq5($heT{e-W7H(jfM>OW~aTJ2m7IvAq+*`%7dmhPv?D*;- 
z4oMF`AklT|`<6EPh!CwqXdbos5mk<{w58*wK|TkMDqsM(SQGrgek|` zlrYe7KR!a!5C0%3DVc0xX=zHEPWnN*(`So>K2`5`_^xU|DV`YXZyO`p{K4- zUW;b^`>vX$0qz4DT{A=6bpwsy>+dMK??C{U*(Ww`{qrY@wzE&^q=wfFv>;aX*44No zlJcz65nitdzQRzy@Z2|h8IKR^W6K=N3p1}2EhsD@lSX62!U;4wj8HXJRP5$evs3dj z`&lr)w=-ivz-k>Cd{6jzW!Sh#dax_nG-*Og_Vm$0t%lRmw32a0FPlkbT}q$j+{EzJ zKvA8E_|02xl9Y_%LW{clL0njCNN(w_+tGdH~MmuUUFW06#d^JP@~g&)62AaS-!j}E(AQ7 zicxWM%v8N6@!ZSw&Rdn2B>eNK0!7shAoj6`r$tLTck%rT*?$B=l;?L-8sB@~J^|*e z_|OUkT`w1^7s4-lVq=-cKf5|js=U|Ky=V+PJO4VfP_uQ~v6PspPSCfwt!aC4l^NUL zZO<$_@ub%0Cq&ZGlfj^mMl&U)Gz-B%z;6V&>_~j5d)Xp`dS)VWrR}r(Kc&DZlWb;< z^dv@Ff%GsaxZq{uK0t50b?`t+OiWCd+b?gl1;QUieT>NEI*Erg4)2*!;o$b4+daMQF`(A); zUnFSXKEON_P{rI||9o*xOgJYu86kOJa#NJJB~=y|75CnM7er& zHC;tZUlm7nFG+n7!YNjb@wgxLY>PR zngZ~ucQR9dvWY@K4y`BUc_!Vc=Nn&IqhIFX8F1`|n4O;rdRtVv50nNj~zSZ1$}K>Ly!NS+m04G^P$0g8Kn z!c)MKk-{4w^;-+V)OxH%!C^{t;PS^|SiSs@?JMIybw4V&_V(;kAFCO$zRo+3)Y&EZ ze?yr5et0FGKjWTdQbHjLahRzow`r$nfDRqu9f=Lzv(jLeWoXUops zD`ZnfE}7YxA)GCe%#ytuHre|kqsX4wd(XsCM8DTnr@r5BzyA6781H+%p7ZgX@4hU1 z)V?He4kM;mCnn77ne=&3PZ~^%VK1&w-Q=AFKAsVSy39xUn7{|0q5VVa&-zXNV9CC=%B|Kq(kE|>jbA*?&J(eaO&2=H5JvfQAZ|IiIZeZ~EwF~+1t zA2;E?f-;hMiTPnEnr|{Q-&cyckQrG=>jMwGFock6p+?KVWp8(6g|6>boW?T9NZ=!B z2|Z7qehNZMQWRPnv~WZvD-kJ(Z*hW2{kpO9NvnfVT|?oTk$OqpNl#}$k~kSQJ;v~` zv9S>j(#t8U-J%?nOg;r*&LYzIg%NwB@X3Q464A#lCkDp`4DQPMe)HtX(wiha5^v6Y z%eRre^grBtY3~;NZbBYmyYgqIg%@4dqKrM;1JnQ+T150q=Yjgyu@DD2+j!eBJG9Jt z1MF76R0IV*x{^R;s_PZjJic~F5ym|EHW*<@`74KXX{;AxanSJ28C&(TWZv1|;d_Tj zv`jm59Jly`_GyUm#S`IqrI`L|i}7Wx0!TJNGWC~%*~LF1U7E)RnZQa8aWD1A%gB?= z1+dC;Gg$mfgSMMI?;o8EndFcZd0wW74J1Xi#n+LC?#pDyjlO+B35=&LkXc9t=wyXd z7+2x{zMAwa+;~YZT}3F09;p*A{zt3v{2Nc(^IjCnl|}yUMo9^W^|C$#441z@QYvYk zIlb3mO;(P}eC=tNuqSX5V#~6ySxfx3xKOduC_Z zFh|T_G(PwYXSw_!gO)t)T6IgpdCX6+n5TEEIGt=v+?OrNwI5JPk*wmlKRlclm6b@7 z=IcO!E4;3xTc?$rlFE~<;ZIDQJhl6WfW*KMX4QtlK8()xB<8_zQFjOXWjHk;wJ9g6+w*CFubXLb+u8B;lxizkw6}dJbQNT!u)i` z#N6W7J0eLIW=SR(9Et^frP|QxL7rjwF@EL3eZVK#w+$kq=pOOko?T?s!@kNK<<59) 
zoM}0?T|L8Uc%q+JP+Msjo~%UP4x5vp5Alg&z}#?$neG#T6(n3&ZqM3)E=#w4u2ewz z1cvD((7)k$VbRyG&JbW%7}%uF&L7(LNKI*87V(PHQMsda}whHfrIgK)C1|}ftH2_;#G6A zHRC@_GYj+UrxNJE3Acz1e@bQk(ryFGa{HME-5rsE5IsvJ%e}1MfQU;bX|rDAT}IYdK5ML?b=mA)0KSlT8`1ky;Fg z4YXglO)v(gMB^s+97$JmP^sY`vV@?2Ov5Q`URl_BWctP?tNV|br<`bzf+4Y>e^J_v zjZ+DAlz{qs6iw(UQcbjFD05^R$h*U@p{&eO{iWt$gRXG2A5*de1cre*tvDVwU6lu^ z_+L(2MTb%{J<;SwddoYUJbRkl{|V0Ed$%wdbykzc_DznYw6v+}++3TAkV~U-(Lh41 z3Oy?KFFF*fchRoqchI5XE8MPPK1Z1)Ni1=~D)bF7th5$uEf!39u2mPlZ$ves z`}aT{iKFPtMue28Of}~ z9))a6*$_cpz~Ew`5*gqy;#dXSA30P@qK5#jo0$4u_YT}iBRoE_asI6&jtJ~ayFym# z+d$i%j)ANs292H~W)+`i*5_`bwWJ;|=y?EU5gIa>VWt1D3LL6sD~KMau0gbGUp5Y}o&%55Z!!vE@k0}ck! zpJv6m?j-nd-}GnZEK?!g5N&n>R2nLS&u9;*_(o@IYDndUvLK^=Hi5yo?)5E2;+#l| zv2w#|U1kDT0j>O5iHFbLwWttW9+6aYBiZ=lyBEO$Y*lDHKycR~^6Ydh$?8G_J@zo@ zJjF7sJ5?J2q^QN2Vc=!1;cD^F8X^Rq@Uk$LpGD_?zpi8|f zDF?qYjxIM(qxkeeYVJQM&J2y8w8Z{KS2l48=W2!OI-BjE58efTPIlFu`w>t%I|ZO} zuyiz{)3TR&%ed`4(ND74a{1BW3isVVLdW{^tSssvMH%UAecqS6g9;;bcj%2t`NIyW zcXPX$#|voRE&MjPwx$mSd%=zk?wWgPoe5*OsK=EfZ^A(T@9ovmWfAK>M<>C!q~gDn z^f=9L#akh&yICDrud*HIzcfZ3woaa=W;HfZeeCNC2LEB!VDLXM!p}igg6Wo~re<82 zaTK@G_i+;Nr$9>VZ^4*eM?b5PN0Cb13N6Nn!DBY8Je7F&&mFm<_m4U@Waca#x4$es z2PlHHY-NV~#H&EhVdd>umEbDKWLvsM$Y)a~a;Ei$087bg7^aW5lm|nO>_25C|1Vjo z1EGROpjo`(7TaH0-Ji5}QWm762PS3A0u2q&Nex3k3a-@ZQd1e%n^z6L%f4SNK1V5t zb=&q1ra2TAg`MAJEt*gz%$YAM@h~(_>eF*%#KAYPLztKP zSs9!8-MJO&I`h*{(D7{`HGghwN)2#Ii_Du6tb?co~MPD$jw@r%h7v z)<&ENCxCD=4gA}hQCqypv?9^z6QL=A=P>FBn?8pX9j^~!B4uaCN?EaN(9$q+Uk@Z* zA!rBeN6`V^m#yWLJJ9%A&5wfSq#+7QgT+oV+29lW5$rLzyHkwEcm`~=*80rjbX6q! 
zjQzbi;AGM1_EQi#IstgS7o ziL)Mc3~{mMCoepq;wcE6(KxQC?ybHFSRZ|7ZmY4RiiQBgGC(vKy{K7_Qe}I-CFtY6DcMDs!+emk* z3U-!oj$nh!aMd}puk@A}k=V&$8j}62_dLokpr%dG(}An2tH8NgQ^^Ad{i14hB#loN zIq(Tao{<`%po1Jp=qEdg9e(kkXDTg$u|+O>sU@QEPl&FV=BL!bN#O8KRgrFhFGCI_ zkYG00USzA}ge6Pce&AG~t^{bq)<_kFXY`A`3G?t^SKi5g$sAP$4tm%d%*PK>oX?C+X5 zxKyPdfwEy#5MDwOux$5HD!kw(nre!})$&*f;s8Vdz z+)H^ox=WQjOa<|dIw9Iz`8>DT144eqe80)w$2`4wBZ>K|&HelE(Yt5f`DtY$0ueHL zE$av11-rH1Uy6!?oQv+{>SW`JJ@eHB3j}S?pH3L*hI3f8ZW^c!IhpJ%=Li-@x2C5R zCnrAznItY!S){$3@yjr5cKTrzo5nc40N6;VKv`OIh8bLj1oHN1!IG>fB7?BWJmj(` z^VPdqgXY?^4=rm?V-dYP(!2T|V@zRq*kHB}m3KkE9AArqDeFTQYX5W_!;8=OCZOLF z7!P2h%#!?)uG>j;b3Q#%ujd@&I^=!(Zp0Nm@}+5%Y!W4n1|Y5G_;vkgmdLVaq$(vp z%SEgdXZ;D96odSf%AZvkMDrQ2Nq!yqz(~hjEFg6&@WfC0ftNI^KD+y9=FaMaiGn}` zV~dN*vprDi)`QN6X>vbV-PRVoVftR3oy5=^i&h79lTPh2M$Tcz3Jsfpqc30U)bgZ0 zEZ{PFM#AI~I3oG-&v1!ijGvN4ca=U~k0quTz)p?7_jUIv4+otw?O=TE`ULa4W(p%# zVrd}Swvj(_1CsrtgX+esq9R9K(9Do^!13biSB97)W7Ox7d7nqQh#caqKfXsX#jlNF zS#C{T=b6@ibA2jE<$ygQ1!o!l02**ZQl-EvCbeyZMJk|At~(HB2LK!ontnw(1(&hQ z4$ewl^(55+lGyqu!I2IHdajnY60}X1qC|Yv>1o?{s%*~4QVe^r!_e$OaOH8-T`}fU zoiG5!35@N{iInax0{s`1d*of6-^kCOA7I+y2;0X`(8J2R&H&Dc3eH6&=#l@0)aKC^ zjAf6Y>7iBh1)w=TS1}(X4fDiK?#j;G4;xAH-)9hO9#ed>B5|Jsm?SJ%6d%ZekUzpF zlNu7g3boSVd~^IdY~mpet-@ikO&Hd1qa>{dTz3N_9%{YI)I|LBn7OiY1re0>5$e;s zI*glX#et|}HC094$ji*yd4Tv1AS2}iNBcuyd@Db(T4m?P(faJJ1$$$}- z4<_V}YR6WEJaYPh`Y!6+;#A_Y5m7D9q~Yx>T1qOud)|#T95Xhl?Ry{8`&06Ab1n4M zMwP8}w6x~*JD8v2llm*2_%+*Hlec$2-?s)O!b~US+*OZi`e86C`1?L>xbe)iPxcAh zJ%;(In4JWCL{50s;0s}P5{fo6KK4uSYG&#eOJE6F9G;W+)vqULecxqGOoKzWaP9=8+fPg7!LnUR`^MAs~= zoOfUtT8w4g0Iice#I08{Qms#M5KQ-{Tr(x^*$nttmYaHTt>U%It)OEJN#sKOCa5^B z;szMh7Fpi7pR0M(x1Lx#tdINgiTjVThS<}VTQ>%R%}U`%l&`z5IA(J|X%(!mSEaGb zO6+$mKcxi`=gM~Ie#PP8)NUcL0+abTx)|~Dljemyz~!+7rCPksT^dU4S>}CmQ6f^~ zGbRGtASC{2Q2mRDc|vM~R#aClCX&0}G>?w=yy65_n*t)jG@zAz5#u+3qiSl*@nmp!*0Yg+&2pBO-a}5?_eRHw4*~NF=BThpN)6tt z9v((-3|g%{Gc9!jf<=&Fa5q9QE*1y7PY4^>v1r>M#gB<#i(4#sa_Q`g+*Ahoi@$vs zFiX#&!!1)~vP!%ehQp*yViGP951BG!`zT(LVWf{(F~m=6oHKWDS9R77Bg$FWsp{y% 
zLX=@kKxVOu{J!iBY#a$1!4A@V40Q%yJ_*TE3A~EosjPBCs14$V$bJ_Gbu_<~O{GBV zsf&G=3HvcZ-XH{FhD!`4R4HMe#3gK9Dh`y@bi`m&EKzo+XtFHH1N}ClWSR99)rYpLTQ1DI;6Na`DV`b&1Zkx>E zlD^PY(^zsK6&`p1^7my**Gxe&QjvmJx}O!SH7LFRDZGJ0TC3}Ep8Ah7@$cDka^s#Z zyko_%gM`_5KejD5F?IGyuBOJ4x(*Wh!#G|P^W(Q9Ak`MMA-;Mhzu#c($JoW6sn2kO zQ8c22iuxy$t&%9Z)W1du(q3LOyFDU5HY!20ME~LFV$W%erAF4m`?K9nl=sx8{zcU$j>;S`@)~7 z>*!20g9|KmCK~^l${C)pTyC;FLU%ggvDpr)BzNV75G(q$IbiTRz#^%BS^*tPhAH$H z9YSXo=M~ufI~wT@k?{ zd#SC){T7-@ltD%r&XGKJUGvl9mufQ=5-XpS2?+^?tutudZ|njI_a)it)}QqlULazseK zhk%nmx{R=fxOxG@9pcBNSXrgH@5lGVjs)Q~4?eJ(d~+^jUeX2Ww(*Oq7a>nsqMT(V z44xL5i+;Hqw^8{G()(jv_hV_^Zh&5QWXxRxj79}0G_1}ze=jcW$5DIUU6UD^fy;Gf z?Lcx=MKD}ji&+-m7(K<{q$0$yDyNNddA+l(Dv ziD(S}{4^J_Io$uJ(8EaTh*&ybZi8?VR9_mDd&Ky5Ug%FolJ|PU6`4$CHQ3~0N1A&Q zAH!`R9y9bHX|GX?#vlpPe4RI~P1%+0W=$os&WeOk@~cxCK1RJLdl=7-p@=$mGT>55 zQ)A*_e~A9~ZgxtY)nx*m&2nfiazmH6L$I5c{K0d-?pB2Z?N|C~PXd$g@(r6a&f z;9klf=IbtIDRP7xom6FrTM4kThLe~-tb5ppbm%$RTF|sQCxfAaZN z`6Dc6Q89Y1q0+qPS$h+YQKY4|S##9h7-jd9j=~)_c)RSOgwEIMsXyi0ylgd&O)a&Tuw#j`%wed^ZgZTRnK_eKEjJgDQP+a+S?0}W! 
zJ>z6-ig#N!&>bG={433FwQ2SX@kgke5B=Gt66rxw%l54L`RRo0w;fLMt(S%otM1l%K^+u`| zA=p2cvj9c%0SwxY=a>;}E-aJxkt1Ycg3)kw#Jz5&mH#i;=197-GO`yML>{XXHIdvO zpgcBTr%c0{ofkTVq1U`9V2(a&?3$_8b{j;; z?V|=O?_BIqGsOjU+p_y#Az09_G7YyfQgwPnN95WL5{|zx4E;yLq+kC+C|L$Dm;s92 zkY{73?vAVe?p(r^@FhGE8d;5WQ*ZwLFX9$)5wd=Q-tDG`W7ib0 ziFxHQwvApj{j*=v%~T}b#|o>F(;2DfWco{OyvfXP)I9Y2P5@REF^>|fOcBpF)94vK zMyUXrc#u(}(F=rD8$To91Fjo+`#~j9npz72R3^Yq-Jfkrzua{Xu_HrYY3L>SjgGZN z!fQhK+oe6<&ElaLs|1Hl?;U;j%o|nMQ;Q$lHv|ZwD1(X=t^Qa>K?vp%$vJ*Z7i17w z=DS-y{PK^C1bdv6AERhUPD<+jP!b-G@3ujwpQe6OaPssZt5hxOO}@&BF%wXLbfq)8 zDvsgSi%;gvRz@#1;m`+-RFRtx3=Dil1gADWUw(#8s+B;c0D?~h{8@ei;^OrN(Sqnscp`Xk^sfZi6Th%C2=GZ3*G0<$*-JR#FzUq_s2}!U@<0j2))(j zd@!|+4jw&OVs_YyOS6Bh#!F%NXAg~MjM4OLn{3xGocz?OnPIagn1Kxv#GF#mAQU8k zeg%t>eDUz`Rtdg85x9!$k}&#W!U$XShtt5d7O*Utrk8HWSx)XWMj|1KXo$=kwBJVSmeXv;|VqpIL!9ag0c6- zAzw$K{Xqp)+7LIp-DJ>*AnY7*7gOv2T^c-CLT!%{lkY5EVBi(;UEql*yTj+b6eA3E zDlLoZ$NZxIV`3iil`Dj~D+Tpe@;9{w67GPG1v4?LpUZh3+j3^2i%im9l&cBYq;qSs zi`^X?nw3O1lf`EOK7qep; z+49>Qs&!sL{CUPHG9K^G`A#WZ-e(?9dUmi<+lkC&r#Wlomb0z${(dUHsWBv79rs8(wlnOG|jX6c`am zfKBVZ%F@|Av$s}dn11iKe(}v1GTLTdbPQ{nQN0*q*U0!=6`#V46TJ;hG?lt-RjP6l z12wzDte@-C4TOGKJ(Su9;6gO|#&y*@q%7lft>&Bbbm{h3Lx&oIU(GnC{flKP{-*+v=hJ&JP}*OU#L*L%EUoUz zF4$XZ7%BgpkJ$rxrB5P@es~8VUxXeYLpvfca-({=B1hx#Zig;24&9j%iOwE{3UXOw zE0?z}v;EB*8kZOfV@JQd0*xxyujR*GbUxu2F&89LXV+zjjke!mx2pQ))hW?lxdHYS zrp>7HoBXB|(Jj192n#kQKTrF0uO#Th2_ z)qc$^Y;b73!alF2^!k%Jx64~SJ(Hx@A2*-~X;5P;GIXV?6#UC-@XAY{esmy$gB2Z~ zmj5KRBJ@CBm3fHGpg6TqC%7VrH{IxiYE`frMqOdkBK|H91Cx~s;uwQUnPerp-Q24P z=m+pWAr=q7xSIKcpqt)blb2&sj3YlH5)7pBPb5wPV*kwK{oFnBoqJ}xRU#GY41Ht7 z8o6yu&&|Jml|;9RtlOPe_|s(qXPjQ_xaii;H6t@{*Xdy`IkSOUAOb!n zA(Yd>zpm%ll-!xG>a61$?~;G;&oP^Q{P^d<7uuzMm4o?gzi1jUUoN}n8 z^Sc4L^r57~_ih@Htx32Wx^NCR{{4{qK=Q$+6XI)rWD-aN@R6?54XJovTE>*#r8mhO zcZNsQ$R$!lmf(jd4VGdiQ{?y$yBRtRVJ1lw1&%Xmhmq`05CWiI$?7{zq-J-mL>Sl` z3LD0T&<(I((&6gVCqJ9yB8}`0H11v@CezfeN|A@KMeux6rOiti&6P?Gu_L?uurCv| zT3A<)zM3)pbforPwWaXh^<{AXd2xT2yhSGU9;NgwJCc=e^-Oi5FP|wk{wTjIhY{mr 
z`e_}%sGaPQ!V)qgJsl<4Y|Hn89}7Sk)&q5vCMGN48$TxD6(J?)q?I9%fEMNBe5RKe zhcqpLB?OVUGy!gEr{QW#)t2Zy=MHeXGB; zMk+jwub?Zpc{~6m)(B>eVBQco$w?8;d{uJfnW~a`g@GtMAzwVy=p#JlY}a~N(IzDp z`?fx>Q&Lie&$n*j8HQ=b1836jBPh33YRSY7C}f0X`ZW)?lOVIsC-?Iy6R1x+Io8ac zEE*&mA|$ij7^Z)I;unY2m~FY?Qp)_mzX>!fMJMKh-QORNs*^l=1#5x3F^}^dP~c`| zy}(BQDXGJ=dDkY|qO2HZy#ILw#^-V~4|O?(Vd(B5#_bR@LhNx6iO|w(W3m9h+-6%$ zN>NjqwSUWy%qAlSm(5?2dn^TdlYgRH*kkOz@cNbHl*uO~ab#2=x4Cx=oH?c5B% zi~VU@N&|O{WKl54j4!`dZuy?5XRIOBKb@*it4_&n6WLB#sm=FkaxIYb@g8rbuHGY% zVLWj2tVuf_n;xY6A@rWzQ~d!i$|0e@s`_B*;4zCM(Fm|`Zg)arVD{Dy&nnI-B;JSX|~?Ob|+{_B+apvM9bn#JiwNJEdh^kXX?&swfy zH?U*KxnG^ zAzeuP`OEl6{i~%q<;oOK1v;OMtXrY8Ck^@#_@T;)7wlh&l4*Ocy*Iid z)esZ9s07eln6S*C#8bH$gYQ>k4F+q&IItDa+VNPC*CcK5A*r07Efou7`)tRyvd74I zPho?)&#a@?0?td_{hf?90%U=n)ur!O+-DVb7ZuhQb?0L{>$m!5t{VYO7RSz)3wSWw zVD>_~+0SF!S!C_$8m@RhPc0VK@+h@OH%k1{9s!7=pnkT=Jk;?0P^X;lF=o9(zQnNit`yfcuuvXZDj;mbT34YCJAZyq{6|MjXTkdf&KceIuO#`5B8(RN=Ix|cyD&_C#DWi z&R+?-Lz*KTvYY*avcr4rLpA@6x(`jFByZ#$?VBB{ciohEe6PBrv&Ri$kTy9#T}Z>h zY&}fkrpEb25AU~X1O)df)Epc}aR>)XnagkT!V<4%?P$&Xf_hdK|&hq%>tzAn|%%(95dNv zwB)-FFWE4a6y;>HU?fyxTn^5m*pp$0jj%Gh)@`i1GZvkIVpM z^N*yF1mjC5vF&$xShS_CP+JM{%26@%1Iw=FS}V{Q_XMpL-(_HVpifXnp4acu-LdyU zoLU+ zP_I{NYN5(9re7^4+a1RB2;i73u6375kK49O7{vNcvB8vNTf%1*<*NZV&^+_oT?i99 z)*R+=AFGJ}IN08DKAI;HH&^ESg>EBN^k1=A;JAYnP@boRhd8${mdKm zQK%1R>~q1}A@G-mI^<^zg_SHVbPsBG4|Z;^U)b+f>AhPb!VtKNU@<0wyo06xh^Rh# zTh!K&Vzm|*nRy|htGOkxw*kxkx!!9=r;Fl`2|vHcoDp5><)7}wq$B*xps*el{InaE zVorfN@)D3o3Rc#P^ib6B86Pr7GOcUbOGm39igIZ-i%gtLeZ@W;Iu?SzU-rTg%mZz) zJUK}Q$zFp|&x!|SEyaM2npO%<&%j6K#LUmEiw%?gbw@&Pjp9Qny-p;m64&f@GNn`+ z;rgdul)=Tp-D3)P=`N-hU{p+3=_=XQHM>oz;K6|bg3gVU$8KoNsbeB$%}7;GX?Rgd z4Wlerkadl|GasNq{FtmYP`EF|KC?MmeJ=6XBU3&;)2C;X2$D(9&+TsQs{7`x^0n>c zTXQBXV%Yh-40Jv1U5NSBY^Ryj1EIMe!?l;)_1go~ce_VJ5`M^I2~1de;%yVj=7b5q zUT3fpDpVFwQysD{d@Ayg)mYI(N^2=OVe$6jCH@dzg@llr{m~_L?`C?kgi>osH?v0G zo|`X>w_;4=t+V5mH(ZxSk|~Eq^2;Zs){cJod|K+iZYs64vveqanA~z$SHE9K3AlOhBwe%dvN$;uadx2k1xezS31Sfj0Qa<*) 
z^U25?ZapPH>HS8n5h3e|>Mo_g<-ZbzZIifu4Q@B}^{EqF=aP=b_pO~wNyFv-=XDQu z@EE7ybFMn|>5`LTwaew)AIDl!wo2djPkyLa4kC8=Ha&Dy5rC+UOTdQ`FBePC$so(i z%TXH}8%^+VI}VGORSl7M$m5RJPk&vI8QUUg=UH&ZUEj@`eI+#)m845#if^N1zrhSX zKwaW~9-mlG)_A{7X#3FWd-FNTsq1lmYSsL9bB^!Y6ax^0YASDRCR_NVXcy4*149wKW(u+Z!zIDWtCwwR*}hoq3`2_{$A!q%I7ccr1GRAsqMJng%k z1YhkoxMV&Q{*?KBoF!jo$noHW#n8E|nnQt!$r9>TwLJ-q zF>^bYCF}l7ciIB`7&XBkrf;Ip-Q^C=_9g5WBwVt)5bKsjs!7Q+r<)o6+s-fZ41za=(rE76! zuXSyw8q}o?HXps7KXw+Ea5#|hAmXpQH=(_K#LHPkiIZIRsn9iQC}}>()Ow+0y_h?f zU2-mN!THlu%H)<3w{fq#u__NfniWXp6y=k3@wsWrK(H^8wbnSKS`v8<9T$F{ewZbU zD`K=)_(ONhZOFXQqYdhn<^B0i*O{ZCVea^9i3wEpbu zYcXN#8r7e~R6hThNTQ`X;gEZe_;yHP?niFo`ndPqyNfqULjrkJ>7u~IA<UZm64My6{B&E6ruYkTm%$@B&VQ%QXijrkC0s2X}Kki>spd zdDE}_5{`=$*@xKACYn-EH(#vqewsSM<8HTg5AU#f+v+yN(e!xg-W)QfjPfCL!@O6O zn{O-ie%*f6q2XuCE$_Y#ywbN(+}+6Wim7_FIGq3>Y=ih6_;_8Ox-Z)48khq-+6ZDv zdO}@W>Yb%r_S^G}E?lyq^Z68CnYI^8B2Z8evX!DHj;Sqfe?pdbxEf=^L5UNzLf_>xcn`^Dyc32ma#k z%3e8pKx{nLy0zRk^e%F!!P=+F;afYZ4o|x9O*GzIyO&>PeJA%tS$Go;QSPOSksr>+ z*^IY5Bq1K4?$EBZN#ESf`)@Wm+CfCh*wmi+e z9&8EJf1W(#F4@vu{?sau4c*mC7m$(2T1C0XP~GN)z-gt&CZ6UKy;3uZ6LPum#&st< zmMNVCJUy_?(LnY&MM>Vn`$HjjYsc)aNv#$2oWX)D;5{eVV}BWO|N85SxA^nt5<9=> z?a9Q~dFxdm-*)G0lfZiFq(|S+)b;PlLIV(;8b+1~SoWucoK>CcJJ_h%hO8-@arVct z=PX9FKF@k1=b7N4>dZ=`A)W%Ro5&8&&-eEEJ3OhqMqN4)`d+I7IG5>F@;-I{G7BS+ z(g4ZYYZA3hVy;s2$zdW}O@-Iv`K6D0d1WpP-xg|lRoo8uU0 z0Dt@ihr*8L{m^2HGCyskzH4`#7nMNl6h+JojyL-5iULG2f=MEDj83 zCbc$eh2oPdZn-(Wp_;gVyYr%GvVDQ#sT*I5lE2-=2RgAz5$}f3y`y+4?5cW79v$0qr~dx2@Gi=*ri z$7yG8VZGMeAp9c`1|ff437050=U#p3!wQ=4VCf$?U%(LiQ}ckw#S!!-%_}3HFK-_i zC0)BYkV0v4Dj4s5aOFss~BCJHgUF-3F5O(biH8KN|!a!KTTdEgX4w zRuPQF|)L|DBV7BKAYXIN9Nu! 
z*^r4l_Ye~L#y5EE51x3?Rj=?``&YkOCw}OxQ&2zO-Q!Mg`(^1im9K(FRCO-Zz3a$r z!zWsaV^=8=GCwK(h+fl@mHTQqcWTwzBumiOi6IsP<4K>}B~!5jJPtvKyS~@aN9~1o`_~XnO~Qp9h-)S*yCaGr`Efg_6MD+-3bC zzp)n;K~uT$33~$~M@!8uZ@hceKYWDFt3~$UA~q7XY7n#XWUfm0f`1-?KqWmyGj*%H zy`WhB(5}{qUnjZB2iQ}zgG8`P-tO}1uC&C77~f-CZGM)RE^zJVLl7|H`t}X4 zEU|^48SCrE-;(vm)Cqx`6Arz5~@kHq%v zE_e)im%Vyp=r!^Po-`FaUxpMEKPACVP*l6*)~WBm^r`1z1wP{pvclUzZTuO}J>3uD^_K88 z?LsMt6EY5!us@|Kzsk*iex-i!$#U%@X*Xacx+rm4vwpnoJak*xj8l7#`VulPw(;TZ zm%79Hr3Y>miCZ4AM-{K)45LQAd#zw$K{n1?*=ag}{a4+d={pw_o)vhSM(Aaqu9#nfQklDG5Q4R0q}9r~mO?mtUalL|}N_x}hdx~rD z(0H)Kw3HUnNp>o1W64#oqPvjgr{s9xECzi}{8}sIFcrfRrDJ^HIkCH64WBZKo(jpI znzKO4z1Z9;fM0sfz@zQYdpf+gs6%sZQ{)Q!e7t#K>9I)MYcBJTWUbV??PFg`vR)EO zIHuM)n#Ov5A$`IaMcFRQ#Jz>rX((w|zu|uvBlL)!Tg1(C7V+gb(y^7EU1Dn7fySjp^(rs>p5m_Re9Zeo z0P#>CERBoBDr=H|c9ub+%xj7f59@*Z)iqI|8=~|)M^SEHr1yAPLS4*;Yrv;sG^XJc zHt?P)lJ{{(7 zPag8~i?W-rjg3@6e0>qSl&$CKhD%vRJEt9U!GC*BiLyqqU|$FpT5CST!tUZY>zZ)N z{~Z)Mea<5H+)o_%JG!NUSRdN;$GOX7GRXnTe1jAat+o`tDJ)d)V|D@t?ECszL85}N z#+kAbAN4<%`1>G7B#NH3S)Gl?9j-|V^x2;>M=n?xeK`ZFX+j;YrUMfIa6-b6^>d5V zON6Gn#5@!fXV$yw)~*1|i>G>a%Tz_ZK5s~W<1g~N=)>&lIjK@8%^rBBqEb)!ui}W3 z47AtuR+W^LY)u_aUXsy!a}e|r=O;$_Z);R0gC!M4)#Vo)5>=0v!om}zOExRCciCZ zr66|qsi)j-i-n6S{Go%i3QphNSZ3`8P6B6&+yuRJx&R~vx+-4)UheU=sM%ls{oRX2n`zlVMSSWjC(U@;D_9VmYemg9K#$0bK6yX`>iTLTJS57 z0K{va+*jEC;K@ouHp#hhI?5st>{xPEzVjCjXP-BiTQ;MxN9m#hH!?kU8kZ(08*0&B zX~Lv#9=mRSY2!kW;oqN`?h7$b&IDHeNS{(c&QJQPzq&+V)IvczIB;iYXL##2by_?P z!ONDJx2Cp*^#4udpO5m-#GwfKJo)8a!8AWI(=vN%10+Q}eQNn@IJj$*X(P2zHT<;) z%6`(i{vPid+>b5^O;r^ojyZh$Je!tvC#^j4Bgt}ZL%yQ{jS6Xz%6r#?om*R%XAQ2a z*CY|nFiEMZzXL10FZAsEhhg?##4f43z_-`+R^QqAl{o~vhN&kf-SCwf{+e1^E@eCZ zn(IB8RnC8^ObPUr-NpB!k}fFoom^40bXZtlrvk7A8KWL2Vs{e;!P)= z&J76-2yhCsHhDKeXg%{zZ}Q4RZ@5Hz_+7)tTsf|8@m$~b`!i}*Xp?>BU20c+g)ZeM z5f0*xa|*c?GXF&{X?^TI!?1+iosU8(V*5VLyCy1jRB=9lg6UWMyKqqS+K)Qlzoac8 z9=d1aVf{Pb@?So70smE9G2g3{Od;AxF%O42FC_sk23U4wSn9oY3O#*&{i(@G$>egw ztX1=vn-M@eC&-0o;=9GOcP<~lEN}#XHIs1lYtB7~@2x`}UrJXs3fyz0*Hz*qcOVWP 
zA9%Q7LV2n1jB1aK4Z=>Q1&b8jxGdYr?JEFp|3$|MQbo_JivV(sQbqOe)(E^osQ5{j zLQ=dh&^hRBt!k(a)tj6rRF&0ph`qD+PYH(=g*Cho@=OKBFC{l6vofscUMUp1YJTHp zrn*0Yr{%ueha7}!S6Br)GjzxQg~{7FzChg4YzjNynoYP&C5K&BOaf)BzKem5x6)c( z8rk1u0*-2Nqq^m&g7FpQ^?ECZE_~j@TFJ1mQ~suqlyj>sGZw>+;;Uvz_waty6VCIu zNdEp~%stc3LEw|?)}=+~_y&JSLRV3~5w(J{S*xyg?|1*rnoeA^5$8~(bE4sIalF#A zl5WVTfxi~6@G`vD%Eijr*_k1M-UMmSrHlMqFfgBKJPZjHghcme6ngbn&+H{RB%ej4 z$JL(GA6Sk9@v^=`cexI3eqkXk0Ub^2&3Y|V3G0<#=;^<#i2pGzbOq(09{%{0{~pvu z60DWoNWU)ON0t7akM}2=K|A;ae!i0Yz*u#qVefp3jd3i z(hvO0q727*>W*TGg2x=*I$69H#4punPcV|!ZeFO;Dgv96Ey1}I1M@l4Vy*M=Hxw0b4yyYdv^I*LmjHdlb{h{kf`I$^iTQ<`U}>azx})+ zG!?J=)Vq?OP6}Y<1`_{C9Mo)8iM?PK97p}r&f2fQ^S^}F{}DFYa^MS{o>M|tJM5nC zyJz*~%us&%WCL^E*``iJiQ#wWy^bSFlyUQ;f1Lna!rIe;Dll-D7lzJm`pc!qfmC7@ zQxl>X&tIbc-2 z0d=ch!fSc8J=WS#(Sb_h6Dc};n>>RW0{<#2D~aNG@*ti}uv~Oj^gsUg=h1htOKeFu zB~ZITbZ_UJjt5kM=DtN|NjI~4w&M&+6@_u1GN|9eUBop>h4 zL(ZSUOC<-5!!C!)O^E~$aPn$h8NTVC?`gFy*AJ{P&0gw#gv#|7`u}_eEDFj&-KP(S zk5pDySIhjw{ZDXrC9^5vCHDM_jOT~jaWjSgdEyrhCDehn=Yw`R@)Q6%x@hvrstp3N z&ETyUW5K1`peFD$`SSl89V=<$1c|C%tFo&X!wGR$1bCbhL5On;@({Y(Le)+>6zW^8 z${dYK=JR5Wy$rI$b3cyXvoN1+Vx?(?2haZVI?G6Darv!-JZ}8TDX+WmyX1ZBfGX91 zD)*zS(_0|$z20Y=`0M5F^seFmV4nXwJV1>xATC0B;-`UnNryll0s~m;k+j&Kdc6@L z6sXB+>-OUs9T305n>zo(2R+vQco0PIHN`jmc`puAC-X`|t%6-wvqpbA}?7i6!S3Hlt6nckJA;*QT>R^LjdmP@)mzM^w`y&;yROkLhJAymuR*KxT zj{JZbBhJ)cA-rA?Y-P#|cQm>2EXK{oJ*an{mTZ^M`y<^adNU;ak$sfKc>hvht2k*L zETph99r8gZD$0`~TWEe{Yk!z;^Q8n-#`w+DTI2G6S*M@kWtG+t;~*+D?z0mvFIo{D z!+JmC9{Tr;iyL4M5lZ=d$B>hW?rQcwi3Um7u)*qC- zpboI66(r~n{9dOE;=9|`0=lKr@4d^;qS!#qYzY{=!STDD|6-%Gu`tBRX!D~=SEqz= z*75?gGf0VG-^V8aX@S@GZFy-?rxYXBDAZarO81PsraS(Ru;-4ay8Hi0WJhLEMuo@< zSs7PCGE2y&?47;26)J_YH_2XQZz9>F>`nHy$F;e?bFK7zd4B(R-PiTG=e*B*&-0-y zos>1NQ=fC6Mma$xp@xzsQiqcoU*r^Qc3@QjYQJu>s2uRw=IZL|@TZY?F(n{;bm(%d zn-WPe2O08{C^SQmzYELr#8Y!txTimExS|G$Y1zgPRRQ(4Azl(%;5kMnR^un~=!d-k>(S~&4I82@A$T66INNd3 z;X~>S9}bs;`tnh|Luzg%)bQWFBI5fl39Cxs6x13D7`=FoEK3e)M$`)JkQBa0Qc3!jo?wYeg@7M~b 
z{Z^7NTWr0uLc&a=hKyCAsVHigB>f+j({3tNYv>e!nA1Fy5iHNP7zJV#11AEh;>II*SN)sI`mEWbWO?ivvEHmfOPOcb8A zpCq}#;hB1aDX-om#f4>) zuZQH-(%i)RFb@#*ZEen&81DcwPj@1cpNOYu+i(k?D*MQYpLZ##4AohawNN47pxnhJ+di-6L3Z?!h zTMmDVq!{+&3m+Np;4A5_KkWLtIp{XEC0)xOGqrWwB;S*O3y03H7I$QET@!v)nMyX@ zEn2-G(r`Ku^36G0@+DDE{rAuKCn4|(=!>uiZbf&~jlk|a!j|dvyDiS3PCy3&Ud;zy zrfjr$LEn@0_%$vNqU5QZAvAvad?80Xo!zX`E9ZMNbMSM+(ry@UtW|#Ktq8 z1gZ(qjghwEsps8hMk9>Vgy0cSiKzdA71*yYi9JV?Fx!5qub$OdD2T^(TNCWyU+KMmKAa)K^M>b7j6hr z_LQl8i))O5P#KGOTD^dX0}RjzCC8KQ8GOIm(Q<9J?(UJ~X~KEjD#GUx`HDgLHYc4Z zn2y!?sXC+FswD%8LkH3oDx?O5InBsAtQlhc9vUP8TFye#QN4bjWmenzj&8BJ-tG86 zu)4jvfQ=SK_`-ctM$TIUcT7vazR|6m&2!5CVB;rRUuxewsmZaIeSC?3zrucTOdIrY z++n$~Ox?l-6n|-B;-Wuw%L;`uL7&$%9>DuLIO121sD1o{oiAi*M$*ZfMnLY-nB^4| z2NEhIlr(^|zB^s%Hhy>e@{>!{Ls(ipI3P_do zz)iNW^n-j>&yAK<{s@M~o_vkRV|FRYMYa6t$v-};qex-3?YvX)dlu1d(;ImyY+^(y zGdJz>4t2{FXn#y`*K%QGuM!}!-hA2}BM}*3rtyJ}d7zAL(qqnOa6cI=4@rh7N_(hq z_{I$0kcb$?K`XS#VycvVEwC`D-@F3xTk*<@M$ifw0f7g4kMb*b%evJ$N|Kf1YUZf$~k$XCD@rp za&YbS8tF%bi&>>Bd9`x^$lTNjsIG22{s_P}A;ND<(NYM-nv-&0eBkUxYZ^bUbtiK% zbmAi`P(C%5LcJ84b*;8Kf1d)BVe9%nGdMcwm^uXX=ViiFY}ie01?LL3%xo*Itm#^p3GlB;V?vpmBNQ~ zu7s$>baLRwASLwLmTublnc0p=&^HlPHE(Fmx81h-<#E`{M`{@NbB2dAcDHMvUOzhX z{)cil6p=NLpzb`TLdwvw(!Nx`#ZNNi^VQcKtx+86Q#C5ExdNCLBlYB`hiOKSrYqtPV|HUV{{iBGMlaGStj-!%r1|RhWz0w%&a%&NG}(M*D;itgrtS^0Qp+=d`k9j1g)e$^E;Ii*bRg zgn{Fakv`%70K!Cs-()oWd6EG9lK~%a#Xs@^s2FW5zFgVC37$nVWet`lftj@ip-Oe* zOe0W_;-_`M*#Id%DUXcI8ZP1%<+eQ9N)_pzx=cf$I5a;%BHene_?0S6Yi}zFo3)x? 
zlq(e>w{1a#eu5T#>a_lLj}p_L29V!j;l3JrU9uei|3U@6NtB~Hw@eERE(j6Gkv(qK zxoJ$u1L#XbWHWdinE)8k%b@k+`gBDS?`x$_?oi`|x-SNPBcbg!+dkvDa0C6FH(X4~ zmzO16*5T9g9j@beCC_!?u@AXp>uj>x_%UOCebt+K$p+TEAsns0&M47qf7Z|o#z>$Z zCLaY+2@H|jcjX>m+o!3aXp92NEa`peUN3&@-|AZv5_reB=FmIAc}V54>9t<8bk8+ES3^Ocuvob_!l5@ZE3yBQKqwO9_5rV!-qAbMy#DhV1j_dV8npK zI#rBLq&U)t#Y262Ztksm=UFyks+I@P>wS4emhyZ~>=|jinxQV588`7lt!Jq-c{{v0 z_W*;0mdgr*Ef-d3@;~Ofj5MdL&qc+l=^TR^ut^jU{+)2e*ptgXNX{p^L6(2hLCszG z;ccXptU#h`sq{NHLeGPDGsz%Q2roXmAJ1E5VMH=c-zjJEZ}6c4AZr4DuJHa1(j@$~ z_klf1d>pMmls;)4a9(um#0L(!2cPOowp;YTuU8CzzB$G=$n`yF_t`OKgN)^r?GAoh zNDJDpaxS%MoD3Ae%=3Mrye*)4<<$N5&jUq9EF_?)cnbJb3H$tHVI%YWv}X>Hp#&sY z*ZkyA7Wqs+={`Bus@@&h^OHa9kbq!eZJX?rC={Pc7P=1>!^IaTn&e}rj=l$+FM?b0 zA}4Q!ih%>P1d@oAvbP`n_Yti?vNWT}GU4PRJlGQ=yf4~D`oW;fW8BWzZ64s zVlsP=->&bkb?xu+*8kFJ79h)LZF}-mWVPJgXNMeLNYF`Kd<V!p9~a;$+bX8-$z_xdNEn_?=bS;=YY-u^KZ6B-@O^oE@?S?0i2}*#BNxnZOJ`q!3Pgb0L*})ZV zzwzHkY*7OzoNJ1R{Hxr1d{11=J^|MSfeBkX;E&SCH4YU`=!f3X#iR~MjXKs(3#le2~3*olyRu!wPlfDiolKL0wc z>?aVcy$Mb})uZsB-j^M%M6&-!(G=bUM!LVCb+57eB+-bbJ}}Z8lAz@LpNE8q0@ivb z`}V{n@t!~t?637dy{EzbMrv({+9_-~+E%mAufnlpZ$iYwxM$w_}forsK? 
z|Lc$sKsQpEAV25~+M`A=j9VY30 z5{}J8{{DZ!lFb5A*fjPx%z=Ap7y7uoQ{f&G`89>!0f0^j1 zUdUsreXy`RqW_;*0RIN?*fQ=(o>+k0J8tE^Q`*n^`2GK*X!=3?JU!Pp#dDfEYVqCY zEHbiq-T(GNAXKq{wN7gp+Joy2SgSvh-|@T*sBHItJgn7Xoa(pi;~T<-RAD@34pOJN za2!A{byAha|2Kp!cY)F&k$!tR2mQYR%|185kbI${>i92qIAU_h1OS*<(EZ&1FyCKX z0(}G?docF+(mwKKbIP~Mtoj|=P5k8%t2UF88uetB^6=EuSM15jMM(~Md6l?QIsBh( z6cs8&6|1-2ij`z%InO5-xctFi4QP4{5|j9U^!N&B1Ij>H8oxpwz*i``Zb30XE3L3n z!tc$HlE)ab(-+m@CM(%(awp@fL&{=ZayPQ_%Gk~w2>OaD`1u5(vE6Cs-$z6Z7&;0f zltc#aV7m9+KgyScvNF%=)T90|Et={`mwFbrGxm@^qfQT_&!&~`&alOAgXYP5g4UT) zD0GoFj|9;bMiMUyxT(pPXC@D6cMsnesv&b-ibM2Sv42FY+k%d74i@0TswhJrC z?B9tysj%H1?sv@b^q~))2`D*jk`2IEwaIr=vhp?QTr{ zBz&T0bazmjjmP($;Wz#?;jfaa^&K*<9grX~(2zdX-D>;aX!K{HO0@mXA%IUnd5->~ zaUtT&2c3k(w=GWshQGW}uV*VbLBT@|P)smfa;b-u^n%$j$C)f$Z#Ah{5%%4jRTL88 z0i18~;{S={p^?C6bjM$v>9({BLBH!zDvINCBxA=vsNwH*vfISOZg$}zy|9MZhbLRo z!(T-+LWR)bfqWtvsXVfP5l-S{e>$o!iji@lB!GvMC{B9f*dDA*tXW?@chvUk%C--^`xj zR|fq#x_N`|vG$@W=Cbk?VT$LjCYwTPcy^;}Rw6}czbFhDveu+tCHCM;E)n#zDSjwC zL@$_?p!AS6k$|@ic(#}+R%Xpnwq^aux$6do1V1_|E|xyBh@zV=30z-uSh5|qelD250kFv7zXC?J5LtdR7VX=+D@^IY2lEE<+WJvRsPB$q23>L&|Ug@EfuQss;GTR{1PC*LQfrD!oH{g$(K z?Wvy_?rO`ws!GOsIo3UxrhBgK!C&UhnRN02BnSIjD<`@{e-UiU%)<3YP#<1Hm~+hy zO;@J_r$rnELrWclz0bc9bUQQnl@k!1`sIakdxwI;hQcIH8z9I;u?n8!Vn0!WLXHkH0L@kKfjaNTTOs+u%FfgX!S|snG|3K@|&RuBh&C+Wg zX7l@v;t-G_fqP8U|LAeenDR`mk}L%LCf-h67>nTR5XeKIe!z~?Z{6I~6?Gp7o+t~l zQEsG9JM_|MKxv)6Wh;6B1~YVO`3-b0@LOXtXXBR9j&DCBUOUY6k{aOZW<9jgk*RG zBqC=t{}&5HV01h+h|lg$$SYwKM>JZk=pYOjuGMdJ)){`s)yt8f>*VakWNHFr-dqV9 zuX9#g2Of*^DT1MQpe*@EnmTe(eRa*(!(Row3rxP%2#|3hdXv%jwu;_Csc@K$y2cOM zn=V01(c!e9b7Yb!c0&0?P};6hr7HSlDR?8PeRY%e%YOHZbH3RK8wK(gi5d|6uP&BM z1Q!e4d|O%+-4>6mCO{i`0 zW}4Q(OZ&yh`A?p!znfW#VZXL+UU^^f<0b|C5xEyh4{$p9u|Lb8+;wmcm3sEqsLr7r zLioW!;oL^<|3FPKSJ)EsB=679EH`87MIq`;8$S)4n55AhLb>&tw9+9<;MQwf;5uP+ zsxOunJSdvS#h%7>64JR?8m$h<%1bS4bDxDrgnqYRt6VVZao4AssT(5cZ>|*2#>HW;L;2J$C?a%1^DvMC{Gv97MJ6gdo4 z*wcC*9Hxi4r6_s;>BMeMM>-D-7cZbV*jM?Vn1cc8Gri<@Xn|I-9Uix-RTK7d)raqL 
z@OUm^vDh#3#6Av5@2hO2ER7tlC3c1;wS1sKt;3kA666|*`no8$0{uwi(K00wgt@_F zY4SqhdN4Pxx+o2)K;#=AnyP2QZ#Q035C?MtUl`rb5t`c(8+t5Fz@}59)tx((lSMU~Bl$g-OL@9?m2X>u616;5+m&;vMJ4t&^6oe5LsN3O18!5m z_pE0gst1XHfum+p4Q#6N@~Re&$avH3ed(o#UhMKRsg_$Ehf`zDx#OQ7olTHK-r^$T z28lxBrEK#`n5-(pd3t}o=Vp4y7VYSiIN#Q`WkvTyS!VOHq4C}-f0;z^&BatmPm;r; zP_zh(vsVI)1`qqasF1ALmzxHQOyw zK|^)%C`EY}b?Agq`ILW>Tm9&;G$8EV$s^Cw%ytwYzA|wQxX;}6FNqh?f)|_bQl6+f@h9`jKka-;UGtA^0mCB3 z0&#=v#-Pg4M&D8^h_URkQaqK_L=(5yE^*|O9)8Fc9(v)|9M=6|YC#;tcqKLsI^>Sj ziJd=`ekO1r$1z`0_pM!R5TRr#mn7W3ZshQ%NA_ezMstAPQK{15Fek7{_v{`ZcMWq( zL3E_A`f9EG$(amck{XMr19#MSI?Q{e@|v^@w@2pBzi<#%7xJoRPSi62mn97pMo@V9 zs*~y|M4fDn)5}*Vx)0ltvq?l_fBpcY7Nyx1$cbL40JarFBG;8)UU<^q(QlAA8(&|w zpCTha)ZK@?w8yBwz$kcwidDu&MG7-6h#>vWM=S>i{UA(q!nlmgSqoW#dteM4Ryztg zQ#Mb`u0ajaK9;Kko5;#$`(ba zG%9=m5jPFeM_T+Ls0mYWYaTMv6A4jA`Xt*R7*%e(-*z`n(0Q#?kq;I(u&K10ZB;5N zRMI3f9DNRS43Sa34c>}>#!`N{&4TsGiK0P7P(#p0{$7+E*-nggd*~{73UqV8g-X`? zo%_H2oP@p?29RjBUqU$%5<{Y3+o$R?74C{^WWQ_empMrk6%tFPnnF^$3o5`+1G$)ah=m+G`!JMkftR?sL6e#%d zFZ*|>wbn2juz%B>2|zRZhab%Kj)*@NK=lTV(OFc5SHW;{dV{~qiFyV2U`o>Mv%mQh%n}$Rz(n5Uf6d<- zNsFpfx@IrNkQk&Wxe<>_f(8$d?`%ueKY4bvp9moTa6ac};{_P9nI@1>-&)pelcR*O z`m-oj!W2^;mFkI^L1-x!Bd*3(e@ZZQHDY-(R+eF3N&JuUNGRm+Uta#od$ykV^dSYG| zx{LieBAG6+6LDL190Ltr*Y;R30A;zu(0CcCAQm6aJBgJQ1z(5_^(GA^!SHpC#d~e=!ndqQRb0s3fzhUCa#WF8*Z3Zz*&1lZ%-kAg{+NEM*3fTH)uM6SF7KVKB02M%YH)1M3D#M`6Qpq$8*J8O=5%VFYdU$-Jo+FB~dE=b3$M1~9#!ORB%<9t&wdIm^aD z=S!2Acc5)!3)BsfJ?D)W;0lhlT^2Q4J2uSe4>U}Pde^>&RK=b=sA=_(s(xIHjajIK zxyc;vOwyJz@C17dpRP`%tA2}rf{vN$c}db z|DnZO?9V0?w70Qg#9VXB0WTb!Bl}(6OUY(vEE{nAdFq%702J#{4~te;0bwu*zEq+c zg-YPSsVZH4YgCb=SZtOKhK>6rL{CnmPnJS@N+yynP#~{HD-wKq?S>r4I!j7f%hR*6 zB!Ym4&K<&;lC(8aA6n>KXcYNtzY5Js9FPrJ3Vl!_YGszL zB|f~XKZm>M1O>tuvgtfVIwLPe?!M4_l`}Dedvoy1BZppw!`wJh6d;5s24RQ09=KE~k&-@qi0#N7^U_1{q9&DfQWJ>HDt|focIe`WT8Pm_^!}3Ar zB`eJjiS)O;MN2~copV9wyGn#2ed}E{@ZbBW$~)+Tp9p~)$@ug@0YL0g>d(P{IaqA> zU~0j;jfRt&`v@Z~e#uqJMHY}zEOv>-tBwEFV3;Vf=h^pq<9=6;kG6adn$Aq?&h-u- 
zW_F6spu*1;UB@}Ojv9y6rFSzxPG`?w$`UGB4{XkVGkK(MzZ65-g-Vq8-zCgC*Dp)s zG}{elRc2|dgfA>q^F7^wCjQm#gN90CLgfuEi~ymPGg`8>xDTM31786@d1$nHGi{ss zF~##Ix@ckBOVCh9rgBY>xCb+ZG4_LjE)ry91yZEp=BpbSsc7QY4$uiL#pVEcKaZ8V1Nl{CD< zI?=$om7^eAnqC6g7sIyVOw>F3^AYWVc=*#Tgp}p8Z*R?SOf4~}X2zEc?xqcA2Srh+ zua~f3;n`0L2c|au)%N>@(U+vnGC7hM*>%+%I{CJ|IEQS@b)$N)KR|1e?k`C=GFsV> zsUqX|GDzwk-M_?JL3qbNJZUW3eC}!7^xcE?%l|VTyF_nd%Sj>in89Ip3LKI2q-3qlIHqF)7sja&1Y-RbJG6f zIYBK@cNZylZ*5Oe_7QNJ74Y|2uJI33&hL0eyruIKU++s0AQjlDeZCIk*I-KcdO7Di z;cDr;)PzS4$V}uvIGFv~*GJlfz~5Z{(QF(jOiz=@_U= z*sRM`6saScw(hKa5#wXzTYXdQK{x08h}#stzQEM%=)k?WLFaDfw9b{A(CTn7|Feg8 zC;=)n_eOC@PM5~bGm6BZG>V))u#}6hJY>RgRFDz*NLp;Fi;mQ_9%V(-%K?!!YugAo z+Hh3ahWzjaA7^b#4CO<10qaKFqA0y^IK1$5{+t=a9rVOteJx0&h8_uP*xKC&588VN z{h$yQv{DOzs9*DT!tr1Er$w5(F=zOPld9dTU6%j)xTFGh- zAk8{Dom>zm{uAYBxUC&YM566M95D6^<4>n~4ei2|AmXuI{n`~f*^g2%IU%@H5V0@-hCVB2DR;b#ahYzwUnN{7RbNXn>$6`@{3|A^ z!?)Y9bTNLW@LI>VA6x+iUu&yhkQFNg>{9rFU>2BZqh?j`T9!S%*P;PsaT;G9PSDtl zk2`m9t=HQT_IwyqhTny*+;+x~l9B@9KXSK^+!AZ>Q8x2&oIZu3u~hLPFF{i1D~2H> zPzGp+;A7BfkPQFCzE7_5ax~L|-PmD8mSU&)3SdiZlbf_12?W*@qi})r<+-aMs_81L z&%1Jbm)l%&p;&$G>E5{mlX;a8QOGqPha7!5vV=oIC`vCK1~v^AuLBpg<8BZjVIhYK zxF2fIL9mjKeiuFP>8l{DOUyGzE|Mg2Gc{+y-!GCZa;c#`f-34S9Q9BEI6ahFKwA_T z^_YqtKo;MaL2^3x2@JT;d(cO(p>!e=2Jy;vatFhCn(}tnY&|>$H-`&w=5~5YhI0kC zwbv)DU5NxXx46d6&z7u4WZ*S-ubC9|JU;YU_MmS?0o5(3xKN8y_vgiZbp_4cw;M~1GOiEo z=4=su3*m4xY#DH&Os{nOf%~#Y!Q`({Oy>Y5mPM!^b~D{ZQPkxcrgF7fUXY3b#3$F&^l$bq0S!am#XerJ~7(>8XwgU;a%6 zxQss_XH{J`L&5Yt6VkiZi!u>^5_HI$@u;P{?HQg?Te~m!3_t}JBVDT=WE_0n~CvACc&is`H3Z;vfkn&lh}6s%TOcNFkEr> z9`}@cGea6()aG~Y?K-rwi@S5_ET20ragjfhT^ROpu`}!z+%2c{SW{g}FBprofeiqs zNbnNHQj6#3j<`MS9vHTMaNkZFHudHGz`^j^69sVZijq)gCrF7ZCgdXT&H`);rN-`C zw$(Uawre&(D3JTyGUa0F#QQ-?_3dQ#N@2>?76rJ3-Z67D=~IqIH(yM&ECeFr-G;B= za(G`gWE#jm%(E6B4W22J`|*%~I%qDd;ZZIv^j5U;?(K)Y0OAo7JD#prN_TXcHFDp? 
zrX%V|%LrU%6Vh=g5ieC9P4iESmEHON{)b4EW7#Tvft_Wib|J3W^|@97oRasZ6C%u& zHAm`rj%Bj6T%U+@k40-NK7lmb+C)+61rYSnBt`N9EhM2Ga+7(6JWp)r3o@fmZgf&FZRTD%s<{12O0g zmw%mN2<{@HYuUWRVj#3vj|{i#5s!G;skOhA8-x^XVEeb%gegTStfdSGu@Y61zoG>i zBRq`;c2$Oj=Qm2)3K6sDgJj;$OCuA3Mes^jl7Wp-1ba^8lcAiTGHO1(oi<4w;$sP! z=pvvJ`y+~I9hnn5KbPh%V=?v^E#GY@4n0z#fu!pjlh!|GllIh5_~5l=HH%3s% zGqge5sRD58-cn|R^%y;OH@MTYcuKKC%oTABn`Nts@(CW|x87$I%hHkTl^oc-~F%w#+;SX>inQ|j2T+x%yHn*Z+Ig1}LNiifP(U)l+- z4GvtO*vJ=~vl<%6=qE=?#;fMRjlAJyTNmc-#`UaWxx8I@1G=T16b%Pq?hPE4)Kr(; zoIj>T^CCcBWw0LT6{ZRUyK>US9kKRZ46hds8FrOx9`l=vL5kxm@G^vQ{*1=%b_3Tt z+V=v(^wsV!Dc1`Mob$)ZTsXF2TBt&iQE+IDSWjKzt5yUmL^w7V+|QU6VnuD^eW%hvOb>25tp;z!|dxla1nqBx5P zAhsJbu=KWDLPVIYaSu3o&lq|WJXui%O&q4qx>Z(yh#&&$K*Z$fFSFiNwQG+X{@yCh z(0UIU2<=mm9d&nj8iCL3FlRK;v(1v>Y;U(QU6%K#$Z>bBeweOoxx(FKyW)M2rvNH#4fTo@^{6XG7D8H}7Jb3YbLPI?37 z7G-!h80Pt0w=pS5&vgMi-*K z8G)GTN4N49=`3FiYx&UPfA#5ILew+h)pJ(9=id^c#bLZ|yHA&WJ-fw`e%KO z`FU={!S(yF{v1J3s`Gj8p`hO?sH|6F1;*y zKfA+Csh*>d>!t@##;0e7ezvJT_bAV@^1HQfbWTsNN{e3Lwr%q*Ii4*hyud#99o%P> z1Fkoo30!8Fkcn)4H_IBALX`5Tf|c~lAH-u&4eg>3!3q#$Y-dsN%~RcaNny{6-aOO5 z$fQ0bjxtJKx`;#>mWDn`K;pv(JyMgSR^=I+jR6{+vp;7Nru)<%O*U4<|5%(HzSf=f zG%uAbPjcFsAcResD={sVYwu@jc1h(Bqm3`sdi*_~vG3u;6`;I~Vy;Fu2vr$FSks zJ3GSnu0Jt?0Z4Y^sIDFIg{D;bq-e?00F9vYaz#AE^G#`Yer+`}7dqo8bE#U4)*7^O z&kdHal$mVHd(UxScimYmYZc7en(&>mx2&soKl@p-gv+E(jP-CmBr@}Z{&9Dh%TZ5f zRmOs=Y7t+2sRY1jcyQr}!MQawwOUj+bkg8w=_NXu=+eZT$Nhya_iTnWV20}kZ^im5 zEZK~_Eqp#Ai0RjgE^}!d>qe-e1LOIRMD%hOe-Ip!Y>&WmH-Y$UrC+%w_RpCK4B5nq z%Ziy-51*i*GpFMfbqJbI=E^Jcuq$-<-R3)C3v3fLSA|_K#}=*qN>SA-aNJ5y zic8)#FtN`NdnWf4^@tb2!Q33NE)v5wHa8XCynfw+_!Z?S9r5rQIF|yN>MnU&Ut)Eb z*P+75_bA`Fv-VjmL^|}PeVimUX82$={Y!bSDW*#t8ZLtt5YR^@#ZW8ujVXfd+(+87 z@}wQQv9K7g(TXTnehSIbk7%E_DqJ{;r>pwjeV8Ui*cZQ<3A!D-t^@bWzVwR*aUFD} z9|{vB6C;?ujqpPYc)zZ%_9d!_!f-%t=Dh1T_k|(TYVYwB%jzGNwGIm%GCBTzGU(J5 zy1K7wVyQL?UsScGI5UFe(3cv0MLay8o>%36Y%$&($K$Pqi_ZU5@fpX^ngiztn2F&f 
z_eGwgU6E{+1okm4G{+%7gBnf?Xx^$X{_YS1c7f%UTXr12Mi42San@)$8{N&$gK-~GZ@?f!`a&s~vgQdR>E*)DoVk@ydTrcneSE2g*~$^cf$4OHI6v*{?G9$A;?dhUD^} zxiX)>R(+el-Y@~n?rl!6322-8)zucIz7UWVq}}|s-@>Yzn#Xl1oaS+awo3G4p``w= zQ0=DhZ8EvAaCKu~z z^U(YwJ8R7}p~J3tz(VKp&Km-*S5In64#LrR@q?Z$=f z{;a9e+k(JzSmr<%gimHgMRYl;lB|R<{Y3O-8~uuP4P~5S=UKV4`8t(V$cy zVr_7F<%62DK>uX9hXy0p14DjT#3Wln!YF-YGxg2~)I&S8k2a*SM{HS{sg;cebGbXz z#z;r40<`J*keM7Y-UMbM3YYw(?yPw7(_CQ{S3e$TD+k@i@KE5AbnAmc47%0JemokI z>?J`7AK&-zi$~Unf3wfmKF)4X?F-SeEJ|HdL}yS{pOlZ4JsFyxr@VUgo3e*B?`?nJ z5#_Vq5*9vWP(Crpe0@sy@~H2(Y0)me?%qx_@zD?LOA}_UHI!R2HB&DWjO$`dm!|oO zJU=%F55(Iv;V5vWtMOV2v5nO*x+t-S&cVKwY-MuT#cUUv`k00Z44x-IbS?!qO}{P= zCeLHy+DbyChnF!bHme0rru_WKt7a&1OT~lH<@cJ?55!JTy9}E^lsKDi{dM|B@#go| zmX_D<=ol<0h8ECLMqUHV<|8-=3`FUr%o`7#U3qg~7C%gxVdr^Q`aOb2jO0?TwehD- zh4<}I?@Q-V$;Oj9;m%3zF_BuAkGUN^D58QRNr6PL#d9>k@$p&vriR+3XBiamH2i>< zhL}*a-oF^zuC-K1NLlr-u#qF9_lj3_7=H&a5jdJ{kyUDMUA9)!TK6_Vi=~`By^anw_aE@@VIv(@Jmg_v$D{Z@e znKpF~eiXMcIG$ek##}c+eAR0C zURdtPod>nH5`v2ex#_lvwDiJ)_TuWfR)IGiGv%wPeM}PP8o6*>fdXDpZVS*3T<*Wo?%?iFrwRLMFP+ep^)!g_QZXcp(ps~S zm%D$)^49McsUZs`iD8@BkxTCx?t4+EHO5~*e<)W-UDJ2JTtB0+vor6(y?gia$)(=J zAbpNE1V9EJI&mBcMzI7{;#1o@nC-SZ>n0wPvwLKyPG57@9Z>|E6gNP8R{NSM|9~4 z4R&WDJe@rZZnUvqzk0|P_$|b&J$&1UA2C2tP|IG$X91dZ{>2HJRW(M|>~6}#LUfMY zgTSLv;*%*>U*0C&8$L2>E@pMYvWcDRj*;OXuQhuuiO#Yu-JLV$ocR35c|e-+Qn#P# z6@7+g^h;F*1gdTjM=MBR|;ialuIkehpj@p)!r7%wT;_2;6A0SdRs_r-doXlzCOkD zADnLPN54Mjl_f5XmX5F$%sy$n^&W$g#en^@AGP&XJ>094fPb|osxvQ~WO`(4R4~N* z73D`IHCm%_#d*C257v6Aw{Ps9K6|DE8H*zjg3V6(0=P`}=Vb}H0<3YDaoUpg&%LB3ALhlJW4k>|5*h>|7AC!g*|M^1gfuJG z(=8pvW4N%%o5KE*@#%z&TiZMzaqkU8CZOGD8;LL{g}S-9%|3trJRs*s%K4UYu`RvoXhCTgjXUY*+~ z`zr%g&4cs)049Ob0?B@-PM`viabKhJ4y#Fyaqw5stp3)7=t#lm3KaqvUkjpM;W{v1 zqO#YJ*-nZ_Y+K96+lJpD-QB+VaO9^^s%h9X91f`snHLG$lFb{%P3@4#iYQDox5-N? 
zw3mktBs}B2pX+v+bB%IPv-?X`=5p9<=&Yw1IbBGmUhW-Nr%G`XQLjy!z@M?>ZChc` z^EW#$9I86&qsGKP>SdJ`<_fq!tJ~ z^STgW*9W|1D<8|W&(x}1Et5NMBE|hgUVk#I;ud;L;Y3(@;?V)Bv4FLFA_Fo2Zd=&- z+d-w&s$obXPcu_( z!9CP@mOtHXsv}3g0O=i+SxIH0k7mz2v!5pHxY3o#qCY#8W92qVzdJJ=ot$U_yp&9X zRbILIhfb!}cVZcK9hoi9rR>bA!TrYivsbezc0&Bq<(jnWS0=v+^JI>EnNi76X`=8U zu!baVTE>rWjc?6!XR~3#B@28TainiUpv!xtgZYHZXP7ja!cCgC z3qEVD{dko9==tKJc%M&}qrzMu&C78?cGa9)?ArAg#iMSONXb2Y) z`6*wZtK1+Q;WHKIx4BCMv!D6JcI!aXo6A8%B4}%WMtI&96WpAj=bV?}kC;(!;D6{T zvAaGd+KMikTyo2rJ34{rYkz^YST0l7b#%_%r3ROP9t{JE9%b+{nW=cQ4?C3|y%P@m z?&dgQZTq)aWZ*m^cqJ!AF72#X#?FopPV!F`t_k19SQ?}BiCGwzR%}t;T#diY$n~Yk zOO2{=(}8{+u>ulS8rP4KV8!y~iLyQ=oMz7K!CWWtM|_(Ra1XlAdEXl^_C_R+F@`YqSa*TcM7MbAeR~Sn$WHo=N?mpNw^y)TLPqYdC%U zP(&;4!FWl#^4;yF%ItQ|u?}pCY}5SF8o$n5)=S$Z=r2}R+Jv)uWAB|b`{D_hoNZXN7;W(;vdq09fz9==0wZ58bs+4yPWFiY}hAAJ47%Dbv zYz(?iFnbklu+CaeeP>CD(yj@8xSgqsUpB$6H1P5ri}nMnVciPrUw4SITr{sGb(hYK zm|N$Mc9B%x#vA9z+Jz~UfLvx2Ca%VItP|IgQ9scvtxX_7`ygxUd(#FGK-YSN7~w9z z8x*Xh-g@J4<-%c-1H&#F1jrQmiISLSPzh*!t;$TkSOycpx-rGrj~o(Al&nHYn;Ir_ z@5{=|EeelHAEFhRcDCMZ+p_CQSOVIvm*cTD@08-sxM6MxFcFE3-$Ekj$j9Z~Un7lU ze|?2ceKBs~;`>bwzhtJt+YKSCLz{6%Y{`rGa$I@s`YI}huId_(5+t4_`;^J-&{PhR zenXdT_14)}h=y`o=V!>oX zTMf^=EB@X6@u7wY(=Fa#R10VSim@iK|K#K_sxLrafwbSEK>ka&pA=!C!b95vOAkWs z&aqzkal`_4ugYJPX6YdCwm2p}nL2v$lF)auY%k}xND}`VtJ(TAS`*fqIdOrMln!aX zL6U_rs7aMIr!%%aMx)~$0e4y6XhT%Jj#KO|A$7eN;|CBk2vYcMhUDlxxl{X*J!Os? 
z&v?Dc;%swT8FVB@Czo9y&z396qq6Boe1Bv{&4@voNkuWz{+gRxAYg=I@;_P(yz}fFNT7Gj-MSL>7@8bC z7dsTl84;iKBVI9a4YcY#Dv@Fei~nN%jaG@rCPbIqdbvMXR~Xmqi=2o6Y$p2Ds9VGBiU-58Vy_GhRUNcYS{UweDS)b=S;0@B5x}_I~!WpS?G| zK-Rlb980AcY#86m?ZJ{1{pmreODzB|PBae#>ongxl@A@j67d?E?!R7Z&$lXX2NA#2 z>Nzp?xAo$Gwrsy#OJRofim{X5Hx3q250Xc9;#ip&RzR!HJm>RUK_~L4$+S-xIE@DN zgK<=-oU23R%Y_#D5AF6CKEh*hRr&r3sG=nV?z%8+p#Tei0Y?gN_kNJOy6y60LbMHG^T5RpjCf(zO$#8V1tx zVQW+VJE#(McsHw65{-rPhF4h*y(U&5V0 zS)R5_lk06H8h)O>@SD>bRc*9ON$|^M>%Vk$PIw1KZTCS!;gHFqV023>YuDLbBJoEV z&W&1|rE09qQAIn6i+ok$;dsQm+z&z9bqTW7Oc;1NIy!!JqS?fjDnoR(fJyY5v-p=j zWtTr}@x-Iaz)dKQNdY{|PwLF7747opfRSRBKh11GP?w3axL@NAwcTh`U!~|GXjG$n za3ev7MxEF99Ij>Zph3Ivi>aiv-*VSP%l&yolmx(21gj#0!1Ap^gUzJ6Lnwofi_E)+ zPcCB^UB3ok1pMg-`(4w?63kR>lmG_jE<67aH|#2fW5NGss3km)vy&WMkQ&(aa-R76 zg#Mp+xPL^@7G)&OGbzjQ$l$uUg=v4U+vVs{8vpvhuDP}Kpw(5Gj6`W^B>o6!%NhiL0%bo_9Tk)>0r z`#54BJQIj<4WC9SHs<{xPRZe(cr2<2$m?BjzyT;&mQc(fe}4rZMm{t?D!^wjec(l8 zEoOwIoL)&>8RrD*4XxoJyt>&|gV<1>#X^bm`Osy;*2()$fatGsUyb041kf?hNff`{ z<;)*tXeEba&}GM1p<1SjR-hgvZvrhjnl=ih@=}3|3S-&rAjR(9S%B+`ItZn(CNzCk zUJb}Tl{b?;6eVQfg!@3@W^b?SjxPLsxtaYi?NvBlD8HXn4~T+mvl2Cm7*_j*N9!kf;kA5;qDq!Cd$ey9h$<6 z8w)>Pq6A3*jO+cWvifDG2$~Z1H9V$D?3`_+c++s_SGLDmbY?rXORyRTj&k-Je#jpDx;IzzMaL}t^z}0xI=Sj3Y1l+S6)jcSk;IN5 zr*xTQSDt%159OcafG~LKTI(O=%!U}dWM1MJ832YY*uUnvJew%ycV4ld@g8n=Vslj3 zy?D=|&(lU)))B>X@FU{PQY#?hhP#nqQR0w=Hj_N6`RGyR>w1c`GTaa5(2VHJv1|AO zm`6Pkr-CabWjU>zdt?(?10#bsVh!eo4IP72XD^@u)G{;cxbf>iOX=K;@Ib2ypW{gX96cK9QkunAzQPpyQvBBCxtsDqUa4@N z{>EyEa;uNP`cN2Rqmj`fZJKf#Yy=E-&80c^PQ_wI?byn zVvV5Gk#c-vM#)%s!tGuhylp=ars-FF>PkP;k3JA}?8Z9`mv-4K@;uw~lX}CgV8;ND zC?j*$fs!xBYd9Imm1$RJ=w{!q=uNjTGqYpblg|w_#Y3dlSqs)}d^{!zakCbuzpS4QC7jSoX&jGCI z2UjX=MGlap3RZ0*M?pqLE^lpaHlgs%nr{$@<;|tmzs!@%;vMM0=|`CDEQ+PFHpWEK zfD9obwyzZDbE&cJMT(gJhZ6Zm?w!`BDppFYaDCk}ZGXsMV>@x{n#c-ee9qIzmFG1){7Q}6DB4`7w0VsToirPwqI@mO60>v1}A{E6Gw_o0mrQU?3FCea+>2?6f8k%o5KwBak>K?XSY( z#GCjM4@8!;}2O&6Br77Y8N{;orh}ZIHlB0mThf06_dQB zIgjP*IEcJ5g2m3dc2K8;qzhaLtSBp}a|&BehR#SaHfJ*UNVMc!hP6iWi<5s$IdAQ1 
z(wsZTF-T7eZPl&C32GR%3XxrHVH0E!yCSkqr@Z#E8n{j|xIhk2sy@N1;^}$L1sSOa z3KNu`^%;mz7uhAXDSi_vx`jEtMJnPnaJ zcI_iITca;nYNt(10f~mej9M)_5$=pB?x}_3_)Y7FfgC}H8FRd|%Y8l^WuI`@eoK13 z)hC+JY)*vN7;YvV#w7o<+(;ek>BySJrq9u<#bid~VMXs3-6$vh@cth=WQ(vvCpUDD z&?q-Pcu9J;>+S|Up~~1xL`#esJdsb)ovqI$h<6MUuNwY$$m6jzNLq8R{Lp%!Gp zo1gMEH>=(|Q|65_ZJD|&)OV`&xnSA&s8O%ob_X_zLPfpa?YaJ{NlQ(60SptvY)J=~ zY#tHib@>dxY{eJxn`Gm)I`g|x$|wS)U7*HB3i$6Y znLJ9}_YFr&W(KGBt6{Tr&=REnPGLIecJU3T{w0OXg>=l;!V44CkaDBEYm737)@=dL zP6?H(oNS~=fjJ^dEJse3TpdcJhPXd`M9xXID6H=N|+GEvZI?ahN~;Fc7-ud zwe3wB0wBdvCjXB`i+u=irK96L>etz6**Wu!zk5f2qNjvdA6bzM>Lz6sshDaj$Lv_? z(MM2Tp1*5I?g&Ku9D#^EZ;V&V1&0YZhyO9Vgv{06GC}DhNx^BttTSUPc&xUpv<7=> z9Is9SP>3B_Q?)%20w~kCC!`;ra_Xn{YG`2}OA_mvZ4P8_PPPZ@SQd_ooi>Onl(=ib z&KCYf>J>E*?Jb{Yc5{(suI4eGZW{+W8Io{&6Ra_W=i~1*o%G7Ykg2$PBv1@jNk48Z{j^ZK%LtxLqXs%)pcP zDUPzC1X&7q=cR}X^;+&8>c#9K32hd^L*$hSq6qKJNv2ZA_nbtM0> zGJmQqouct2`!eNRDDHNmU#k z9>+&0a_8{_AxW4FYh7P=n(Ei#LRWoxPPEaZrk0kD{Rj0C%t9^wA8EDeRSi=PeEksnK=Z6xHI|b^4>KcR8H;c zg?0dRD&yL4*~q`=$E_Y=K=Jv0TU1WI4rl|buA3`FN6LC#6Febi`o`etVA^qsxL9B0YI z5d+ir#hq~p>EE$EvK~BpEGz>1uox&7|DEL3FIE^-8nraU|DN(dpc?Cv>j)oY^LBb_q2J)b>s%POx9_-)wt>IF1UjWVY5HDn2hG=7dANTebUC3u} zjE`9Qt}?3|fek;%{S1s2+N)XM)}m#cyDyi7x88X=N+xRnHQ^_apm+MFPYp2SpmcAc@C!H2#B&QlPHU{9qEW{4rekRO855j)Wdtnsql1x|a zO_>@yK+4mlq~Tzvvp6sBICXrVaZC5Tyz;#|@4)kdCKR|CWy%#Z zTS>2EVU-aLT{2;_i#?4o?-gjJv+-%;PZwc~8AptO6?7Be@3SU;+{zf1hsj~mq6}2F zZhiaD8~HBMIcCHe$ai;5Xp}YT%;-7vLVg5UiZ*+SR#4%EqG; z$|u3q{)wm;5w3XE_}TvtNE^l|C2+5eCWxc=|-E_-B;pxZ0EhK?7eeX0)G7$-i$ z=85mJOwzvzf3NN^B~%7cGVUj|`=>(+r5PBJ=I&5|R>0O<(C7E_&Xz2MB8Benx~2Oi zTXte*ksI(#sO#3C$EUssQs{NF2h!c>pO(2smKy#ODzXPMz;h?@$~efSRi`l^_~sNF7~c&_f=fpo*zABjt! 
zR}qS$ zc!V}zuZLRX&%+!nnb03a4B-+<8FJ2*^{FE*9A$nel&c{KM7NT#K z1ClE#mZQ8y`%z>tn^!o13bwB{%`@|-S7$w5B;UG}297A=Hlw^h(>0`g>YT*xpS7TB@n9@0tG>gKkXGt z7HUAqMd#x_0$s#9LZHYy7_D}dr^mr(wtF}8;W{g|atst_d3))_5R+DzxALy6V(!=+ zf7Yq9d*f4L*oO?6niU{;(@VGjO;|HLy=zoHlP=xNGLLm-rr5YkcR6$JMP{jLs|T59 zM7yr05*C~E;8wfm9G1uTN&GYv-+vKDkHc5#T#r!tyg0Y(haL#VLPh*lADet4S170l zn#If!84Ld#d3{Xtia|jby8>heR|-=W!5{al4xcU{vyQYSZd_`+(*ixUz}l_?+iA5U_XbjwMGp=S3oe-mx746Pg)wiU0}@ zqj}aXJtzMRPSXHXw=th(iQl9@vbG&7vbCMG$vNz%KsKGV z96U-5uZ2~912p-2e&ug&4>84qMLyMx^o@E!yPaU#Rt4NDC8(J+Qr5dCgz^Siq~+GNJTMWm7sf-| zVb~g)R}88U2FOG!NJJ_40X@a$gkd7|qc#-C^V&lmzJEa`@C5J{B_`w9uVe_jn*{}X zCLiUrJQ?cNGwT1DDOaGdo&7V_6En8O)U0sAw8z5c5R4v`qHbl|M!~BkVgq z3kB}^-SniN+d)5Azt9gqAX?%Lg#PHUsvLFs&KHXHYkBgb5 zWsx~02Nltj|V{LWNaG15CrSstxx@$@E%)r38xZ_fmFBe6WKLgwWjv*q0HzUgkx zMF;>Z73ud`pUCcbf$RZ2XpuMx%>Lv2+nf0$G~C;e%)`SA`3UwxSd`bhIgHMQ{%Q$$ z>G?!9LVHW{O8WfH3)TvvJ^CTF{TE%40Vv{=hVSuN-5Sf`20Z|BV$TF3LlX;BUlN6C zOHMWPvDQ?X9=!2(XE+N(3}685p6?SGWTuR2!k_ zg9+ze#H`#Fx^R_xy498r5Q~Aji7}o*+su?7B^QRUyYPpg?3W#qe$uoex4Mf&SJv=B z(u%XKc%l`mB;bSZmIoZkegI@L)N~xPszjo#^XdQXna}OjLUJ7t=~j7 z@N97B$rpN)^YG`?LywuFdt@muurjDKa0Z*YOXD_x&k0PrlU6t4R*E@XBH2x zD)AH~b#DW5<7TEhP~$dWL<109#GC=_aApylU4MgEsjG~{7*Q^CLr!ZrcxV6%M%M&NAQc-Z1kJIP4?al%6&-deqxQ zOwO1cskHhm8zRmcEdMmA#40{^Q12bt`BLV*oS)S_rs!frmN+1Llw0kMai_1yWDI&Q zz(Oi>#mjkl1%WVP1Wl9+8SQPUmn4A9bwv-GTE!UZ2Y_?JjGGjL64AWjMPh$s7H9_TsCZF9i9LotN-bl-S*dwVZ zA&VV%j$&`3Y!F{@yO9!8pwfn z69eM`sV8}aB(goq=BV+0eWLn7iXfRO+_5#oE(c$2px*F|@4$~?k#)>;NWz9SDbJ4P zazo$RcmXy)pN(>ukqn5if>R*wfUa|EnP=Pe)jglmVlocX&XkHF1N)@e zb-^ndfiWeU#D!Jpq8o&dbSwhOlq+EVV$l0r5@x@Wht+|Y0bK(b7IVNM$ zD+~{_>-FILRd465ayV1j*Y>|ComEz6kr}=FIucTSosDxju zKYpDjY)c^-`CQTkmE{foPbWLd3pmVhes(1XevZ=X3L5*luxj~LA0{egCSL7zX*DTrKZ0b-2`!pg!_;dM^Q zv0}i(W#zSKa=S{jl%bVmW&RkkV|hV)6;5p+gUws3`z885GFsFf>-I~m^=WPt&l1WC zi9or+)g0(#=ZAtUt0kKpWj4E0y);f=ekn_i@^wveoZZ*ii!m(I>hqVAZ&o%+tUu#o znh~VftbF^}_)PQX!q~HzSP7$T0rCuo*BSKoq&2d~)V&9M*+S+o$BH-zqt!@r;Op<} z?!5$APATcRmN`k>Eq>S8Kvus@wmrDe^)G4w{y=(rzMN>i`RIIJj 
z38LTI7nYPo$WIhi2U+Juk`s(`ACz1I=w3oO5tr;&vg<%jls+dUWh@|i!!g?`rIAWG zQ2w7pfAq1Z&rTA0-*Cly54f}g5jfV2($kr#-3QcKzV^OFvZ#x&#q>SfQCWCy^!7!9 zASO}ujfzMN_R;W86tvqP?kkeA&|gH(w$Cy%R@Yxz?8bF~pQ2Q3F>R#wRQ>o=(KXMs zcR;NdQP7e)nBvf%U^bk~5ydt8 z_}0t9{H*m@w)GG7nl3*g^b>7Gt4iYzZcHLuJa-k5GQ-|ssSqMls>Pobd+G~awc1TD zZ}*#jGqILF5KTXL?B40o9?~p3k*Tc0V$iNTR2g_zdYSpz=X3j=PlcDEDHOVO*!&Tb zh1NO-fjkvAWl9uHrO)iv9``_;vdoXjr#FO>OLbQbv^grIYdb4~m0dk? ziDe+IYckbfUjPzN|KQ-=0%-iOom6`1y^OJf=-AaB^59VRFx57ocgI>t+B=1wO|Dof z&TW*xuinJR=?YOY$(?*bG|B9?8N3B+D`4NA1)7Iy&w(Lp&ApqMkUFGr*SixRGca%k z7spmM-MUVye%w`5px81gz+AoZuuh2h52hNj36xV0ZfF+ES#@_mvDplR9v7qdxO{da zY`Zj46EsO22#$JjuV1AC<`6Y1T|Dn1Aa7UX*r+=YvHmhepc^lIC^|?v8P?}=ok1%z zsL^&E@??GY&Fg;j98=bn{qAGK)4;NHj6J0vBcx@W;&bx@)EYuKH5pDv`-;VeYc8GG zqktZ9kINAEdHvJlwK0r4b8H#tPmIdeXKO#n!1A3ITPJ5%HXpSgy;hNk=1Bw6jK`+W zS*4HIf?HjIxPR$bCMjKOwRc$(#<)IWA$bL>t|8fhRgSj1-%*W_hmqtn0Gj@B0gnDv zG1vlQBnCun*hDk?k)?US?HJSt*`Jm5;^_-nw?W`Pq;C^~D7)lT z{_f5MX+Q+3`vph*c4INoHrd3=)SSf_ZU$>4UuCr7e+ru0aWtVA09z*g;OZ1mG)C9h}B!4q~Q!dmG zMPYmA5mo(?qf65)>OPQ@MjXZ+o_>uJbnaoEP#5}*L!vyWva~*#T#2wj0`T87+0KB9 z@@UXHu|TDb9AO(`<*h)OT(5>ojPsRomPJ2thcl6!O;A=i(rH!1vEiCL%<4bYKnagR z*8*9Ax!Dd-!Kz`|m;S?)PuGEKN|D!X8+s6be$PVo7}!=BqQwc7B0gyUSgw%&k4DU)v11Iim=guZ ziVnF4i5y$-0VG7?TfumUBC3q(URF56f#2<5+@4y`{kok3s1~Q#bVb0+1vQvMYr0d4 zu3O{uSgT~Zc{J9-8>@!7Oi4U-+O~=L-crS8A9UQC(!{#yZ-3}rr@DP*q?!>%YV#MX z14u%MHq2q{4vJc3krC03&4_gbxzv_j8fGs$6eXLXuAZ1VDW`VWBFvMbNuo z`P{e`hd^$94HSE7_+udSjJwxK5&oTg`2CDxmY~ynvmH=*U7#kNk3o3MNV;>5hMQo-=!GoNGC-082M&beSdTCo_PMWBM#CZPUw7QnB&^Jf>D8@)J;gW?4# zSX^UyPKbVV&=X~f5+DJl&n;c1k)Rp$aY)mlaPU14svgn(9Wqm-|}G3C#ObfoM9>RrxXg<-92i7#yVcl27k z@nZkSQ~<6)chfriwiFk$qC%QP_@amSmCBeh2~nA5g$9;oqrhwI4m_*-dD*AG*mK=_ zApqQ1%A2h_EsWMSGJ}flS}eCo-q@8LxA&`J7F{QhYxV&$U}&6MKhl%bUt-D-d_T1- z=OV$Nn=Z%1C?ia2lzlt2>3Iq-CxV~EzSADyCl2z8Oah7*v4A%sUm zAyGKU8=75hArv{Cgp*trd1;GzGSM?~^eBi|*sid2H=^Ld3A zNXovzx>=zX%Hr@6wu_MOfy~ul&;>GyT6CD$8vsaRUZa~&e@`adUY76`4OYUaN8Imf zz~JYPpAA((Qu|_k5)ZjRogm4WTcu^aXa)SV)Xfi38OJHsjGH44%1C%EJm@!fC5H@j 
z2ed!IpTh{wG}9LhoQD;1oG0AYEe?SO`xE^AU*!kCbH_^EV!ftXn)#N^<6Q{87OpYkE6}`RHk_s4<;2i- zz`2<-wecBI^U%=H!mO9EG)k`hU~b6O1^F|$aULuGJV3NBcV*4my?2idT7O!^tZF~ec~RZ9WWrORG6j?cb07^=Y`3H$ z?_+I50pd)4+$h=!`UM%$@uhu~${f0$3BpDwUvf8IKd&_XETGuI%A} zRFCPKg?=1=&KB+2J5OG0j-ZCv`r{7Y-9ypo< z*)-G7iS?6olJDz(lE+p{%fhl3R*ZW)V8sS~ZZ*{i9gC8gqM|Q!i3u($X(p>Mj`e0} z>W?3Jo8+aPoN(k`8XT;YwXXVLQ*%u8}tB&p*&N)QUj{9lEc(tSC+C<%heap!*g6*vHHSjUlr~A}LiLo5G+7@hv zJ<-abs3(NT);MfI$K@GFa48qp9n|tA9^a5yF!tE)o$oR65@JC+*&b3RImzG4o02G7 zipEmRki1&7hX{2+yP-|&Gf)b~Avx!89q4A4BjV664w(=O<_`VP2%VqqJSg)3%sdka zKSP16QOz<%@tog1(+~i^wDYl{Z%>Y0hR)8!5D{O!Y<#4k=W!!+)mauFgnf;+*OQ*e zLOt&r*q0=$#AY^fqZW|r(Z|R1c0BDg00IWtrkr2@^mV6DJB&ipx1!J$+;fR7PE{Zm zOzKkl2(x|=LX;StFV&pM4e~gr2ZJJcq6@Nd;f1)u_I4cq>PbrO+UYhjvDEJ~!1gXu z`^@ZXO*h?%_R2mm9sWpX1MDROVT|&Z8NqWg&`(*rWEG??o_KovMWVz3qm{57v;+e4 zaj^yq%w6cfE~tmMzGor|sssP^9B(VVs-xG>;$3amd)zzEC+b~wl%6Kg&q~H~gr~lY zo%cMCr71If@C6mg3cv5z=WFqSy*a9Lii(<*zFys>^ARdWM<`tFdkr)wBG;pY`6G4i z7m!o>xM=0q0fW}+p>iDWnHp8)ioxTe+uheYDJw= zPSsDdnGpj2(1*qrE}ckR1x*){gvE6Q)miMRp@zRlQ%?vMNzSl_X*Q`C=IL`91pFBAq1oSbvE_E zsavfB79BYl=Z;Lck&8K;X+l}=5IODG(O7JOe_6|6{ON3B^R|em2vgI@mfdDP(^cj; znVVtx!wi~{^)>%E9YK@boqX?#Yp|7(GS`*duhN=1&kT%E_h!y74&~YOU6cacF!Qq! 
z?1l9M{Jp6qPj!PfEeLyI~m^1zdM`#bc2nZS7d338FBq_n>bJ$~=6Pk3C_ zGa+3oxsYi!P`P9BG}YnTQPR^(gyiLAfee_Hy9T+_q4m!zEYe@_J_qY#oEh4G8&Puz z55W1yr3%=F4{Y35(a#s&iZ$_d&YDVoAA0za`Ab#0S9=35x<$yd~0;_9HIDcTQi(aAI7#t)h?XrQJf10J!v{Gasx6=uSYw;Pl&{!X;s-^KVdrbyds5<6zE;bGNp8?YF+ZaWeU6?&X6y3RnMZtljgc8L(`3w%j^C37 z#1JyP*n_ZDV!_L6O-Xq(isX52N|6(&R@lAY0CJFQt`bk%ngW0Z*J>OS1Z)BKN#?t> zWQDYS5leZe5-ft^!*Pf^CtZ#G8yzOXE(^M!u4idPWTC1sW+%6O`A;XZANX80`V<%< z3SJ=j%{r>!Iq<@|A9KDSWDtlv^sFcwIYx+<05aYh`ebLjSN{dnZd+7&F4WC-MJlow zbR+bX;k6I9Bthxq%IbI18sJA}Pufeigm%M07m1F95$1D5cNf#+O++0(?Q1^^^2gD3 zm6m#GcXR0#R&=sz<`W?Zm3!<{axCn>I)qsxvWK5bsPRFcfO^DJ*86r{@m+{_s`*OD z!rB)K&)^nDc<{FeT${VQ4%2GPY*&Yz^#&mEp%#j;x)uCUW<_iVRzh6lUQm(UDmk4E z3_umU;|f4*C?PqdL{^VRYz7?fJycHPy}hnpuAA zz{@GM)9sH#mSB_(Dw@84)HA!_xtmK8LEbAqI$z;?ms{W(@;L&j79fL#1V}aK4qW+j zK-j8jRM==>xu(z2PNPX;?O#|6NP=;Y&1myNr0qQV$V~Hac2l*7_F;PTfwwQu?gkf9 z-lU*n=7Npn>IOQnUe*eDJmTPpp*{<}iqDwoZV{2v>fGr5f{1cQ?mIU^>z#7SvK|&F zKcW#3U_2}OqJ+2P!CIo|9dqc9k<4DzA=w z@SVP+(~_8-((|qy1(0Z~6uZvMT?0TvN$h7|89-q7wl}0+5$(`OMif<>(A5Ln!Y+9OT zy}q}rVGE=oT4TGQ5;$Jv+`9Iw4uQbTk+@%+u5UB(GpX5uf3@A7 z71u>6gT&TKDoN-a?^*l<;C4>{%%8vJ;>j^~=6hxC*6Y5~)b)qx0bEenk1d0I6&qn8Yq*0=pwAWmVbF!HT8Jrf}s_CzsR3s)_{HEqVDJ?nKPmW9Rv}T z(_O?YS63%N2HjIG+A&qi_p(NePM3hin@**+4*_o(=N&cnqD!bU5e*h|ZoN-?~5g0?=bod}=H?z;LKyjJ;7CL5U|qM;>7(UWKM;T=GIz z{c#n5_z|R(1h8J(KV$o^ngoLXhxWEo2vmRp zE=Com<_P#le`OC?s=U=Iu_I^P9P3x!&DKrV&x`WbNV~{YB`2ERv$U z{U4xa`S6y_L200cfTmp+;xvQA$hfHt)|EzngBpB^&xAc16SUt_X-LD|E#;`zp;Ht` zUf7pd88`XQBcgGM=w66HjMhqf)57I}bPg&a@I7=`vS`Pj`QdqT=nk(YcqD^g1`Q0@aAc0&{u%Ctc+iuU2ZIZyFpDb zd)uqVv}Ki0?9ZP;M@>NH9x?HOC)kSDSGfEj;w6>}nVaWgW%cXi$JOL=_ujQIlWbfI z;ve;HziR!rc0Z}i2Jj6vxw>0{VZxAEn+#Pu#4?e6H0k*Qw)$m$1DIxPZ_wx}4I$e? 
zWd&I?1i~C5W=^^KM6H}qft*#x7w2Sb)&aCl;XCO-8L3S__0z`%fy$U>r!t0ov0pH` zYeWOnaDiaJo}sk1U(kylF$rB0N6o!U&vd74?q-SJ$IDN6EntEUFQS98I;UMPz%e-~ zlkLKygGvTcSj0a`+VyDJ?-YnFQS}1FE{Lsju|dSWfdsmJ>$W7DVnnu<%4p!yR2yUa z9}}#b;x*_I`|^2mLQvEPz=)3_%DDC<+x~uHp@wi2L?H;Mh8ZHm)A?whHt?F^@>jgX zBfnYXu>eOumJGzu6zhIDmqL>-G(Nob(CeGnqPnEL6tWCKb+>l~#10PAZ=xP)9zd@n^g&xB%mWQj>8YbylwC*~m|At8;;{Q5<2R)$c9<@H27A@*(Jv6GB6`ZErjRQ2WmZ4s54R4A6ae0|xWJoBfD({?A#*fK z_}31;hNMa`3P}arTI3TN6EHyv3{+=-e!Aqx=N=`7L z2ZK?CmdS(e$$+xK3Ho$e%Yr}0?O??0m#+b_Su2woCX@x+LGKgzXY;LT?qAEx%kNT*#jiG* z6QU1^*7EA&2oOlUk>BEeF0`(~4=ze2BlGwEvQ7mVfE3 zY^y(OU`^NmEu~ZTaG3%=0;a(7cWcAims9^hQ1O&DF%j)Y=WAnePF~S_ylT$TjAFQa z5dh5bpc?p!iir4B!YfG->AEQ_q{t`G?frdKPbyS1X&2yLdp0O(FTe5ooa;Q~*?b!wyaUdi%sktQAXH2_O;-te`aljsjnGfOR3&iJ8XLBB5%&3*!>- zf*mG%>%aj%U6@ zEMtyHv)p;|`foRT4e9z-CsI+HV%mZ$wZPO}WFyOd#Rq_{Vjt`z6nh2|RQVYC#WYDy z%tna)Z#g(~RVMTM1~jF0mHMe{_gYn$~;L^F(^y3V$#I2<~CXo)lbk`07DFEzXz#lCjwjwo3P%uhW0P0_D^VWY8q|XWm;^SajWy=PFG0ZO#~n*cu{%5Cg31mu%JFva{s8LK<+yF$M4esNQ_F!0 z_<~2S^AeMw;B{<`%i&;Osru%rqC4ld{%LM*b|&FfcnDgh{{qe6f1L!<2zmWI`r@)Z zyD&;M%C06eLuwX&I9(>^?OH=#bTZdRV8j}&4C-Iq<(c0Jfx*#CZhP9>q`{tMA50mZ zU0$zGG)Y>{s_0dmj<&YI8`ea9(2QQLTBzt5Q=A!p@F>+|H#>v$wI6;7mMIpHzW z4GBN%(K+$hR!lrvLU#Q2;ZtG+%J zOFo4OO#kyfZc$o`Cv}eP#H4d;6b#9J#2coi%v>06(@N@g;CKcZugMhn^Sy0?19;+@ z!}d-=UAfb?&Ivqh9yRee+4)EC3~|H{;MG$}Wppf-RVeEtRyGq%F6p0{N#0 z8~@J}{q=s6FuWOMw`)fVle7ur?^)u5c@QHp+(8YdWf*!ht$s%MP)U7@zh_1M#5m~` z1lswM$&;D*FpsJC_uib%jwEufpEn%$Gnyjij)VIEa|wEor=PS1^Yx5p+x&hfYmb@O1vdlk`CW}=3-NBKLpAhu6GPjCovJh3 zS%{2`#ASJ;g%0kMSc!5uY2sq@okw|Lero;Tb~Oh4a?AfdOWOpCM;!b_=nKvX_OP-f zZPll(!B6_;@5U~h={q&MTs$=5F-i&FE-z1-pPl7+4xMZZ)(+IJA5RR$oz3EM-a8!$ z21LvA4a%vFz`iXiV7~b(TFUBG{XTa3Xd>i*3q>s;S=cK6d|M4^^XtiRxlkAln zA(gj&)f~_1MazHz#`E_>zy#egb)Q%&iV73oLT%EAt-pWp|K5VG0J7F(I+4NPvcq6v zY;T6E^SY88)B0IvzR4&sFdBC0|MP2JLifC%4^4roxgYO7_Y#suR-3dgTo>N2yIGm^ zsz1ww>94o==K>$`H#i_%f+_o8?G|eTX~ytR=S_hQhz7S%B`wf@AF?X?f$bslZ3FXv>72 z9;5$bXMs1#gSq1ztKHAWh~pTMuzu|sM+ 
zFiQ{py8OLjTsV1k6mf#}_`HQHvTJi<2FL&QO)ALz&t1BKb2!`HQn`k6R|GsLflHl2%N2*f|iafX$9r%2m7KhZ~m~iRumOmnpwcfetb!>=2HjMan{^u7d zg{DFpl2MI3mPmChaoi@$xH9e_A(z~K1ZEH~zI!dMQt*4@lwvIV^kA%X&!3e4XRP4m zV;E8QZCki7V)ZPNNt7u+gD8d0lL#l7$uWJ{5q{PCzbBa^ET8qFg$wRI{zf=;lFdTDPxBVo2xZq}evD@EN9Z6Q&M zbEmebahd|cE+*aeetKPAKcDa}o8gqn3Pi1*nOLA3x#}Ww!qa!Fp=EMH2&B z-XIgAlf4yQn9!OyGq1Xjp+RzWM!4)f)@latSbp0$HnXz5{+HYM;{oL$4RuM{2j<-4 zLq$xfLu^46OUj;(!<0zK6*v6R;p8qy3$P3dzru;vMU@rnuV<1K8#MM#yueNBQXoG^ ziT90SwbySG8kS@4ona`Jym%)KmB@6KaWW(>fmyUhE0u7S2SP~%-m2Pehever*&Cuk=hcy%2SEeV)r#Ot;tXHne6@lS@X)rh&8A3w4qELdK;Ut{pp%_6*>VJ zYZk=4Q(jx62hne>FtI<7oQeOiZ)bty1co@H(BLoHJQ$r&SL_bw(TvE>Z;vt1`)lPw z(VN^<`Z;rljk^W(L#d)aHUy&JXc6-@P^NOG*cp7x!+>KaoA!J~rlqBo-RJAu$~B2; ztr^4^YNily5ilf~|KH}v8v|G$Hdaj{LF?cGGh4!)$va?9IgK3f6XvT8KFu4e@tz@m zUXzVtOnd@gp#n|i?&LWEVK5m+F?`~elp?y19X~5kl@9axvT!i{=kQC>_OVY5vAYes zd*BwuteF>XJw>8|=nK~ge)US4kSm^-eptBfG79!q;fOiKI{GHN&QVLGVmMf?e-ClP zEOLN|oq*_VRn_j2r-LK;MVg;W3WoVl;eH+_1n0v%LA_#AR(0ED$A?NKHK92Omsn{#_vxo6X`4Ns^@BN2kX<3eH^~V| z&)}@ux)yyAF~2AMV%&bTD!RzX`JU+KyFtT=RprDajyB;3d{9QDqm-BDMc(MbCgixEseoWUEM8fSL84Nf zB)lOMocWsgyCg(*_-2W_{^t^<|Jj5WrrQZ+^aJ&%FWOl5uI3*9#{-6;=7yCT4HfTq zE{3~i`2+bXIoh#JJJf|I=^ltu72lqCzp{PM;XlknHusooKc=1J-w2&NA#pg`p z$cDO#d7oH$1TZ11?zwa3hNJhdm+X^GlHAC%Sj;n|kQz0=kAsY1y=I^`n~ z`MuxMZxQpfaT{!~7=A2H*ENW>){B(W8xu-qo+c3dXAn%IQ=!z1be(ICbd@eSJ*gVf zeGe8}?vv1uXOO=4?nER`JH|s}y9sN@54nD|cZJ8@Ex!s-|3(+yHYnxZkXt)7r(Pvd zei2o-xMbGntNoKEG-OU_faIXgvV1+yb(&ZYQhb!Bj%-Kp2H)2V|J^d47v^-u zSEN#ctUB{jxy`A=l*u+ zI3&zEAvdRRq#U|iyvgY2V$CRualZH zl?QdF1iC%(lb~{GsQ9I`Vw;Ek;(Qt(_ERMueykN}9Bp^*YuCey*H_Cq3vXW|Ed8t? 
z^=)VWucySx=b*Cq2v{tx6L>sdIed8!>9fPY%DB&+!%@4Ob@Nl^O>BIoeUQrzIv=*8kExl@%q<`>Dq*4Q^ET z_Rgo}eb&4|yRt}E>z__@`+>7bu+Nznw?nHSR9NfNUK+WdQbX5hY=p>3@cG<1xT0L` zh#f{L&d%_UUhn?!Po>TVhu+6xNh?`)hPwoB()&A>QtF2^A1R&=arKqQYAJKyGOi6- z#2x!`*XQ1g=y`NmOIK0DmL`pu#{RyC5)QI7*;+fHFy^3Vgsp?3 zWUeVGS~BXdZP@WBEH*FTwN`0i!QI^@^W&`<^nrcLvlv%4FuT~a8l@UbTC3Un)~lo% zi#Rp%(#!AncXpXteNxTGJMyp5`Tz5~&89Wm+EyM{49OAP-1=4LJU&K#NL8~e zebqk=Ue*59+E<{59KjNT?F?Q|L1AV3G+||-j&9i zJnD_(O)+Y5p>pvfhKH`DSQn$Or14-tPkt|^yG!iZ_g#0*6zB4o6D_3c3NHG*|I2>) zXIE7ff7|YoI^E#Xv6eAiha(tSQ`@zb;v=)*h3AHJ+m_gx%l-q(Dbeok7Sx0BZT(sI z$x;W-_CLx@ZOLeCPx@7(EaZAf>sNV@`!=b3&HiS!zMdZ1Txjt4!AHuyC@J^loNSh>SiLXYNZkfRys za=N(pOG2j!H zduAl3kFdR$PDi`V>)3ipqu z7~1aDqXk&gN3x^?eQ3i8g^N1VHrBX?OmdhO3G$-Cb;gFrZt)XB7j(Pki0J##(+hU} z8VI{@%EMJLntFdL?FcH$Cb{HhoGLAR^R#v#;O7 zu(nNELi$s8{zKhjQbG}>JYr4cUXqr|z2p}};g~p>FTeV_l}&TulyXf90T_mGGhsh* zp-e-C-EZOJyNpUR+g|Lg6!qm*Fz7R567h~CSr53R93W|>cQKlm&E;7M=*Go`ZX2h^ z_$2dq7a0Gtz9bKM6GHl9nc&qp^T`h}$s}UCs|NYV8>POMDBT*5A_~9o61Yg-UCU|% zdxQaO%DTO`sZPp$hpl^QRgNb+3GPVd%9IO`#VgYa+cdTZ6+MR0OjNVzBU>?;+s6Gs z;LvZJUB-Kozx%ATVT2`J4F;D$^54r0CX4K(N3qMi&Hj!{TP{d4mb658`=#^vtn#+$ zArPY!;il1(19hhCe#MM~3(9gSnm^|8ZWGbc(V?C8^c2@Werel*EPl=FPy+AgwWx_y zmSxHM-XMD%>2z3q?R4h8m*m557E6fx%glWtGF<~(0;Yjc5-OVbN<(cr7+<|hj@jSU zX{`C7EF+tFXR;IXzSD&Bj7h)D5bMHfkY;K2+0_!n4qu`|BJrb(yVwHFZWs5PCk!ec z3sSW)sRean5k1cfug}X4(5Biu3H@D~`#^^L*FXr>dEyGS?V$`wX|^_SHgd)`7~)dd&g3isJ0Uz9=T;=Jmvub5V$MOqHCgIP zMv;-0f;4Iojwm=5{~Z9$!M^IA&uu0Hce=B%Vu(<_HB^)f#;h0)jplob!-9!6J84~9 z;v~Q-$$MIxQR3qKpTdBvQd5``j#}=i1rR(hRhYo+X4IH5;YGpYJAZL!kHP^>yfzr; zqi@nIX|+%ZvgSd?Aj+ZG9XL|FZdXTC)a<3{!)^I<~Az zpS^>|!T`Gt26XC5q9ILl2714ssmn~0PT@NEOe=Q$SpX)P7#tE>{#+q8<=XM-%9^YN zuO=2wJt>_v*1dZaSdQY64UI!zMqRL}27hB{PskW+8I`*2?7NTocV=A#?M|Iu*ZT$MY@B(M))ObppVS;IM+QDD}J)dBxew=*3t+d~L`a^AJ`9 zicUEeCwjt`S?XI*(NnP0$}?8u`=4)kDp=t*cTunBTvX!SAUpHFID4~GjTy5fJ1i!{ zy0YmpT@@KDW6R4!Uo-v%o_|H~iVwtD4LS5M&mj?1%p%nW!`c=4rYlItj3Bx6wUs+kZnXNUxrRQrZF$iw_9G95}nA#FEu&j1HOCQZtYE}>#Z><{@SRK}#C 
z%O&$n{Cnny_%k@CFcBFe_MK@$*4cp&+~!&J7E|C_HrB`TA&t9w(_bO?o#%M%9(k>| z7j-W=HL*PV50PdtlCMZgMdbrFgtO|7&Nko_27qTUcd>3CEYQ1HJZCl)?*-V3japMo zs{{%j(BF6gNhM5RDfkWqLP)+rhM@y{$!Z1gmWH2#QC@1E^EZ z{B5d8&ya*!QYCWE80IlCv8#O}j__<*0k7NqUWxNh1|9GFc)`oVhxp)GGZ7R@g2zf103d1i z)4lMgi`htULGHM(ukR?_p!rA7o?u_s#AQe@2rolfpn7>%v=a)%!GZyNTi|w+`Q*$P zTFmalaS|Zp@6ZoIEZG4EW%5ViaY+tLNVbOob^5hf2`O`>C9o-^<#=!eWoE^}4$H&Q z!0ruE$g?KgHX#5P@~_{CG&a1N1n;nZ<)~5+=>~A|WYkmGcne=KUs0Z+JhbFmdgZuM}6g{JF zj%tI2^KQ?XGhcDX7Cnn;VMAI3Q6hVk!@-;#FUu0}KWmm%+iP=9DWZuA;I9FTEI0+` zq`}z2!n|;TF8)$GaTffU?Qd~&7JULgJRczOamH|L(ah3Uxv*^o#3D*=8 x#^IWRYl;aAOnl%*1=kc47MS?({}L6Wi;gJ|-!0L(aBmL$*|F7h3q{{H_`h`_ and transform tensors originating from these +frameworks to CuTe tensors. The present page documents the conventions, the API available to the +user, and provide example code snippets for common usage patterns. + +Implicit Conversion +------------------- + +Tensors originating from frameworks supporting the DLPack protocol can be directly provided to a +JIT function as a regular parameter. |DSL|'s runtime implicitly converts the original tensor to a +CuTe tensor with a fully dynamic layout except for the stride element corresponding to the leading +dimension. The example below demonstrates this use case. + +.. code-block:: python + + import torch + import cutlass.cute as cute + + @cute.jit + def foo(src): + """ + The following lines print + + ptr o (?,?,?):(?,?,1) + + """ + print(src) + print(type(src)) + + a = torch.randn(30, 20, 32, device="cpu") + foo(a) + + +Explicit conversion using ``from_dlpack`` +------------------------------------------ + +|DSL|'s runtime provides an interface for converting DLPack-compatible tensors to CuTe tensors, + +.. code-block:: python + + b = cute.runtime.from_dlpack(a) + +where ``a`` is a tensor supporting the DLPack protocol with the ``__dlpack__`` +and ``__dlpack_device__`` methods. The resulting CuTe tensor ``b`` has a fully static layout. 
This +conversion is performed without copying any tensor data, enabling seamless integration with major +frameworks. Users can create tensors using NumPy, PyTorch, etc. and directly feed them into JIT +functions writtnen using |DSL|. + +The resulting CuTe tensor shares the same underlying memory buffer as the original tensor. This +zero-copy approach maximizes performance by eliminating unnecessary data duplication. However, it is +important to note that the CuTe tensor's validity is tied to the lifetime of the original tensor. If +the source tensor is destroyed or goes out of scope, the corresponding CuTe tensor becomes invalid +since it references the original memory location. + +The full signature of from_dlpack is as follows: + +.. code-block:: python + + def from_dlpack(tensor, assumed_align=None): + +The ``assumed_align`` integer parameter specifies the alignment of the tensor in unit of bytes. +The tensor's base address must be divisible by ``assumed_align``. When not provided explicitly, +the alignment is set to the natural alignment of the tensor's element type. Note that the alignment +information is part of the pointer type in the generated IR. Therefore, programs with different +alignments have a different IR and identical IRs are required for hitting the kernel caching +mechanism of |DSL|. + +Code Example +~~~~~~~~~~~~ + +The following code demonstrates how to convert a PyTorch tensor to a CuTe tensor using the +``from_dlpack`` function with default parameters. + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + x = torch.randn(30, 20, device="cpu") + y = from_dlpack(x) + +Once converted, we can access the tensor's information through various +attributes. 
The following list shows the attributes of the converted tensor: + +- ``tensor.shape``: the tensor's shape +- ``tensor.stride``: the tensor's stride +- ``tensor.memspace``: the tensor's memory space +- ``tensor.element_type``: the tensor's element data type + +.. code-block:: python + + import torch + import cutlass + from cutlass.cute.runtime import from_dlpack + + x = torch.randn(30, 20, device="cpu") + y = from_dlpack(x) + + print(y.shape) # (30, 20) + print(y.stride) # (20, 1) + print(y.memspace) # generic (if torch tensor in on device memory, memspace will be gmem) + print(y.element_type) # Float32 + print(y) # Tensor<0x000000000875f580@generic o (30, 20):(20, 1)> + +The string format of the resulting CuTe tensor is + +.. code-block:: + + Tensor<0x{tensor.data_ptr:016x}@{tensor.memspace} o {tensor.shape}:{tensor.stride}> + +As can be seen in the example above, ``from_dlpack`` first results in a tensor with a static layout. +To obtain dynamic or mixed static/dynamic layouts after calling ``from_dlpack``, the +``mark_layout_dynamic`` and ``mark_compact_shape_dynamic`` functions are used and described in +the following sections. + +When to Use Explicit Conversion? +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The DLPack protocol is a widely used protocol for interoperability between different frameworks. +However, there is some associated overhead. Based on our benchmark, it usually takes between 2 to 3 +us per call to ``from_dlpack``. + +Explicit conversion allows for caching the converted CuTe tensors in order to avoid the overhead of +repeated calls to ``from_dlpack``. + +.. code-block:: python + + x = torch.randn(30, 20, device="cpu") + if key not in cached_tensors: + # Do the conversion only for cache misses + cached_tensors[key] = cute.runtime.from_dlpack(x) + foo(cached_tensors[key]) + +Another use case for explicit conversion is to gain fine-grain control over which modes of a tensor +are considered dynamic from the perspective of the generated program. 
+ +Mark the Tensor's Layout as Dynamic with ``mark_layout_dynamic`` +---------------------------------------------------------------- + +After calling this function, all shape modes become dynamic. The stride modes also become dynamic +with the following two exceptions: + +1. the leading dimension's stride remains fixed at 1; +2. stride elements equal to 0 (which indicates broadcasting) are retained. + +The full signature of ``mark_layout_dynamic`` is as follows: + +.. code-block:: python + + def mark_layout_dynamic(self, leading_dim: int|None = None): + +The ``leading_dim`` parameter specifies the leading dimension of the tensor. The leading dimension's +stride is set to 1 unless inconsistent with the layout of the DLPack tensor. For example, + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, if ``leading_dim`` is specified to be 1, + the layout will be marked as ``(?,?,?,?):(?,1,?,?)``. +- If ``leading_dim`` is specified to be 0, a deduction failure error is raised because the stride of + dimension 0 is 2 (not 1). + +The default value for ``leading_dim`` is ``None``. In such case, the system +automatically deduces it from the tensor's layout using the following logic: + +1. If a dimension's stride is 1, that dimension is marked as the leading dimension. +2. If multiple dimensions satisfy condition 1, an error is thrown indicating deduction failure. + Note that after converting a **PyTorch** tensor to the DLPack format, the stride for dimensions + with size 1 are canonicalized to 1. This canonicalization can increase the likelihood of + deduction failures. This behavior is specific to PyTorch and does not occur with NumPy for + example. +3. If no dimension satisfies condition 1, all strides are marked as dynamic. + +For example: + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, the leading dimension is 1. + The layout will be marked as ``(?,?,?,?):(?,1,?,?)``. 
+- For a tensor with layout ``(1,5,1):(1,1,1)``, if ``leading_dim`` is not specified, + a deduction failure error is raised. +- For a tensor with layout ``(2,2):(8,2)``, since no dimension has stride 1, + all dimensions are marked as dynamic: ``(?,?):(?,?)``. + +Code Example +~~~~~~~~~~~~ + +The following example demonstrates how to use ``mark_layout_dynamic`` to specify dynamic tensor layouts. + +* ``t0`` shows the usage of ``mark_layout_dynamic`` with unspecified ``leading_dim`` and the automatic deduction of leading dimension. +* ``t1`` & ``t2`` shows the usage of ``mark_layout_dynamic`` with specified ``leading_dim``. +* ``t3`` shows the usage of ``mark_layout_dynamic`` with no leading dimension. +* ``t4`` shows the usage of ``mark_layout_dynamic`` with broadcasted dimensions. +* ``t5`` demonstrates the deduction failure when the there're more than one dimensions with stride equals to 1. +* ``t6`` & ``t7`` demonstrates incorrect settings for ``leading_dim`` and expected errors. + +.. code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + + # (8,4,16,2):(2,16,64,1) + a = torch.empty(16, 4, 8, 2).permute(2, 1, 0, 3) + # (1,4,1,32,1):(4,1,4,4,4) => torch tensor when dimension has shape 1, its stride is degenerated to 1, + # resulting in (1,4,1,32,1):(1,1,1,4,1) + b = torch.empty(32, 1, 1, 1, 4).permute(3, 4, 1, 0, 2) + # (2,2):(8,2) + c = torch.empty(3, 4)[::2, ::2] + # (3,1,1,5):(5,0,0,1) + d = torch.empty(3, 1, 1, 5).expand(3, 4, 2, 5) + + # auto deduce the leading dimension to be 3 + t0 = from_dlpack(a).mark_layout_dynamic() + print(t0) + # (?,?,?,?):(?,?,?,1) + + t1 = from_dlpack(b).mark_layout_dynamic(leading_dim=0) + print(t2) + # (?,?,?,?,?):(1,?,?,?,?) + + t2 = from_dlpack(b).mark_layout_dynamic(leading_dim=2) + print(t3) + # (?,?,?,?,?):(?,?,1,?,?) + + t3 = from_dlpack(c).mark_layout_dynamic() + print(t3) + # (?,?):(?,?) 
+ + t4 = from_dlpack(d).mark_layout_dynamic() + print(t4) + # (?,?,?,?):(?,0,0,1) + + t5 = from_dlpack(b).mark_layout_dynamic() + # Can't decude the leading dimension from layout, please specify the leading_dim explicitly. + + t6 = from_dlpack(a).mark_layout_dynamic(leading_dim=1) + # Expected strides[leading_dim] == 1, but got 16 + + t7 = from_dlpack(b).mark_layout_dynamic(leading_dim=3) + # Expected strides[leading_dim] == 1, but got 4 + +Mark the Tensor's Layout as Dynamic with ``mark_compact_shape_dynamic`` +----------------------------------------------------------------------- + +The ``mark_compact_shape_dynamic`` function provides fine-grain control over dynamic shapes for compact +layouts. The full signature of ``mark_compact_shape_dynamic`` is as follows: + +.. code-block:: python + + def mark_compact_shape_dynamic(self, mode: int, stride_order: tuple[int, ...]|None = None, divisibility: int = 1): + +The ``mode`` parameter determines which shape dimension becomes dynamic. After calling this function, +the specific shape dimension given by ``mode`` is marked as dynamic immediately. The stride will be +updated accordingly but this process is delayed until the C ABI of the tensor is constructed. +For modes that have a shape of size 1, their stride are canonicalized to 0. + +The ``stride_order`` parameter specifies the ordering of strides in the tensor. It is consistent +with ``torch.Tensor.dim_order()`` and defaults to ``None``. The parameter indicates the order of +modes (dimensions) if the current layout were to be converted to row-major order. It starts from the +outermost to the innermost dimension when reading it from left to right. This parameter must be +explicitly set when the stride order cannot be automatically deduced from the tensor's layout, such +as when multiple dimensions have a stride of 1. 
+ +For example: + +- Layout ``(4,2):(1,4)`` has a ``stride_order`` of ``(1,0)`` indicates the innermost dimension is + 0 (``4:1``), the outermost dimension is 1 (``2:4``). +- Layout ``(5,3,2,4):(3,1,15,30)`` has a ``stride_order`` of ``(3,2,0,1)`` indicates the innermost + dimension is 1 (``3:1``), the outermost dimension is 3 (``4:30``). + +If ``stride_order`` is not specified, the system automatically deduces it from the tensor's layout +using the following logic: + +1. Sort the strides in descending order. +2. If multiple dimensions have a stride of 1, a deduction failure error is raised. + +For example: + +- For a tensor with layout ``(2,2,3,4):(2,1,4,12)``, the deduced ``stride_order`` is ``[3,2,0,1]``. +- For a tensor with layout ``(1,5,1):(1,1,1)``, ``stride_order``'s deduction fails because + all dimensions have an identical stride of 1, making it impossible to determine the correct ordering. + +If ``stride_order`` is specified, the system validates that the order is consistent with the +tensor's layout. + +The ``divisibility`` parameter specifies the divisibility of the dynamic shape. It could be used to +represent the assumption alignment of the input. Defaults to 1. + +Note that this API is only available for compact tensors. For non-compact tensors, we can use +``cute.assume`` to attach divisibility information to a specific shape mode in a host JIT function, +as demonstrated in the following example: + +.. code-block:: python + + @cute.jit + def foo(a: cute.Tensor): + new_shape = a.shape + # use cute.assume to set shape of mode=0 with divisibility=16 + new_shape[0] = cute.assume(new_shape[0], 16) + new_layout = cute.make_layout(new_shape, stride=a.stride) + new_a = cute.make_tensor(a.iterator, new_layout) + + +Code Example +~~~~~~~~~~~~ + +The following example demonstrates how to use ``mark_compact_shape_dynamic`` to specify dynamic tensor layouts. 
+ +* ``t0`` & ``t1`` show the usage of ``mark_compact_shape_dynamic`` with unspecified ``stride_order`` and different ``mode`` and ``divisibility``. +* ``t2`` shows the usage of consecutive ``mark_compact_shape_dynamic`` with unspecified ``stride_order`` and different ``mode`` and ``divisibility``. +* ``t3`` & ``t4`` show the usage of ``mark_compact_shape_dynamic`` with different specified ``stride_order``. +* ``t5``, ``t6``, ``t7``, ``t8``, ``t9``, ``t10``, ``t11``, and ``t12`` demonstrate incorrect settings for parameters and expected errors. + +.. code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + + @cute.jit + def kernel(t: cute.Tensor): + pass + + # (8,4,16,2):(2,16,64,1) + a = torch.empty(16, 4, 8, 2).permute(2, 1, 0, 3) + # (1,4,1,32,1):(4,1,4,4,4) => torch tensor when dimension has shape 1, its stride is degenerated to 1, + # resulting in (1,4,1,32,1):(1,1,1,4,1) + # b.dim_order() is (3,2,4,0,1) + b = torch.empty(32, 1, 1, 1, 4).permute(3, 4, 1, 0, 2) + + # auto deduce the stride order to be [2,1,0,3] + t0 = from_dlpack(a).mark_compact_shape_dynamic( + mode=0, divisibility=2 + ) + kernel(t0) + # (?{div=2},4,16,2):(2,?{div=4},?{div=16},1) + print(t0) + + t1 = from_dlpack(a).mark_compact_shape_dynamic( + mode=1, divisibility=2 + ) + kernel(t1) + # (8,?{div=2},16,2):(2,16,?{div=32},1) + print(t1) + + t2 = from_dlpack(a).mark_compact_shape_dynamic( + mode=1, divisibility=2 + ).mark_compact_shape_dynamic( + mode=3, divisibility=2 + ) + kernel(t2) + # (8,?{div=2},16,?{div=2}):(?{div=2},?{div=16},?{div=32},1) + print(t2) + + t3 = from_dlpack(b).mark_compact_shape_dynamic( + mode=2, divisibility=1, stride_order=(3, 0, 2, 4, 1) + ) + kernel(t3) + # (1,4,?,32,1):(0,1,4,?{div=4},0) + print(t3) + + t4 = from_dlpack(b).mark_compact_shape_dynamic( + mode=2, divisibility=1, stride_order=(2, 3, 4, 0, 1) + ) + kernel(t4) + # (1,4,?,32,1):(0,1,128,4,0) + print(t4) + + t5 = t2.mark_compact_shape_dynamic( + mode=3, divisibility=5, 
stride_order=(0, 1, 2, 3) + ) + # The stride_order is not consistent with the last stride_order + + t6 = from_dlpack(a).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(0, 1, 2, 3) + ) + # The stride_order is not consistent with the deduced stride_order + + t7 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=4 + ) + # The layout could not be deduced, please specify the stride_order explicitly + + t8 = from_dlpack(b).mark_compact_shape_dynamic( + mode=30, divisibility=5, stride_order=(3, 0, 2, 4, 1) + ) + # Expected mode value to be in range [0, 5), but got 30 + + t9 = from_dlpack(b).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(2, 1, 2, 3, 4) + ) + # Expected stride_order to contain all the dimensions of the tensor, but it doesn't contain 0. + + t10 = from_dlpack(b).mark_compact_shape_dynamic( + mode=3, divisibility=5, stride_order=(0, 1, 2, 3, 4, 5) + ) + # Expected stride_order to have 5 elements, but got 6. + + t11 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=4, stride_order=b.dim_order() + ) + # The shape(1) of mode(0) is not divisible by the divisibility(4) + + t12 = from_dlpack(b).mark_compact_shape_dynamic( + mode=0, divisibility=1, stride_order=(2, 1, 3, 0, 4) + ) + # The stride_order is not consistent with the layout diff --git a/media/docs/pythonDSL/cute_dsl_general/notebooks.rst b/media/docs/pythonDSL/cute_dsl_general/notebooks.rst new file mode 100644 index 00000000..623c2ac3 --- /dev/null +++ b/media/docs/pythonDSL/cute_dsl_general/notebooks.rst @@ -0,0 +1,16 @@ +.. _notebooks: + +Educational Notebooks +===================== + +A number of notebooks for educational purposes are provided in the `CUTLASS GitHub repository `__. 
A list of handy links is given below:
+ +**What should I learn, CUTLASS C++ or the Python DSLs?** + + We believe the Python DSLs will significantly improve the learning curve and recommend starting + with them for all newcomers, as they eliminate the inherent complexity of learning C++ + metaprogramming for GPU kernel programming. Since CuTe C++ and CuTe DSL share fully isomorphic + programming models and patterns, any knowledge gained can eventually be applied to C++. + +**Where will the code live? PIP wheel or GitHub repo? Do I have to build it myself?** + + This is a major change compared to CUTLASS C++ and Python DSLs. Going forward, + the GitHub code only exists as a way for users to file issues and pull requests against. + While it can be used with the pip wheel, we do not recommend most users do so unless they are + hacking on the DSL itself. For all other users, we recommend they + simply ``pip install nvidia-cutlas-dsl`` and use the pip wheel as the single source + of truth for the dialect compiler and DSL implementation. CUTLASS GitHub repository will + contain a ``requirements.txt`` file pinning the version of the wheel consistent with the state + of the OSS repository (please see :doc:`quick_start`). This means getting started with + CUTLASS is easier than ever: no more CMake command lines to learn and no more builds to kick + off. Simply install the pip wheel and start running the examples. + +Migration +--------------------- + +**Should I port my code from C++ templates to Python?** + + Almost certainly not, unless you need extremely fast JIT times for your kernel and C++ compile times + are a blocker for you. The 2.x and 3.x APIs will continue to be supported, and Nvidia's Hopper and + Blackwell architectures 3.x will continue to improve in terms of features + and performance. + +**Are portability promises different with Python?** + + For the initial release while the DSL is still in beta, we do not promise any portability + as we may make changes to the DSL itself. 
While we do not expect any changes to the CuTe operations, + the DSL utilities, decorators, helper classes like pipelines and schedulers may change as we refine them + with community feedback. We encourage users to file issues and discussions on GitHub during this + beta period with their feedback! + + In the long term, we plan to continue to treat the OSS community with care. + Just like the prior history of CUTLASS, we plan not to break users unless necessary, + but we reserve the right to make limited breaking changes in case we believe it is a + net benefit to the community and project. These will be announced ahead of time and/or + clearly highlighted in the CHANGELOG of each release. + +Technical +--------------------- +**What NVIDIA architectures will it support?** + + CuTe DSL will support all NVIDIA GPU architectures starting with NVIDIA Ampere Architecture (SM80). + +**Will it be compatible with DL frameworks (e.g., PyTorch, JAX)?** + + Yes, we will provide utilities to convert from DLPack-supported tensor formats + to ``cute.Tensor``. This should allow a user to never have to leave Python + when writing model code in their framework of choice. Our JAX interoperability story is not + as strong as PyTorch's today, however, we are actively working on improving it + and welcome contributions in this space. + +**Does it compile to PTX or SASS?** + + CuTe DSL compiles the program down to PTX. After that, we currently use the PTX compiler that + ships with the CUDA toolkit to compile the PTX down to SASS. We plan to remove + this limitation in the future and allow the use of the PTX JIT that is included in the + CUDA driver in case a user does not have a CUDA toolkit installed. + +**Do I need to use NVCC or NVRTC?** + + No, the ``nvidia-cutlass-dsl`` wheel packages is everything needed to generate GPU kernels. It + shares the driver requirements of the 12.9 toolkit which can be found + `here `__. 
+ +**How would one debug the code?** + + Since CuTe DSL is not native python and an embedded DSL instead, tools like `pdb` + cannot be used. However, if you have experience with GPU kernel programming, the debugging + techniques will be nearly identical. Typically, compile time and runtime printing + of types and values are the most expedient. Please see `documentation on printing `__ + to learn how to print types and values at both compile time and runtime. + You can also use ``cuda-gdb`` to set breakpoints in the program and step through the execution + or use tools such as ``compute-sanitizer`` to detect and triage bugs in your program. As the DSL + matures, our source location tracking from Python user programs will also improve to provide + more helpful source-level mapping when setting breakpoints and using other tools such as nsight. + +**How would one implement warp specialization in CuTe DSL?** + + Exactly the same way you would in C++ but in a Python-native syntax instead. + Consult our :doc:`cute_dsl_general/dsl_control_flow` and + `"Blackwell kernel example" `__ + for a detailed how-to guide. + +**Can I call functions from other functions or use OOP?** + + Yes. We frequently call functions from one another and set up class + hierarchies to organize and modularize our code for pipelines and schedulers. + Consult the :doc:`cute_dsl_general/dsl_introduction` documentation or our examples for more details. + +License +--------------------- +**Q:What is the license for CuTe DSL and the associated GitHub samples?** + CuTe DSL components available `on Github `__ and via the nvidia-cutlass-dsl Python pip wheel + are released under the `"NVIDIA Software End User License Agreement (EULA)" `__. + Because the pip package includes a compiler that shares several components with the CUDA Toolkit, + it is subject to usage terms and restrictions similar to those of the CUDA SDK. Please refer to the EULA for specific terms of use. 
CuTe DSL samples and Jupyter notebooks, released
It is important to understand that CuTe DSL +does NOT implement the complete Python language semantics in its JIT compilation process. + +This section documents the current limitations of the CuTe DSL. While some of these limitations +may be addressed in future releases, developers should be aware of them when building applications with +the DSL. + +Notable unsupported features +---------------------------- + +- GeForce RTX 50 Series support +- RS WGMMA (The input matrix A comes from register and the input matrix B comes from shared memory) +- Programmatic Dependent Launch (PDL) +- narrow-precision data type support, including related tensor core instructions +- convolutions +- full support for ahead of time compilation +- preferred clusters +- CLC-based tile schedulers +- EVT support +- Windows support + +Programming Model +--------------------- + +**Python Native Data Types** + CuTe DSL supports Python data structures when used for "meta-programming," + but these structures cannot be treated as dynamic values modifiable at runtime. + For instance, lists and dictionaries can be used to configure kernel parameters + during compilation or serve as containers for dynamic values, + but their structure and organization cannot be altered during kernel execution. 
+ + - **Static Values:** + - Evaluated during JIT compilation phase + - Immutable after compilation completes + - Most Python native types (lists, tuples, dictionaries) are processed as static values + - Primarily utilized for "meta-programming" and configuration purposes + - Example: Lists can contain dynamic values but their structure cannot + be modified during kernel execution + + - **Dynamic Values:** + - Evaluated during runtime execution + - Modifiable during execution of JIT-compiled functions + - Only a specific subset of Python types are supported as dynamic values + - Primitive types are automatically converted when passed as function arguments: + - ``int`` → ``Int32`` (may be updated to ``Int64`` in future releases) + - ``bool`` → ``Bool`` + - ``float`` → ``Float32`` (may be updated to ``Float64`` in future releases) + + The JIT compiler processes Python native types analogously to C++ template parameters. + The compiled code cannot manipulate dynamic values of composite types + such as lists, tuples, or dictionaries. + + For example, the following code doesn't work as a traditional Python program inside a JIT function. + + .. code:: python + + @cute.jit + def foo(a: Float32, b: Float32, i: Int32, res: cute.Tensor): + xs = [a, b] + # indexing list with dynamic index is not supported in CuTe DSL: + res[0] = xs[i] + + if i == 0: + # This will always append Float32(3.0) to the list regardless + # of the runtime value of `i` + xs.append(Float32(3.0)) + + for i in range_dynamic(10): + # This only appends one element to the list at compile-time + # as the loop doesn't unroll at compile-time + xs.append(Float32(1.0)) + +**Python Function** + The DSL currently does not implement support for return values from Python functions, + although this capability is planned for future releases. + + Example: + + .. 
code:: python + + @cute.jit + def foo(): + return 1 # Currently unsupported in CuTe DSL + +**Expression or Statement with Dependent Type** + CuTe DSL implements static typing and does not support dependent types. + The type of each expression must be determinable during compile time, + in contrast to standard Python which implements dynamic typing. + + Example illustrating functionality in Python that is not supported in the DSL: + + .. code:: python + + # Valid in standard Python, but unsupported in CuTe DSL + max(int(1), float(2.0)) # => 2.0 : float + max(int(3), float(2.0)) # => 3 : int + + In CuTe DSL, types are promoted. For example: + + .. code:: python + + @cute.jit + def foo(a: Int32, b: Float32, res: cute.Tensor): + res[0] = max(a, b) # Type is automatically promoted to Float32 + + The following code using an inlined if-else expression with dependent types + is not supported in CuTe DSL: + + .. code:: python + + @cute.jit + def foo(cond: Boolean, a: Int32, b: Float32, res: cute.Tensor): + res[0] = a if cond else b + + +**Control Flow** + The DSL transforms Python control flow statements (``if``, ``for``, ``while``) + during Abstract Syntax Tree (AST) processing into structured control flow in MLIR + which has the same constraints as dependent types. For instance, + changing the type of a variable in a loop body is not allowed. + + - Variables must be defined prior to the control flow statement + - Type consistency must be maintained throughout the control flow statement + - Early exit or return from if-else statements is not supported + + Example illustrating functionality in Python that is not supported in the DSL: + + .. code:: python + + @cute.jit + def foo(): + a = Int32(1) + for i in range_dynamic(10): + a = Float32(2) # Changing type inside loop-body is not allowed in the DSL + +**Built-in Operators** + The DSL transforms built-in operators like ``and``, ``or``, ``max``, ``min``, etc. + into MLIR operations. They also follow the same constraints as dependent types. 
+ For instance, ``a and b`` requires ``a`` and ``b`` to be of the same type. + + Comparison like ``==`` on Sequence of dynamic values is known to not produce + expected result at runtime. + +**Object Oriented Programming** + The DSL is implemented on top of Python and supports Python's object-oriented programming (OOP) features + for meta-programming at compile-time. + + However, similar to other composed data types, the DSL provides limited support for OOP when objects + contain dynamic values. It is strongly recommended to avoid passing dynamic values between member methods + through class state in your code. + + The following example illustrates functionality in Python that is not supported in the DSL + without implementing the ``DynamicExpression`` protocol: + + .. code:: python + + class Foo: + def __init__(self, a: Int32): + self.a = a + + def set_a(self, i: Int32): + self.a = i + + def get_a(self): + return self.a + + @cute.jit + def foo(a: Int32, res: cute.Tensor): + foo = Foo(a) + for i in cutlass.range_dynamic(10): + foo.set_a(i) + + # This fails to compile because `a` is assigned a local value defined within the for-loop body + # and is not visible outside of the loop body + res[0] = foo.get_a() + + The example above fails to compile because ``Foo.a`` is assigned a local value defined within the for-loop body, + which is not visible outside the loop body. + + The CuTe DSL implements an internal mechanism that provides limited support for OOP patterns via protocol. + As the DSL continues to evolve to support additional features, this mechanism is subject to change + and is not recommended for direct use in users' code for better portability. + + +**CuTe Layout algebra in native Python** + Entirety of CuTe Layout algebra operations and APIs require JIT compilation. These + functionalities are exclusively available within JIT-compiled functions and cannot be + accessed in standard Python execution environments. 
+ + Additionally, there exists a restricted set of data types that can be passed as arguments + to JIT-compiled functions, which further constrains their usage in native Python contexts. + Only the following CuTe algebra types are supported as JIT function arguments: ``Tensor``, ``Pointer``, + ``Shape``, ``Stride``, ``Coord`` and ``IntTuple``. For ``Stride``, we don't support ``ScaledBasis`` + from native Python Context. Unfortunately, in the first release, we don't support + passing ``Layout`` under native Python Context. + + +Suggestions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For reliable and predictable results: + +- Avoid dependent types in your code +- Implement explicit type conversion for dynamic values +- Clearly distinguish between static (compile-time) and dynamic (runtime) values +- Use type annotations as much as possible to help JIT compiler + to identify type to avoid ambiguity + + +.. code:: python + + # Example demonstrating explicit typing + alpha = 1.0 # Explicitly defined as float using `1.0` instead of `1` + # or `float(1)` + beta = 2.0 # Explicitly defined as float + result = max(alpha, beta) # Will correctly perform float comparison + +**Debugging Capabilities** + Debugging tools and facilities for the Python DSL are currently more limited in comparison to the C++ + API. For instance, we don't support single-stepping through the JIT-compiled code. And lack of exception + handling in JIT-compiled code makes it hard to debug in some cases. + +**Integration with Frameworks** + Integration with certain deep learning frameworks is in early development stages and may have + limitations. For instance, converting a framework tensor to cute.Tensor is known to have overhead + of 2us~3us per tensor as we convert from the general DLPack protocol which offers compatibility with + all frameworks. 
+ +**Hashing DSL APIs and Objects** + DSL APIs and Objects are sensitive to MLIR context, region or other contextual information which has no meaning across + different contexts. Any stateful design relying on ``__hash__`` is likely to misbehave with unexpected results. An example is + ``functools.lru_cache``, which, combined with ``@cute.jit``, may cache an MLIR object from one context and use it in another one. + + +Future Improvements +--------------------- + +The CuTe DSL development team is actively addressing these limitations. +Upcoming releases will aim to: + +- Implement support for return values from JIT compiled functions +- Improve support for built-in operators to handle more cases without dependent types +- Enhance debugging capabilities and tools +- Improve error messages with precise diagnostic information +- Extend support for additional numeric data types +- Improve performance of converting framework tensor to ``cute.Tensor`` with native support + for different frameworks +- Offer more user friendly benchmarking methodology + +Design Limitations Likely to Remain +-------------------------------------------- + +The primary objective of CuTe DSL is to provide a domain-specific language for expressing +complex CUDA kernels with optimal GPU performance, not to execute arbitrary Python code on GPU hardware. + +The following limitations will likely remain by design: + +- **Complex Data Structures as Dynamic Values**: Lists, tuples, and dictionaries will continue to function + as static containers. While they can store dynamic values, their structure (adding/removing elements) + cannot be modified during execution of JIT-compiled functions. + +- **Dependent Types**: Supporting dependent types would introduce substantial complexity and + adversely affect the performance characteristics of generated code. + +- **CuTe Layout Algebra**: We don't have plans to extend the support of CuTe Layout Algebra + under native Python Context. 
We are planning to extend support for data types and allow + JIT function to interoperate with native Python code. diff --git a/media/docs/pythonDSL/overview.rst b/media/docs/pythonDSL/overview.rst new file mode 100644 index 00000000..07abfb09 --- /dev/null +++ b/media/docs/pythonDSL/overview.rst @@ -0,0 +1,108 @@ +.. _overview: + +Overview +=========================== + +CUTLASS 4.x bridges the gap between productivity and performance for CUDA kernel development. +By providing Python-based DSLs to the powerful CUTLASS C++ template library, it enables +faster iteration, easier prototyping, and a gentler learning curve for high-performance linear +algebra on NVIDIA GPUs. + +Overall we envision CUTLASS DSLs as a family of domain-specific languages (DSLs). +With the release of 4.0, we are releasing the first of these in CuTe DSL. +This is a low level programming model that is fully consistent with CuTe C++ abstractions — exposing +core concepts such as layouts, tensors, hardware atoms, and full control over the hardware thread and data hierarchy. + +Why CUTLASS DSLs? +============================ + +While CUTLASS offers exceptional performance through its C++ template abstractions, the complexity +can present challenges for many developers. CUTLASS 4.x addresses this by: + +- **Simplifying metaprogramming**: Metaprogramming in Python is a lot more intuitive than with C++ +- **Accelerating Iteration**: Rapid prototyping with familiar Python syntax and blazing fast compile times +- **Lowering Barriers**: Reduced learning curve for GPU programming concepts and consistency between CuTe C++ and DSL +- **Maintaining Performance**: Generated code leverages optimized CUTLASS primitives + +Students can learn GPU programming concepts without the complexity of C++ templates. +Researchers and performance engineers can rapidly explore algorithms, prototype, and tune +kernels before moving to production implementations. 
+ +Key Concepts and Approach +================================ + +CUTLASS DSLs translate Python code into a custom intermediate representation (IR), +which is then Just-In-Time (JIT) compiled into optimized CUDA kernels using MLIR and `ptxas`. + +Core CuTe DSL Abstractions +----------------------------------- + +- **Layouts** – Describe how data is organized in memory and across threads. +- **Tensors** – Combine data pointers or iterators with layout metadata. +- **Atoms** – Represent fundamental hardware operations like matrix multiply-accumulate (MMA) or memory copy. +- **Tiled Operations** – Define how atoms are applied across thread blocks and warps (e.g., ``TiledMma``, ``TiledCopy``). + +For more on CuTe abstractions, refer to the `CuTe C++ library documentation `__. + +**Pythonic Kernel Expression** + +Developers express kernel logic, data movement, and computation using familiar Python syntax and control flow. + +The DSLs simplify expressing loop tiling, threading strategies, and data transformations using concise Python code. + +**JIT Compilation** + +Python kernels are compiled at runtime into CUDA device code using MLIR infrastructure and NVIDIA’s ``ptxas`` toolchain, +enabling rapid iteration and interactive debugging. + +Relationship to CUTLASS C++ +================================= + +CUTLASS DSLs are not a replacement for the CUTLASS C++ library or its 2.x and 3.x APIs. Instead, it aims to be a high-productivity kernel +authoring framework that shares all concepts with CUTLASS 3.x C++ API such as CuTe, pipelines, schedulers etc. + +- **Performance**: Generated kernels aim to match CUTLASS C++ kernels in performance; however, some performance gaps + may exist due to missing optimizations that have been added over the years to CUTLASS C++ and may be missing in the DSLs examples. +- **Library**: The CUTLASS DSLs do not currently ship with a full GEMM/Conv autotuning profiler or library interface + akin to CUTLASS C++. 
Instead, it focuses on generating and autotuning individual kernel instances (for example: via tile size exploration) and via native integration DL frameworks that support auto-tuning. + +Getting Started +================================ + +- :doc:`quick_start` – Initial setup and installation. +- :doc:`cute_dsl` – Overview of the typical development and workflow using CuTe DSL. +- :doc:`cute_dsl_api` – Refer to the full API documentation. +- :doc:`limitations` – Understand current CuTe DSL constraints and differences from C++. +- :doc:`faqs` – Common questions and known issues. + +Current Status & Roadmap +================================= + +CuTe DSL is in public beta and actively evolving. Interfaces and features are subject to +change as we improve the system. + +Upcoming Milestones +---------------------------------- + +- Public release targeted for **Summer 2025** +- Expanded support for additional data types and kernel types +- Usability improvements: better error messages, debugging tools, and streamlined APIs +- Broader integration of CUTLASS primitives and features + +For known issues and workarounds, please consult the :doc:`limitations` and :doc:`faqs`. + +Community & Feedback +================================== + +We welcome contributions and feedback from the developer community! + +You can: + +- Submit bug reports or feature requests via our `GitHub Issues page `__ +- Join the CUTLASS community on `Discord `__ to ask questions and share ideas +- Contribute examples, tutorials, or enhancements to the DSLs +- Report unclear or missing documentation +- Propose support for additional data types or kernel variants +- Help prioritize roadmap features by upvoting GitHub issues + +Thank you for helping shape the future of CUTLASS DSLs! 
\ No newline at end of file diff --git a/media/docs/pythonDSL/quick_start.rst b/media/docs/pythonDSL/quick_start.rst new file mode 100644 index 00000000..0c7fb505 --- /dev/null +++ b/media/docs/pythonDSL/quick_start.rst @@ -0,0 +1,31 @@ +.. _quick_start: + +Quick Start Guide +======================= + +The CUTLASS DSL 4.0 release currently supports **Linux** and **Python 3.12** only. To install CUTLASS DSLs (limited to CuTe DSL for now), use the following command + +Installation +----------------------- + +To install the CUTLASS DSL, run: + +.. code-block:: bash + + pip install nvidia-cutlass-dsl + +The ``nvidia-cutlass-dsl`` wheel includes everything needed to generate GPU kernels. It requires +the same NVIDIA driver version as the +`CUDA Toolkit 12.9 `_. + +To ensure compatibility with the examples and code on `GitHub `_, +use the ``requirements.txt`` file from the corresponding commit in the repository. + +Recommended Dependencies +--------------------------------- + +To run examples and begin development, we recommend installing: + +.. code-block:: bash + + pip install torch jupyter diff --git a/python/CuTeDSL/EULA.txt b/python/CuTeDSL/EULA.txt new file mode 100644 index 00000000..e7699599 --- /dev/null +++ b/python/CuTeDSL/EULA.txt @@ -0,0 +1,188 @@ +NVIDIA Software License Agreement + +IMPORTANT NOTICE – PLEASE READ AND AGREE BEFORE USING THE SOFTWARE +This software license agreement (“Agreement”) is a legal agreement between you, whether an individual or entity, (“you”) and NVIDIA Corporation (“NVIDIA”) and governs the use of the NVIDIA CUTLASS DSLs software and materials that NVIDIA delivers to you under this Agreement (“Software”). +NVIDIA and you are each a “party” and collectively the “parties.” +This Agreement can be accepted only by an adult of legal age of majority in the country in which the Software is used. 
+If you don’t have the required age or authority to accept this Agreement, or if you don’t accept all the terms and conditions of this Agreement, do not use the Software. + +1. License Grants + + 1.1. License Grant to You. The Software made available by NVIDIA to you is licensed, not sold. + Subject to the terms of this Agreement, NVIDIA grants you a limited, non-exclusive, revocable, non-transferable, and non-sublicensable (except as expressly granted in this Agreement), license to: + + a. install and use copies of the Software, + b. configure the Software using configuration files provided (if applicable), + c. modify and create derivative works of any sample or example source code NVIDIA delivers to you as part of the Software (“Derivatives”) (if applicable), and + d. distribute python files in the Software package in source format as incorporated into a software application subject to the following distribution requirements: + + i. Your application must have material additional functionality, beyond the included portions of the Software. + ii. The distributable portions of the Software shall only be accessed by your application. + iii. The following notice shall be included in modifications and derivative works of sample source code distributed: “This software contains source code provided by NVIDIA Corporation.” + iv. Unless a developer tool is identified in this Agreement as distributable, it is delivered for your internal use only. + v. The terms under which you distribute your application must be consistent with the terms of this Agreement, including (without limitation) terms relating to the license grant and license restrictions and protection of NVIDIA’s intellectual property rights. + vi. Additionally, you agree that you will protect the privacy, security and legal rights of your application users. + + The foregoing (a) through (d) are, collectively, the “Purpose”, and the developed applications are only for use in systems with NVIDIA GPUs. + + 1.2. 
License Grant to NVIDIA. Subject to the terms of this Agreement, you grant NVIDIA and its affiliates a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit at NVIDIA’s discretion any Derivatives created by or for you. + You may, but are not required to, deliver any Derivatives to NVIDIA. + +2. License Restrictions + + Your license to use the Software and Derivatives is restricted as stated in this Section 2 (“License Restrictions”). + You will cooperate with NVIDIA and, upon NVIDIA’s written request, you will confirm in writing and provide reasonably requested information to verify your compliance with the terms of this Agreement. + You may not: + + 2.1. Use the Software or Derivatives for any purpose other than the Purpose; + + 2.2. Sell, rent, sublicense, transfer, distribute or otherwise make available to others (except authorized users as stated in Section 3 (“Authorized Users”)) any portion of the Software or Derivatives, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.3. Reverse engineer, decompile, or disassemble the Software components provided in binary form, nor attempt in any other manner to obtain source code of such Software; + + 2.4. Modify or create derivative works of the Software, except as expressly granted in Section 1.1 (“License Grant to You”); + + 2.5. Change or remove copyright or other proprietary notices in the Software; + + 2.6. Bypass, disable, or circumvent any technical limitation, encryption, security, digital rights management or authentication mechanism in the Software; + + 2.7. 
Use the Software or Derivatives in any manner that would cause them to become subject to an open source software license, subject to the terms in Section 6 (“Components Under Other Licenses”); + + 2.8. Use the Software or Derivatives in violation of any applicable law or regulation in relevant jurisdictions + + 2.9. Indicate that a product or service developed with the Software or Derivatives is sponsored or endorsed by NVIDIA; + + 2.10. Replace any NVIDIA software components in the Software that are governed by this Agreement with other software that implements NVIDIA APIs; + + 2.11. Reverse engineer, decompile or disassemble any portion of the output generated using Software elements for the purpose of translating such output artifacts to target a non-NVIDIA platform; or + +3. Authorized Users + + You may allow employees and contractors of your entity or of your subsidiary(ies), and for educational institutions also enrolled students, to internally access and use the Software as authorized by this Agreement from your secure network to perform the work authorized by this Agreement on your behalf. + You are responsible for the compliance with the terms of this Agreement by your authorized users. + Any act or omission that if committed by you would constitute a breach of this Agreement will be deemed to constitute a breach of this Agreement if committed by your authorized users. + +4. Pre-Release + + Software versions identified as alpha, beta, preview, early access or otherwise as pre-release (“Pre-Release”) may not be fully functional, may contain errors or design flaws, and may have reduced or different security, privacy, availability and reliability standards relative to NVIDIA commercial offerings. + You use Pre-Release Software at your own risk. NVIDIA did not design or test the Software for use in production or business-critical systems. + NVIDIA may choose not to make available a commercial version of Pre-Release Software. 
+ NVIDIA may also choose to abandon development and terminate the availability of Pre-Release Software at any time without liability. + +5. Updates + + NVIDIA may at any time and at its option, change, discontinue, or deprecate any part, or all, of the Software, or change or remove features or functionality, or make available patches, workarounds or other updates to the Software. + Unless the updates are provided with their separate governing terms, they are deemed part of the Software licensed to you under this Agreement, and your continued use of the Software is deemed acceptance of such changes. + +6. Components Under Other Licenses + + The Software may include or be distributed with components provided with separate legal notices or terms that accompany the components, such as open source software licenses and other license terms (“Other Licenses”). + The components are subject to the applicable Other Licenses, including any proprietary notices, disclaimers, requirements and extended use rights; + except that this Agreement will prevail regarding the use of third-party open source software, unless a third-party open source software license requires its license terms to prevail. + Open source software license means any software, data or documentation subject to any license identified as an open source license by the Open Source Initiative (http://opensource.org), Free Software Foundation (http://www.fsf.org) or other similar open source organization or listed by the Software Package Data Exchange (SPDX) Workgroup under the Linux Foundation (http://www.spdx.org). + +7. Ownership + + 7.1. NVIDIA Ownership. The Software, including all intellectual property rights, is and will remain the sole and exclusive property of NVIDIA or its licensors. + Except as expressly granted in this Agreement, (a) NVIDIA reserves all rights, interests and remedies in connection with the Software, and (b) no other license or right is granted to you by implication, estoppel or otherwise. 
+ + 7.2. Your Ownership. Subject to the rights of NVIDIA and its suppliers in the Software, which continue to be licensed as stated in this Agreement, even when incorporated in your products or services, and the extent permitted by applicable law, as between you and NVIDIA, you hold all rights, title and interest in and to your products, services and Derivatives you develop as permitted in this Agreement including their respective intellectual property rights. + +8. Feedback + + You may, but you are not obligated to, provide suggestions, requests, fixes, modifications, enhancements, or other feedback regarding the Software (collectively, “Feedback”). + Feedback, even if designated as confidential by you, will not create any confidentiality obligation for NVIDIA or its affiliates. + If you provide Feedback, you grant NVIDIA, its affiliates and its designees a non-exclusive, perpetual, irrevocable, sublicensable, worldwide, royalty-free, fully paid-up and transferable license, under your intellectual property rights, to publicly perform, publicly display, reproduce, use, make, have made, sell, offer for sale, distribute (through multiple tiers of distribution), import, create derivative works of and otherwise commercialize and exploit the Feedback at NVIDIA’s discretion. + +9. Termination + + 9.1. Termination. This Agreement will automatically terminate without notice from NVIDIA if you fail to comply with any of the terms in this Agreement or if you commence or participate in any legal proceeding against NVIDIA with respect to the Software. + Additionally, either party may terminate this Agreement at any time with thirty (30) days’ advance written notice to the other party. + + 9.2. Effect of Termination. 
Upon any expiration or termination of this Agreement, you will promptly (a) stop using and return, delete or destroy NVIDIA confidential information and all Software received under this Agreement, and (b) delete or destroy Derivatives created under this Agreement, unless an authorized NVIDIA representative provides prior written approval that you may keep a copy of the Derivatives solely for archival purposes. + Upon written request, you will certify in writing that you have complied with your obligations under this Section 9.2 (“Effect of Termination”). + + 9.3. Survival. Section 1.2 (“License Grant to NVIDIA”), Section 5 (“Updates”), Section 6 (“Components Under Other Licenses”), Section 7 (“Ownership”), Section 8 (“Feedback), Section 9.2 (“Effect of Termination”), Section 9.3 (“Survival”), Section 10 (“Disclaimer of Warranties”), Section 11 (“Limitation of Liability”), Section 12 (“Use in Mission Critical Applications”), Section 13 (“Governing Law and Jurisdiction”), Section 14 (“Indemnity”) and Section 15 (“General”) will survive any expiration or termination of this Agreement. + +10. Disclaimer of Warranties + + THE SOFTWARE IS PROVIDED BY NVIDIA AS-IS AND WITH ALL FAULTS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA DISCLAIMS ALL WARRANTIES AND REPRESENTATIONS OF ANY KIND, WHETHER + EXPRESS, IMPLIED OR STATUTORY, RELATING TO OR ARISING UNDER THIS AGREEMENT, INCLUDING, WITHOUT LIMITATION, THE WARRANTIES OF TITLE, NONINFRINGEMENT, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, USAGE OF TRADE AND COURSE OF DEALING. NVIDIA DOES NOT WARRANT OR ASSUME RESPONSIBILITY FOR THE ACCURACY OR COMPLETENESS OF ANY THIRD-PARTY INFORMATION, TEXT, GRAPHICS, LINKS CONTAINED IN THE SOFTWARE. + WITHOUT LIMITING THE FOREGOING, NVIDIA DOES NOT WARRANT THAT THE SOFTWARE WILL MEET YOUR REQUIREMENTS, ANY DEFECTS OR ERRORS WILL BE CORRECTED, ANY CERTAIN CONTENT WILL BE AVAILABLE; OR THAT THE SOFTWARE IS FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS. 
NO INFORMATION OR ADVICE GIVEN BY NVIDIA WILL IN ANY WAY INCREASE THE SCOPE OF ANY WARRANTY EXPRESSLY PROVIDED IN THIS AGREEMENT. + NVIDIA does not warrant or assume responsibility for the accuracy or completeness of any third-party information, text, graphics or links contained in the Software. + +11. Limitations of Liability + + 11.1. EXCLUSIONS. TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL NVIDIA BE LIABLE FOR ANY (I) INDIRECT, PUNITIVE, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, OR (ii) DAMAGES FOR (a) THE COST OF PROCURING SUBSTITUTE GOODS, OR (b) LOSS OF PROFITS, REVENUES, USE, DATA OR GOODWILL ARISING OUT OF OR RELATED TO THIS AGREEMENT, WHETHER BASED ON BREACH OF CONTRACT, TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY, OR OTHERWISE, AND EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES AND EVEN IF A PARTY’S REMEDIES FAIL THEIR ESSENTIAL PURPOSE. + + 11.2. DAMAGES CAP. ADDITIONALLY, TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, NVIDIA’S TOTAL CUMULATIVE AGGREGATE LIABILITY FOR ANY AND ALL LIABILITIES, OBLIGATIONS OR CLAIMS ARISING OUT OF OR RELATED TO THIS AGREEMENT WILL NOT EXCEED FIVE U.S. DOLLARS (US$5). + +12. Use in Mission Critical Applications + + You acknowledge that the Software provided under this Agreement is not designed or tested by NVIDIA for use in any system or application where the use or failure of such system or application developed with NVIDIA’s Software could result in injury, death or catastrophic damage (each, a “Mission Critical Application”). + Examples of Mission Critical Applications include use in avionics, navigation, autonomous vehicle applications, AI solutions for automotive products, military, medical, life support or other mission-critical or life-critical applications. + NVIDIA will not be liable to you or any third party, in whole or in part, for any claims or damages arising from these uses. 
+ You are solely responsible for ensuring that systems and applications developed with the Software include sufficient safety and redundancy features and comply with all applicable legal and regulatory standards and requirements. + +13. Governing Law and Jurisdiction + + This Agreement will be governed in all respects by the laws of the United States and the laws of the State of Delaware, without regard to conflict of laws principles or the United Nations Convention on Contracts for the International Sale of Goods. + The state and federal courts residing in Santa Clara County, California will have exclusive jurisdiction over any dispute or claim arising out of or related to this Agreement, and the parties irrevocably consent to personal jurisdiction and venue in those courts; + except that either party may apply for injunctive remedies or an equivalent type of urgent legal relief in any jurisdiction. + +14. Indemnity + + By using the Software you agree to defend, indemnify and hold harmless NVIDIA and its affiliates and their respective officers, directors, employees and agents from and against any claims, disputes, demands, liabilities, damages, losses, costs and expenses arising out of or in any way connected with (i) products or services that have been developed or deployed with or use the Software, or claims that they violate laws, or infringe, violate, or misappropriate any third party right; + or (ii) use of the Software in breach of the terms of this Agreement. + +15. General + + 15.1. Independent Contractors. + The parties are independent contractors, and this Agreement does not create a joint venture, partnership, agency, or other form of business association between the parties. + Neither party will have the power to bind the other party or incur any obligation on its behalf without the other party’s prior written consent. + Nothing in this Agreement prevents either party from participating in similar arrangements with third parties. + + 15.2. 
No Assignment. + NVIDIA may assign, delegate or transfer its rights or obligations under this Agreement by any means or operation of law. + You may not, without NVIDIA’s prior written consent, assign, delegate or transfer any of your rights or obligations under this Agreement by any means or operation of law, and any attempt to do so is null and void. + + 15.3. No Waiver. + No failure or delay by a party to enforce any term or obligation of this Agreement will operate as a waiver by that party, or prevent the enforcement of such term or obligation later. + + 15.4. Trade Compliance. + You agree to comply with all applicable export, import, trade and economic sanctions laws and regulations, as amended, including without limitation U.S. Export Administration Regulations and Office of Foreign Assets Control regulations. + You confirm (a) your understanding that export or reexport of certain NVIDIA products or technologies may require a license or other approval from appropriate authorities and (b) that you will not export or reexport any products or technology, directly or indirectly, without first obtaining any required license or other approval from appropriate authorities, (i) to any countries that are subject to any U.S. or local export restrictions (currently including, but not necessarily limited to, Belarus, Cuba, Iran, North Korea, Russia, Syria, the Region of Crimea, Donetsk People’s Republic Region and Luhansk People’s Republic Region); + (ii) to any end-user who you know or have reason to know will utilize them in the design, development or production of nuclear, chemical or biological weapons, missiles, rocket systems, unmanned air vehicles capable of a maximum range of at least 300 kilometers, regardless of payload, or intended for military end-use, or any weapons of mass destruction; + (iii) to any end-user who has been prohibited from participating in the U.S. 
or local export transactions by any governing authority; + or (iv) to any known military or military-intelligence end-user or for any known military or military-intelligence end-use in accordance with U.S. trade compliance laws and regulations. + + 15.5. Government Rights. + The Software, documentation and technology (“Protected Items”) are “Commercial products” as this term is defined at 48 C.F.R. + 2.101, consisting of “commercial computer software” and “commercial computer software documentation” as such terms are used in, respectively, 48 C.F.R. + 12.212 and 48 C.F.R. 227.7202 & 252.227-7014(a)(1). Before any Protected Items are supplied to the U.S. Government, you will (i) inform the U.S. Government in writing that the Protected Items are and must be treated as commercial computer software and commercial computer software documentation developed at private expense; + (ii) inform the U.S. Government that the Protected Items are provided subject to the terms of the Agreement; + and (iii) mark the Protected Items as commercial computer software and commercial computer software documentation developed at private expense. + In no event will you permit the U.S. Government to acquire rights in Protected Items beyond those specified in 48 C.F.R. + 52.227-19(b)(1)-(2) or 252.227-7013(c) except as expressly approved by NVIDIA in writing. + + 15.6. Notices. + Please direct your legal notices or other correspondence to legalnotices@nvidia.com with a copy mailed to NVIDIA Corporation, 2788 San Tomas Expressway, Santa Clara, California 95051, United States of America, Attention: Legal Department. + If NVIDIA needs to contact you, you consent to receive the notices by email and agree that such notices will satisfy any legal communication requirements. + + 15.7. Severability. 
+ If a court of competent jurisdiction rules that a provision of this Agreement is unenforceable, that provision will be deemed modified to the extent necessary to make it enforceable and the remainder of this Agreement will continue in full force and effect. + + 15.8. Amendment. + Any amendment to this Agreement must be in writing and signed by authorized representatives of both parties. + + 15.9. Construction. + The headings in the Agreement are included solely for convenience and are not intended to affect the meaning or interpretation of the Agreement. + As required by the context of the Agreement, the singular of a term includes the plural and vice versa. + + 15.10. Force Majeure. + Neither party will be liable during any period where an event or circumstance prevents or delays that party from performing its obligations under this Agreement and that event or circumstance: (i) is not within the reasonable control of that party and is not the result of that party’s negligence, and (ii) cannot be overcome or avoided by that party using reasonably diligent efforts. + + 15.11. Entire Agreement. + Regarding the subject matter of this Agreement, the parties agree that (a) this Agreement constitutes the entire and exclusive agreement between the parties and supersedes all prior and contemporaneous communications and (b) any additional or different terms or conditions, whether contained in purchase orders, order acknowledgments, invoices or otherwise, will not be binding and are null and void. + +(v. May 8, 2025) diff --git a/python/CuTeDSL/base_dsl/__init__.py b/python/CuTeDSL/base_dsl/__init__.py new file mode 100644 index 00000000..cbb617dc --- /dev/null +++ b/python/CuTeDSL/base_dsl/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +# Local module imports +from .dsl import * +from .runtime import * +from ._mlir_helpers import lru_cache_ir +from .env_manager import get_str_env_var, detect_gpu_arch + diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py b/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py new file mode 100644 index 00000000..607a24d0 --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/__init__.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR Dialect helper functions +""" + +from . import arith +from .lru_cache_ir import lru_cache_ir + + +__all__ = ["arith", "lru_cache_ir"] + +try: + from . import gpu + + __all__.extend(["gpu"]) +except ImportError: + pass diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py b/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py new file mode 100644 index 00000000..d515113b --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/arith.py @@ -0,0 +1,691 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR Arith Dialect helper functions +""" + +import array +import numpy as np + +from ..common import * +from ..._mlir import ir # type: ignore +from ..._mlir.extras import types as T # type: ignore +from ..._mlir.dialects import arith, nvgpu, math, builtin # type: ignore + +from .lru_cache_ir import lru_cache_ir + +# ============================================================================= +# Arith Dialect Helper functions +# ============================================================================= + + +def recast_type(src_type, res_elem_type) -> ir.Type: + if isinstance(src_type, T.VectorType): + if src_type.scalable: + res_type = T.vector( + *src_type.shape, + res_elem_type, + scalable=src_type.scalable, + scalable_dims=src_type.scalable_dims, + ) + else: + res_type = T.vector(*src_type.shape, res_elem_type) + elif isinstance(src_type, T.RankedTensorType): + res_type = T.RankedTensorType.get( + element_type=res_elem_type, shape=src_type.shape, strides=src_type.strides + ) + elif isinstance(src_type, T.UnrankedTensorType): + res_type = T.UnrankedTensorType.get(element_type=res_elem_type) + elif isinstance(src_type, T.MemRefType): + res_type = T.MemRefType.get( + element_type=res_elem_type, shape=src_type.shape, strides=src_type.strides + ) + else: + res_type = res_elem_type + return res_type + + +def is_scalar(ty) -> bool: + return not isinstance( + ty, (T.VectorType, T.RankedTensorType, T.UnrankedTensorType, T.MemRefType) + ) + + +def element_type(ty) -> ir.Type: + if not is_scalar(ty): + return 
ty.element_type + else: + return ty + + +def is_narrow_precision(ty) -> bool: + narrow_types = { + T.f8E8M0FNU(), + T.f8E4M3FN(), + T.f8E4M3(), + T.f8E5M2(), + T.f8E4M3B11FNUZ(), + T.f4E2M1FN(), + T.f6E3M2FN(), + T.f6E2M3FN(), + } + return ty in narrow_types + + +def is_float_type(ty) -> bool: + return ( + arith._is_float_type(ty) + # TODO-upstream: prediction is not correct. Patch here and fix in upstream later + or is_narrow_precision(ty) + or ty in (T.bf16(), T.tf32()) + ) + + +def truncf_to_narrow(res_ty, src, loc, ip): + res_elem_ty = element_type(res_ty) + if res_elem_ty == T.f8E8M0FNU(): + rnd = nvgpu.RoundingMode.RP + else: + rnd = nvgpu.RoundingMode.RN + return nvgpu.cvt_fptrunc(res_ty, src, rnd=rnd, loc=loc, ip=ip) + + +def extf_from_narrow(res_ty, src, loc, ip): + src_elem_ty = element_type(src.type) + + # When source type is E8M0, temporary element type has to be bf16 + tmp_elem_ty = T.bf16() if src_elem_ty == T.f8E8M0FNU() else T.f16() + tmp_ty = recast_type(src.type, tmp_elem_ty) + + # narrow -> bf16/f16 -> target type + tmp = nvgpu.cvt_fpext(tmp_ty, src, loc=loc, ip=ip) + return arith.extf(res_ty, tmp, loc=loc, ip=ip) + + +def bitcast(src, res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + return arith.bitcast(res_type, src, loc=loc, ip=ip) + + +def cvtf(src, res_elem_type, *, loc=None, ip=None): + src_elem_type = element_type(src.type) + + if res_elem_type == src_elem_type: + return src + + res_type = recast_type(src.type, res_elem_type) + + # Treat TF32 as F32 and use i32 as intermediate data + # TODO-upstream: update arith to support tf32 <-> f32 conversion + if src_elem_type == T.tf32(): + # tf32 -> i32 + tmp_type = recast_type(src.type, T.i32()) + src = builtin.unrealized_conversion_cast([tmp_type], [src], loc=loc, ip=ip) + # i32 -> f32 + src = bitcast(src, T.f32(), loc=loc, ip=ip) + # f32 -> X with `cvtf` recursively + return cvtf(src, res_elem_type, loc=loc, ip=ip) + + if res_elem_type == T.tf32(): + # X 
-> f32 with `cvtf`` recursively + tmp = cvtf(src, T.f32(), loc=loc, ip=ip) + # f32 -> i32 + tmp = bitcast(tmp, T.i32(), loc=loc, ip=ip) + # i32 -> tf32 + return builtin.unrealized_conversion_cast([res_type], [tmp], loc=loc, ip=ip) + + if res_elem_type.width > src_elem_type.width: + if is_narrow_precision(src_elem_type): + return extf_from_narrow(res_type, src, loc, ip) + else: + return arith.extf(res_type, src, loc=loc, ip=ip) + else: + tmp_mlir_type = recast_type(src.type, T.f32()) + + # f16 -- extf -> f32 -- truncf -> bf16 + # TODO-upstream: update arith to support bf16 <-> f16 conversion? + if (src_elem_type == T.f16() and res_elem_type == T.bf16()) or ( + src_elem_type == T.bf16() and res_elem_type == T.f16() + ): + tmp = arith.extf(tmp_mlir_type, src, loc=loc, ip=ip) + return arith.truncf(res_type, tmp, loc=loc, ip=ip) + + # {f8, f6, f4} -> f16, f32, ... + elif is_narrow_precision(res_elem_type): + return truncf_to_narrow(res_type, src, loc, ip) + else: + return arith.truncf(res_type, src, loc=loc, ip=ip) + + +def fptoi(src, signed: Union[bool, None], res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + # TODO-upstream: update arith to support this kind of conversion + if element_type(src.type) in (T.tf32(), T.bf16()): + src = cvtf(src, T.f32(), loc=loc, ip=ip) + + if signed: + return arith.fptosi(res_type, src, loc=loc, ip=ip) + else: + return arith.fptoui(res_type, src, loc=loc, ip=ip) + + +def itofp(src, signed: Union[bool, None], res_elem_type, *, loc=None, ip=None): + res_type = recast_type(src.type, res_elem_type) + + orig_res_type = res_type + # TODO-upstream: update arith to support this kind of conversion + if res_elem_type in (T.tf32(), T.bf16()): + res_type = recast_type(src.type, T.f32()) + + if signed and element_type(src.type).width > 1: + res = arith.sitofp(res_type, src, loc=loc, ip=ip) + else: + res = arith.uitofp(res_type, src, loc=loc, ip=ip) + + if orig_res_type == res_type: + return res + + return 
cvtf(res, element_type(orig_res_type), loc=loc, ip=ip) + + +def int_to_int(a, dst_elem_type, *, loc=None, ip=None): + src_signed = a.signed + dst_signed = dst_elem_type.signed + src_width = element_type(a.type).width + dst_width = dst_elem_type.width + + dst_mlir_type = recast_type(a.type, dst_elem_type.mlir_type) + + if dst_width == src_width: + return a + elif src_signed and not dst_signed: + # Signed -> Unsigned + if dst_width > src_width: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + elif src_signed == dst_signed: + # Same signedness + if dst_width > src_width: + if src_signed and src_width > 1: + return arith.extsi(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + return arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + else: + # Unsigned -> Signed + if dst_width > src_width: + return arith.extui(dst_mlir_type, a, loc=loc, ip=ip) + else: + # For truncation from unsigned to signed, we need to handle overflow + # First truncate to the target width + trunc = arith.trunci(dst_mlir_type, a, loc=loc, ip=ip) + # Then reinterpret as signed + if dst_signed: + return arith.bitcast(dst_mlir_type, trunc, loc=loc, ip=ip) + return trunc + + +# ============================================================================= +# Arith Ops Emitter Helpers +# - assuming type of lhs and rhs match each other +# - op name matches python module operator +# ============================================================================= + + +def _cast(res_elem_ty, src, is_signed=None, *, loc=None, ip=None): + """ + This function provides simplified interface to upstream op builder + arith.truncf(T.vector(shape, new_type), src) + + is simplified as because it's element-wise op which can't change shape + arith.truncf(new_type, src) + """ + if isinstance(src, ir.Value): + src_ty = src.type + else: + src_ty = type(src).mlir_type + src = src.ir_value() + + 
src_elem_ty = element_type(src_ty) + + if src_elem_ty == res_elem_ty: + return src + elif is_float_type(src_elem_ty) and is_float_type(res_elem_ty): + # float-to-float + return cvtf(src, res_elem_ty, loc=loc, ip=ip) + elif arith._is_integer_like_type(src_elem_ty) and arith._is_integer_like_type( + res_elem_ty + ): + if src_elem_ty.width >= res_elem_ty.width: + cast_op = arith.trunci + else: + if is_signed: + cast_op = arith.extsi + else: + cast_op = arith.extui + + res_ty = recast_type(src_ty, res_elem_ty) + return cast_op(res_ty, src, loc=loc, ip=ip) + elif is_float_type(src_elem_ty) and arith._is_integer_like_type(res_elem_ty): + return fptoi(src, is_signed, res_elem_ty, loc=loc, ip=ip) + elif arith._is_integer_like_type(src_elem_ty) and is_float_type(res_elem_ty): + return itofp(src, is_signed, res_elem_ty, loc=loc, ip=ip) + else: + raise DSLRuntimeError( + f"cast from {src_elem_ty} to {res_elem_ty} is not supported" + ) + + +@lru_cache_ir() +def const(value, ty=None, *, loc=None, ip=None): + """ + Generates dynamic expression for constant values. 
+ """ + from ..typing import Numeric, NumericMeta + from ..dsl import is_dynamic_expression, _numpy_type_to_mlir_type + + if isinstance(value, Numeric): + value = value.value + + # Early return + if is_dynamic_expression(value) and ( + value.type.isinstance(value.type) or T.bool().isinstance(value.type) + ): + return value + + # Assume type + if ty is None: + if isinstance(value, float): + ty = T.f32() + elif isinstance(value, bool): + ty = T.bool() + elif isinstance(value, int): + ty = T.i32() + elif isinstance(value, np.ndarray): + ty = T.vector(*value.shape, _numpy_type_to_mlir_type(value.dtype)) + value = array.array(value.dtype.kind, value.flatten().tolist()) + else: + raise DSLNotImplemented(f"{type(value)} is not supported") + elif isinstance(ty, NumericMeta): + ty = ty.mlir_type + elif isinstance(ty, ir.Type): + if ir.RankedTensorType.isinstance(ty) or ir.VectorType.isinstance(ty): + elem_ty = ty.element_type + if isinstance(elem_ty, ir.IntegerType): + attr = ir.IntegerAttr.get(elem_ty, value) + else: + attr = ir.FloatAttr.get(elem_ty, value) + value = ir.DenseElementsAttr.get_splat(ty, attr) + elif arith._is_float_type(ty) and isinstance(value, (bool, int)): + value = float(value) + elif arith._is_integer_like_type(ty) and isinstance(value, float): + value = int(value) + else: + raise DSLNotImplemented(f"type {ty} is not supported") + + return arith.constant(ty, value, loc=loc, ip=ip) + + +def _dispatch_to_rhs_r_op(op): + """Decorator that dispatches to the right-hand-side's reverse operation. + + If the other operand is not an ArithValue or is a subclass (more specific) + of ArithValue, this allows proper method resolution for binary operations. 
+ """ + + def wrapper(self, other, **kwargs): + if not isinstance(other, ArithValue): + if not isinstance(other, (int, float, bool)): + # allows to call other.__rmul__ + return NotImplemented + + return op(self, other, **kwargs) + + return wrapper + + +def _binary_op(op): + """ + Decorator to check if the 'other' argument is an ArithValue. + If not, returns NotImplemented. + """ + + def wrapper(self, other, **kwargs): + # When reach this point, `self` must be cast to base `ArithValue` type + if isinstance(other, (int, float, bool)): + other = const(other, self.type).with_signedness(self.signed) + + # Call the original function + # If sub-class doesn't implement overloaded arithmetic, cast to base class + return op(self, other, **kwargs) + + return wrapper + + +# Operator overloading +@ir.register_value_caster(ir.Float4E2M1FNType.static_typeid) +@ir.register_value_caster(ir.Float6E2M3FNType.static_typeid) +@ir.register_value_caster(ir.Float6E3M2FNType.static_typeid) +@ir.register_value_caster(ir.Float8E4M3FNType.static_typeid) +@ir.register_value_caster(ir.Float8E4M3B11FNUZType.static_typeid) +@ir.register_value_caster(ir.Float8E5M2Type.static_typeid) +@ir.register_value_caster(ir.Float8E4M3Type.static_typeid) +@ir.register_value_caster(ir.Float8E8M0FNUType.static_typeid) +@ir.register_value_caster(ir.BF16Type.static_typeid) +@ir.register_value_caster(ir.F16Type.static_typeid) +@ir.register_value_caster(ir.FloatTF32Type.static_typeid) +@ir.register_value_caster(ir.F32Type.static_typeid) +@ir.register_value_caster(ir.F64Type.static_typeid) +@ir.register_value_caster(ir.IntegerType.static_typeid) +@ir.register_value_caster(ir.VectorType.static_typeid) +@ir.register_value_caster(ir.RankedTensorType.static_typeid) +class ArithValue(ir.Value): + """Overloads operators for MLIR's Arith dialects binary operations.""" + + def __init__(self, v, signed: Union[bool, None] = None): + if isinstance(v, int): + v = arith.constant(self.type, v) + super().__init__(v) + + elem_ty = 
element_type(self.type) + self.is_float = arith._is_float_type(elem_ty) + # arith dialect consider `1` in `i1` as `-1`, treat it as unsigned for DSL + self.signed = signed and elem_ty.width > 1 + + def with_signedness(self, signed: Union[bool, None]): + return type(self)(self, signed) + + def __neg__(self, *, loc=None, ip=None): + if self.type == T.bool(): + raise TypeError( + "Negation, the operator `-` is not supported for boolean type" + ) + + if self.is_float: + return arith.negf(self, loc=loc, ip=ip) + else: + c0 = arith.constant(self.type, 0, loc=loc, ip=ip) + return arith.subi(c0, self, loc=loc, ip=ip) + + @_binary_op + def __pow__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float and other.is_float: + return math.powf(self, other, loc=loc, ip=ip) + elif self.is_float and not other.is_float: + return math.fpowi(self, other, loc=loc, ip=ip) + elif not self.is_float and other.is_float: + lhs = itofp(self, self.signed, T.f32(), loc=loc, ip=ip) + rhs = cvtf(other, T.f32(), loc=loc, ip=ip) + return math.powf(lhs, rhs, loc=loc, ip=ip) + elif not self.is_float and not other.is_float: + return math.ipowi(self, other, loc=loc, ip=ip) + else: + raise DSLNotImplemented(f"Unsupported '{self} ** {other}'") + + @_binary_op + def __rpow__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__pow__(self, loc=loc, ip=ip) + + # arith operators + + @_dispatch_to_rhs_r_op + @_binary_op + def __add__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.addf(self, other, loc=loc, ip=ip) + else: + return arith.addi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __sub__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.subf(self, other, loc=loc, ip=ip) + else: + return arith.subi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __mul__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return 
arith.mulf(self, other, loc=loc, ip=ip) + else: + return arith.muli(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __truediv__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.divf(self, other, loc=loc, ip=ip) + else: + lhs = itofp(self, self.signed, T.f32(), loc=loc, ip=ip) + rhs = itofp(other, other.signed, T.f32(), loc=loc, ip=ip) + return arith.divf(lhs, rhs, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __floordiv__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + q = arith.divf(self, other, loc=loc, ip=ip) + return math.floor(q, loc=loc, ip=ip) + elif self.signed: + return arith.floordivsi(self, other, loc=loc, ip=ip) + else: + return arith.divui(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __mod__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.remf(self, other, loc=loc, ip=ip) + elif self.signed: + return arith.remsi(self, other, loc=loc, ip=ip) + else: + return arith.remui(self, other, loc=loc, ip=ip) + + @_binary_op + def __radd__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__add__(self, loc=loc, ip=ip) + + @_binary_op + def __rsub__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__sub__(self, loc=loc, ip=ip) + + @_binary_op + def __rmul__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__mul__(self, loc=loc, ip=ip) + + @_binary_op + def __rtruediv__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__truediv__(self, loc=loc, ip=ip) + + @_binary_op + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__floordiv__(self, loc=loc, ip=ip) + + @_binary_op + def __rmod__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__mod__(self, loc=loc, ip=ip) + + # Comparison operators (comparison doesn't have right-hand-side variants) + @_dispatch_to_rhs_r_op + 
@_binary_op + def __lt__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OLT, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.slt, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ult, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __le__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OLE, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.sle, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ule, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __eq__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OEQ, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.eq, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __ne__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + # In Python, bool(float("nan")) is True, so use unordered comparison here + return arith.cmpf(arith.CmpFPredicate.UNE, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ne, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __gt__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OGT, self, other, loc=loc, ip=ip) + elif self.signed: + return arith.cmpi(arith.CmpIPredicate.sgt, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.ugt, self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __ge__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.is_float: + return arith.cmpf(arith.CmpFPredicate.OGE, self, other, loc=loc, ip=ip) + elif self.signed: + return 
arith.cmpi(arith.CmpIPredicate.sge, self, other, loc=loc, ip=ip) + else: + return arith.cmpi(arith.CmpIPredicate.uge, self, other, loc=loc, ip=ip) + + # Unary operators + def __invert__(self, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(self, arith.const(self.type, -1)) + + # Bitwise operations + @_dispatch_to_rhs_r_op + @_binary_op + def __and__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.andi(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __or__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.ori(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __xor__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __rshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + if self.signed: + return arith.shrsi(self, other, loc=loc, ip=ip) + else: + return arith.shrui(self, other, loc=loc, ip=ip) + + @_dispatch_to_rhs_r_op + @_binary_op + def __lshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.shli(self, other, loc=loc, ip=ip) + + @_binary_op + def __rand__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.andi(other, self, loc=loc, ip=ip) + + @_binary_op + def __ror__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.ori(other, self, loc=loc, ip=ip) + + @_binary_op + def __rxor__(self, other, *, loc=None, ip=None) -> "ArithValue": + return arith.xori(other, self, loc=loc, ip=ip) + + @_binary_op + def __rrshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__rshift__(self, loc=loc, ip=ip) + + @_binary_op + def __rlshift__(self, other, *, loc=None, ip=None) -> "ArithValue": + return other.__lshift__(self, loc=loc, ip=ip) + + def __hash__(self): + return super().__hash__() + + def __str__(self): + return super().__str__().replace(ir.Value.__name__, ArithValue.__name__) + + def 
__repr__(self): + return self.__str__() + + +def _min(lhs, rhs, *, loc=None, ip=None): + """ + This function provides a unified interface for building arith min + + Assuming the operands have the same type + """ + from ..dsl import is_dynamic_expression + + if not is_dynamic_expression(lhs): + if not is_dynamic_expression(rhs): + return min(lhs, rhs) + else: + lhs = arith.constant(rhs.type, lhs, loc=loc, ip=ip) + else: + if not is_dynamic_expression(rhs): + rhs = arith.constant(lhs.type, rhs, loc=loc, ip=ip) + + if arith._is_integer_like_type(lhs.type): + if lhs.signed: + return arith.minsi(lhs, rhs, loc=loc, ip=ip) + else: + return arith.minui(lhs, rhs, loc=loc, ip=ip) + else: + return arith.minimumf(lhs, rhs, loc=loc, ip=ip) + + +def _max(lhs, rhs, *, loc=None, ip=None): + """ + This function provides a unified interface for building arith max + + Assuming the operands have the same type + """ + from ..dsl import is_dynamic_expression + + if not is_dynamic_expression(lhs): + if not is_dynamic_expression(rhs): + return max(lhs, rhs) + else: + lhs = arith.constant(rhs.type, lhs, loc=loc, ip=ip) + else: + if not is_dynamic_expression(rhs): + rhs = arith.constant(lhs.type, rhs, loc=loc, ip=ip) + + if arith._is_integer_like_type(lhs.type): + if lhs.signed: + return arith.maxsi(lhs, rhs, loc=loc, ip=ip) + else: + return arith.maxui(lhs, rhs, loc=loc, ip=ip) + else: + return arith.maximumf(lhs, rhs, loc=loc, ip=ip) diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py b/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py new file mode 100644 index 00000000..a0b0d050 --- /dev/null +++ b/python/CuTeDSL/base_dsl/_mlir_helpers/gpu.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides MLIR GPU Dialect helper functions +""" + + +from ..._mlir import ir +from ..._mlir.dialects import gpu, arith, scf +from ..._mlir.extras import types as T + +from ..common import * + +# ============================================================================= +# GPU Dialect Helper functions +# ============================================================================= + + +def create_async_token(): + token_ty = gpu.AsyncTokenType.get() + token = gpu.wait(token_ty, []) + return token + + +def printf(fmt, *args, threadNumber=-1): + """Generate gpu.printf OP predicated on threadNumber""" + type_formats = [] + for arg in args: + ty_format = None + if ir.IndexType.isinstance(arg.type): + ty_format = "%llu" + if ir.IntegerType.isinstance(arg.type): + width = ir.IntegerType(arg.type).width + if width == 64: + ty_format = "%llu" + elif width == 32: + ty_format = "%d" + elif width == 1: + ty_format = "%i" + if ir.F32Type.isinstance(arg.type): + ty_format = "%f" + if ty_format is None: + raise DSLNotImplemented(arg.type) + type_formats.append(ty_format) + if threadNumber == -1: + gpu.printf(fmt.format(*type_formats) + "\n", args) + if threadNumber != -1: + tidx = gpu.thread_id(gpu.Dimension.x) + predicate = arith.cmpi( + arith.CmpIPredicate.eq, tidx, arith.constant(_T.index(), threadNumber) + ) + if_op = scf.IfOp(predicate) + with ir.InsertionPoint(if_op.then_block): + gpu.printf(fmt.format(*type_formats) + "\n", args) + scf.yield_([]) diff --git a/python/CuTeDSL/base_dsl/_mlir_helpers/lru_cache_ir.py 
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
This module provides @lru_cache_ir
It extends functools.lru_cache with IR Context awareness.

Example usage:

from lru_cache_ir import lru_cache_ir

@lru_cache_ir(maxsize=128, typed=False)
def make_layout(...):
    ...

(DOC FIX: the previous example passed `ir` as the first argument, but
`lru_cache_ir` takes no `ir` parameter — the IR context is derived
internally via `get_ir_context`.)
"""


from functools import lru_cache, wraps

from ..._mlir import ir  # type: ignore


def get_ir_context(func):
    """
    Return the IR context under which `func` is being invoked.
    Currently the context includes MLIRContext and InsertionPoint.

    Returns None when no context/insertion point is active.
    NOTE(review): `func` is currently unused; kept for interface stability.
    """
    try:
        if ir:
            return (ir.Context.current, ir.InsertionPoint.current)
        else:
            return None
    except ValueError:
        # `.current` raises ValueError when no context/insertion point is set.
        return None


def lru_cache_ir(maxsize=128, typed=True):
    """
    Applies an LRU cache to a given function, with awareness of IR context:
    the active (MLIRContext, InsertionPoint) pair is folded into the cache
    key so results from one IR context are never replayed in another.

    Usage mirrors functools.lru_cache.

    :param maxsize: Max cache size, same as functools.lru_cache
    :param typed: Whether params are type-sensitive, default to True as IR is type-sensitive
    """

    def decorator(func):
        # Use functools.lru_cache with a custom wrapper so the IR context
        # participates in key generation.
        @lru_cache(maxsize=maxsize, typed=typed)
        def cached_func(context, *args, **kwargs):
            return func(*args, **kwargs)

        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                # Call the cached function with the context
                return cached_func(get_ir_context(func), *args, **kwargs)
            except (RuntimeError, TypeError):
                # e.g. unhashable arguments — fall back to the uncached call.
                return func(*args, **kwargs)

        # Expose cache-related methods for introspection
        wrapper.cache_clear = cached_func.cache_clear
        wrapper.cache_info = cached_func.cache_info
        return wrapper

    return decorator

# --- file: python/CuTeDSL/base_dsl/_mlir_helpers/op.py ---
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.
+ +""" +This module provides MLIR's OP helper functions +""" + + +import inspect +from functools import wraps + +from ..._mlir import ir + + +def dsl_user_op(opFunc): + @wraps(opFunc) + def wrapper(*args, **kwargs): + loc = kwargs.pop("loc", None) + if loc is None: + frame = inspect.currentframe().f_back + file_loc = ir.Location.file(frame.f_code.co_filename, frame.f_lineno, 0) + loc = ir.Location.name(frame.f_code.co_name, childLoc=file_loc) + res_or_list = opFunc(*args, **kwargs, loc=loc) + return res_or_list + + return wrapper diff --git a/python/CuTeDSL/base_dsl/ast_helpers.py b/python/CuTeDSL/base_dsl/ast_helpers.py new file mode 100644 index 00000000..e8796cff --- /dev/null +++ b/python/CuTeDSL/base_dsl/ast_helpers.py @@ -0,0 +1,584 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides helper functions that are generated by the preprocessor. +The preprocessor read through python's ast and changes the input code. +""" + +from typing import Callable, Iterator, Optional, overload + +from .utils.logger import log +from .common import * + +from ._mlir_helpers.arith import ArithValue + +class Executor: + """ + The Executor class handles dynamic and compile-time (constexpr) execution + of "for" loops and "if-else-elif" statements. + + Methods: + set_functions: Assigns the functions for checking loop bounds and + conditional evaluation. 
+ + for_dynamic: Generates MLIR for OP + for_constexpr: Executes a for loop at JIT compile-time + for_execute: Decides whether to execute the loop at compile-time or generate MLIR for OP based on the provided bounds. + + if_dynamic: Generates MLIR if OP + if_constexpr: Executes a if at JIT compile-time by python interpreter + if_execute: Decides whether to execute the if statement at compile-time or generate MLIR if OP based on the predicate. + """ + + def __init__(self): + self._is_dynamic_expression = None + self._loop_execute_range_dynamic = None + self._if_dynamic = None + self._while_dynamic = None + + def set_functions( + self, + is_dynamic_expression: Callable, + loop_execute_range_dynamic: Callable, + if_dynamic: Callable, + while_dynamic: Callable, + ): + self._is_dynamic_expression = is_dynamic_expression + self._loop_execute_range_dynamic = loop_execute_range_dynamic + self._if_dynamic = if_dynamic + self._while_dynamic = while_dynamic + + @staticmethod + def convert_to_list(x): + """This function is used to convert x to a list. + If x is None, return an empty list. + If x is not a list, return a list containing x. + Otherwise, return x itself. + """ + if x is None: + return [] + if not isinstance(x, list): + return [x] + return x + + @staticmethod + def converge_ret_val(res): + """This function is used to converge res (the return value) of the function. + If res is None, return None. + If res is a list and has only one element, return the element. + Otherwise, return res itself. 
+ """ + if res is None: + return res + elif isinstance(res, list) and len(res) == 1: + return res[0] + return res + + def for_dynamic( + self, + func: Callable, + start, + stop, + step, + used_args: list, + iter_args: list, + iter_arg_names: list, + unroll=bool, + unroll_full=int, + ): + log().info("start [%s] stop [%s] step [%s]", start, stop, step) + return self._loop_execute_range_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + @staticmethod + def for_constexpr( + func: Callable, + start: int, + stop: int, + step: int, + used_args: list, + iter_args: list, + ): + log().info("start [%s] stop [%s] step [%s]", start, stop, step) + loop_results = iter_args + log().debug("iter_args [%s]", iter_args) + for i in range(start, stop, step): + log().debug("i [%s] iter_args [%s]", i, iter_args) + loop_results = func(i, *used_args, *loop_results) + log().debug("loop_results [%s]", loop_results) + if loop_results is None: + loop_results = [] + if not isinstance(loop_results, list): + loop_results = [loop_results] + + log().debug("done loop_results [%s]", loop_results) + return Executor.converge_ret_val(loop_results) + + def for_execute( + self, + func, + start, + stop, + step, + used_args=[], + iter_args=[], + iter_arg_names=[], + unroll=-1, + unroll_full=False, + is_range_constexpr=None, + ): + assert ( + self._loop_execute_range_dynamic and self._is_dynamic_expression + ), "Functions must be set before execution." 
+ log().debug("start [%s] stop [%s] step [%s]", start, stop, step) + any_dynamic_expression = ( + self._is_dynamic_expression(start) + or self._is_dynamic_expression(stop) + or self._is_dynamic_expression(step) + ) + + if is_range_constexpr is None: + if not any_dynamic_expression: + return self.for_constexpr(func, start, stop, step, used_args, iter_args) + else: + return self.for_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + # Ensure bounds are compile-time constants for constexpr execution + if is_range_constexpr: + if any_dynamic_expression: + raise DSLRuntimeError( + "Loop bounds must be constexpr (compile-time constants)" + ) + return self.for_constexpr(func, start, stop, step, used_args, iter_args) + + # MLIR generation + return self.for_dynamic( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + ) + + def if_dynamic( + self, + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + ): + return self._if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + @staticmethod + def if_constexpr( + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + ): + if pred: + log().debug(" running then block [%s]", yield_args) + res = then_block(*used_args, *yield_args) + log().debug("result [%s]", res) + return Executor.converge_ret_val(res) + elif else_block is not None: + log().debug("running else [%s]", yield_args) + res = else_block(*used_args, *yield_args) + log().debug("result [%s]", res) + return Executor.converge_ret_val(res) + + def if_execute( + self, + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + if_constexpr=None, + ): + assert ( + self._if_dynamic and self._is_dynamic_expression + ), "Functions must 
be set before execution." + + is_if_constexpr = not self._is_dynamic_expression(pred) + if if_constexpr is None: + if is_if_constexpr: + return self.if_constexpr( + pred, then_block, else_block, used_args, yield_args + ) + else: + return self.if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + # Ensure bounds are compile-time constants for constexpr execution + if if_constexpr: + if not is_if_constexpr: + raise DSLRuntimeError( + "If predicate must be constexpr (compile-time constants)" + ) + return self.if_constexpr( + pred, then_block, else_block, used_args, yield_args + ) + + # MLIR generation + return self.if_dynamic( + pred, then_block, else_block, used_args, yield_args, yield_arg_names + ) + + def while_dynamic( + self, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + ): + return self._while_dynamic( + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + ) + + @staticmethod + def while_constexpr( + while_before_block, + while_after_block, + used_args=[], + yield_args=[], + ): + log().debug( + "while_constexpr begin %s", while_before_block.__qualname__ + ) + cond, loop_results = while_before_block(*used_args, *yield_args) + while cond: + loop_results = Executor.convert_to_list(loop_results) + log().debug( + "calling while_after [%s], [%s]", + used_args, + loop_results, + ) + loop_results = while_after_block(*used_args, *loop_results) + log().debug( + "while after [%s]", loop_results + ) + loop_results = Executor.convert_to_list(loop_results) + log().debug( + "calling while_before [%s], [%s]", + used_args, + loop_results, + ) + cond, loop_results = while_before_block(*used_args, *loop_results) + log().debug( + "while_before cond, results [%s], [%s]", + cond, + loop_results, + ) + + log().debug( + "while_constexpr results %s", loop_results + ) + return Executor.converge_ret_val(loop_results) + + def while_execute( + 
self, + pred, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + while_constexpr=None, + ): + assert ( + self._while_dynamic and self._is_dynamic_expression + ), "Functions must be set before execution." + + is_while_constexpr = not self._is_dynamic_expression(pred) + + # Ensure bounds are compile-time constants for constexpr execution + if while_constexpr: + if not is_while_constexpr: + raise DSLRuntimeError( + "While predicate must be constexpr (compile-time constants)" + ) + return self.while_constexpr( + while_before_block, while_after_block, used_args, yield_args + ) + + # MLIR generation + return self.while_dynamic( + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + ) + + +# ============================================================================= +# Decorator +# ============================================================================= + +executor = Executor() + + +def loop_selector( + start, + stop, + step, + used_args=[], + iter_args=[], + iter_arg_names=[], + unroll=-1, + unroll_full=False, + constexpr=None, +): + log().info( + "start [%s] stop [%s] step [%s] used_args [%s] iter_args [%s] unroll [%s] unroll_full [%s] constexpr [%s]", + start, + stop, + step, + used_args, + iter_args, + unroll, + unroll_full, + constexpr, + ) + from .typing import Integer, Numeric + + def _maybe_upcast(value): + if isinstance(value, Integer): + value = value.ir_value() + + return value + + start = _maybe_upcast(start) + stop = _maybe_upcast(stop) + step = _maybe_upcast(step) + + def ir_loop(func): + return executor.for_execute( + func, + start, + stop, + step, + used_args, + iter_args, + iter_arg_names, + unroll, + unroll_full, + constexpr, + ) + + return ir_loop + + +def if_selector(pred, used_args=[], yield_args=[]): + log().info("pred [%s] used_args [%s] yield_args [%s]", pred, used_args, yield_args) + # Handle Numeric types here? 
+ + from .typing import Numeric + + if isinstance(pred, Numeric): + pred = pred.value + + def ir_loop(func): + return func(pred, *used_args, *yield_args) + + return ir_loop + + +def while_selector(pred, used_args=[], yield_args=[]): + def ir_while_loop(func): + return func(pred, *used_args, *yield_args) + + return ir_while_loop + + +def while_executor( + pred, + while_before_block: Callable, + while_after_block: Callable, + used_args=[], + yield_args=[], + yield_arg_names=[], + constexpr=None, +): + return executor.while_execute( + pred, + while_before_block, + while_after_block, + used_args, + yield_args, + yield_arg_names, + constexpr, + ) + + +def if_executor( + pred, + then_block: Callable, + else_block: Optional[Callable] = None, + used_args=[], + yield_args=[], + yield_arg_names=[], + constexpr=None, +): + return executor.if_execute( + pred, then_block, else_block, used_args, yield_args, yield_arg_names, constexpr + ) + + +# ============================================================================= +# Range +# ============================================================================= + + +class range_dynamic: + @overload + def __new__(cls, stop, unroll=0, unroll_full=False): + pass + + @overload + def __new__(cls, start, stop, step, unroll=0, unroll_full=False): + pass + + def __new__(cls, *args, **kwargs): + raise DSLRuntimeError("range_dynamic should be always preprocessed to IR") + + +class range_constexpr: + def __init__(self, *args): + if len(args) == 1: + self.start = 0 + self.stop = args[0] + self.step = 1 + elif len(args) == 2: + self.start, self.stop = args + self.step = 1 + elif len(args) == 3: + self.start, self.stop, self.step = args + else: + raise DSLRuntimeError( + "range_constexpr supports up to 3 arguments (start, stop, step)" + ) + # Ensure the arguments are compile-time constants (if required) + for arg_name, arg_value in [ + ("step", self.step), + ("start", self.start), + ("stop", self.stop), + ]: + if 
executor._is_dynamic_expression(arg_value): + raise DSLRuntimeError( + f"`range_constexpr` requires `constexpr` (non-IR Values) for all arguments, " + f"but `{arg_name}` is not. If the arguments are dynamic, use `range`; the DSL " + f"will handle them during runtime. ", + suggestion="Use `range` instead of `range_constexpr`.", + ) + + def __iter__(self) -> Iterator[int]: + current = self.start + while current < self.stop: + yield current + current += self.step + + +# ============================================================================= +# If expressions +# ============================================================================= + + +def const_expr(expression): + if executor._is_dynamic_expression(expression): + raise DSLRuntimeError( + f"The function `const_expr({expression})` received a dynamic expression (non compile-time constant).", + context={ + "const_expr": "Accepts only constexpr (compile-time constant)", + "If your expression depends on dynamic values": "Avoid marking it as `const_expr()`", + "If the expression could be either dynamic or constexpr": "Omit explicit `const_expr()` marker; the DSL will infer the correct handling automatically", + }, + ) + return expression + + +def dynamic_expr(expression): + raise DSLRuntimeError("dynamic_expr should be always preprocessed to IR") + + +# ============================================================================= +# Assertion & casting +# ============================================================================= + + +def assert_executor(test, msg=None): + from .typing import Numeric + + fail = False + # Implicit convert dynamic expression to bool is not allowed + # So here explicitly do a None check + if test is not None and executor._is_dynamic_expression(test): + if isinstance(test, Numeric): + try: + test = test.to(bool) + except: + fail = True + else: + fail = True + + if not fail: + assert test, msg + else: + raise DSLRuntimeError( + "Only constexpr (Python Value) is allowed here, but 
got non-constexpr (IR Values) expression.", + suggestion = "Please replace with runtime assert." + ) + + +def bool_cast(value): + if executor._is_dynamic_expression(value): + raise DSLRuntimeError( + "Only constexpr (Python Value) is allowed here, but got non-constexpr (IR Values) expression.", + suggestion = "Please explicitly convert to boolean with expressions like comparision." + ) + return bool(value) diff --git a/python/CuTeDSL/base_dsl/ast_preprocessor.py b/python/CuTeDSL/base_dsl/ast_preprocessor.py new file mode 100644 index 00000000..e165c1db --- /dev/null +++ b/python/CuTeDSL/base_dsl/ast_preprocessor.py @@ -0,0 +1,1459 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module defines the `DSLPreprocessor` class, which acts as a Python preprocessor. +It uses Python's AST and rewrites specific Python statements such as `for` and `if-else`. + +The preprocessor operates on the following constructs: + - `for` loops: + - Rewrites `for` loops with the `@loop_selector` decorator. + - Supports `range`, `range_dynamic`, and `range_constexpr` for loop iteration. + - `if-elif-else` statements: + - Rewrites conditional statements with the `@if_selector` decorator. + - Supports `dynamic_expr` and `const_expr` in the condition expressions. + +Additionally, both `for` loops and `if-else` statements require `yield` +operation generation. The preprocessor handles this by: + - Using a `ScopeManager` to track symbols across different scopes during AST traversal. 
+ - Identifying read-only, read-write, and active variables for DSL constructs. + - Generating `yield` operations for symbols that are classified as read-write or write. + +It is designed to be generic and can handle `for` and `if` constructs from other dialects. +In such cases, the user's DSL should implement `@loop_selector` and `@if_selector` +to generate dialect-specific operations for `for` and `if` statements. +""" + +import ast +import importlib +import inspect +import textwrap +from dataclasses import dataclass +from typing import List, Set, Dict, Any, Callable, Optional +from types import ModuleType + +from .common import * +from .utils.logger import log + + +class OrderedSet: + """ + A deterministic set implementation for ordered operations. + """ + + def __init__(self, iterable=None): + self._dict = dict.fromkeys(iterable or []) + + def add(self, item): + self._dict[item] = None + + def __iter__(self): + return iter(self._dict) + + def __and__(self, other): + return OrderedSet(key for key in self._dict if key in other) + + def __or__(self, other): + new_dict = self._dict.copy() + new_dict.update(dict.fromkeys(other)) + return OrderedSet(new_dict) + + def __sub__(self, other): + return OrderedSet(key for key in self._dict if key not in other) + + +@dataclass +class ScopeManager: + """ + Manages symbol scopes during AST traversal. + Manage nested scopes during transformations. + """ + + scopes: List[Set[str]] + current_scope: Set[str] + + @classmethod + def create(cls) -> "ScopeManager": + return cls([], set()) + + def enter_scope(self) -> None: + self.scopes.append(self.current_scope.copy()) + + def exit_scope(self) -> None: + self.current_scope = self.scopes.pop() + + def add_to_scope(self, name: str) -> None: + self.current_scope.add(name) + + def get_active_symbols(self) -> Set[str]: + return set(self.current_scope) + + +class DSLPreprocessor(ast.NodeTransformer): + """ + A preprocessor for transforming Python ASTs. 
It supports: + + - Rewriting `for` loops with the `@loop_selector` decorator. + - Rewriting `if-elif-else` statements with the `@if_selector` decorator. + - Generating `yield` operations for read-write or write symbols. + """ + + DECORATOR_FOR_STATEMENT = "loop_selector" + DECORATOR_IF_STATEMENT = "if_selector" + DECORATOR_WHILE_STATEMENT = "while_selector" + IF_EXECUTOR = "if_executor" + WHILE_EXECUTOR = "while_executor" + ASSERT_EXECUTOR = "assert_executor" + BOOL_CAST = "bool_cast" + IMPLICIT_DOWNCAST_NUMERIC_TYPE = "implicitDowncastNumericType" + SUPPORTED_FOR_RANGE_STATEMENTS = {"range", "range_dynamic", "range_constexpr"} + + def __init__(self): + super().__init__() + self.counter = 0 # Unique function names for multiple loops + self.scope_manager = ScopeManager.create() + self.processed_functions = set() + self.function_counter = 0 + self.function_name = "" + self.class_name = None + self.file_name = "" + self.function_depth = 0 + self.local_closures = set() + self.function_globals = None + + def _get_module_imports(self, decorated_func): + """Extract imports from the module containing the decorated function""" + # Get the module containing the decorated function + module = inspect.getmodule(decorated_func) + if module is None: + return {} + + # Get the module source code + try: + source = inspect.getsource(module) + module_ast = ast.parse(source) + + # Extract imports from the full module + imports = {} + for node in ast.walk(module_ast): + if isinstance(node, ast.Import): + for name in node.names: + imports[name.name] = name.asname if name.asname else name.name + elif isinstance(node, ast.ImportFrom): + module_name = node.module + for name in node.names: + if name.name == "*": + # Handle wildcard imports + try: + imported_module = importlib.import_module(module_name) + imports[module_name] = imported_module + except ImportError: + pass + else: + full_name = f"{module_name}.{name.name}" + imports[full_name] = ( + name.asname if name.asname else name.name + 
) + return imports + except (IOError, TypeError): + return {} + + def exec(self, function_name, original_function, code_object, exec_globals): + # Get imports from the original module + module_imports = self._get_module_imports(original_function) + + # Import all required modules + for module_path, alias in module_imports.items(): + try: + if "." in module_path: + base_module, attribute = module_path.rsplit(".", 1) + module = importlib.import_module(base_module) + if hasattr(module, attribute): + attr = getattr(module, attribute) + exec_globals[alias] = attr + else: + path = importlib.import_module(module_path) + exec_globals[alias] = path + except (ImportError, AttributeError) as e: + raise ImportError(f"Failed to import {module_path}: {str(e)}") + + # Execute the transformed code + log().info( + "ASTPreprocessor Executing transformed code for function [%s]", + function_name, + ) + exec(code_object, exec_globals) + return exec_globals.get(function_name) + + @staticmethod + def print_ast(transformed_tree=None): + print("#", "-" * 40, "Transformed AST", "-" * 40) + unparsed_code = ast.unparse(transformed_tree) + print(unparsed_code) + print("#", "-" * 40, "End Transformed AST", "-" * 40) + + def make_func_param_name(self, base_name, used_names): + """Generate a unique parameter name that doesn't collide with existing names.""" + if base_name not in used_names: + return base_name + + i = 0 + while f"{base_name}_{i}" in used_names: + i += 1 + return f"{base_name}_{i}" + + def transform_function(self, func_name, function_pointer): + """ + Transforms a function. + """ + # Skip if the function has already been processed + if function_pointer in self.processed_functions: + log().info( + "ASTPreprocessor Skipping already processed function [%s]", func_name + ) + return [] + + # Step 1. 
Parse the given function + file_name = inspect.getsourcefile(function_pointer) + lines, start_line = inspect.getsourcelines(function_pointer) + dedented_source = textwrap.dedent("".join(lines)) + tree = ast.parse(dedented_source, filename=file_name) + # Bump the line numbers so they match the real source file + ast.increment_lineno(tree, start_line - 1) + + # Step 1.2 Check the decorator + if not self.check_decorator(tree.body[0]): + log().info( + "[%s] - Skipping function due to missing decorator", + func_name, + ) + return [] + + self.processed_functions.add(function_pointer) + log().info("ASTPreprocessor Transforming function [%s]", func_name) + + # Step 2. Transform the function + transformed_tree = self.visit(tree) + ast.fix_missing_locations(transformed_tree) + combined_body = transformed_tree.body + + # Step 3. Return the transformed tree + return combined_body + + def check_early_exit(self, tree): + """ + Checks if a given region or scope in the provided Python code has early exits. 
+ """ + + class EarlyExitChecker(ast.NodeVisitor): + def __init__(self): + self.has_early_exit = False + self.early_exit_node = None + self.early_exit_type = None + + def visit_Return(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "return" + + def visit_Break(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "break" + + def visit_Continue(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "continue" + + def visit_Raise(self, node): + self.has_early_exit = True + self.early_exit_node = node + self.early_exit_type = "raise" + + checker = EarlyExitChecker() + checker.visit(tree) + if not checker.has_early_exit: + return + raise DSLAstPreprocessorError( + message=f"Early exit ({checker.early_exit_type}) is not allowed in `{self.function_name}`" + + (f" in `{self.class_name}`" if self.class_name else ""), + filename=self.file_name, + snippet=ast.unparse(tree), + suggestion=( + "If predicates are constant expression, write like " + "`if const_expr(...)` or `for ... in range_constexpr`. " + "In that case, early exit will be executed by Python " + "interpreter, so it's supported." + ), + ) + + def is_node_constexpr(self, node) -> bool: + """ + Determines if the node is a constexpr. + Supported nodes are if, for, while statements. 
+ """ + if isinstance(node, ast.If) or isinstance(node, ast.While): + if isinstance(node.test, ast.Call): + func = node.test.func + + if isinstance(func, ast.Attribute) and func.attr == "const_expr": + return True + + elif isinstance(func, ast.Name) and func.id == "const_expr": + return True + elif isinstance(node, ast.For): + if isinstance(node.iter, ast.Call): + func = node.iter.func + if isinstance(func, ast.Attribute) and func.attr == "range_constexpr": + return True + + elif isinstance(func, ast.Name) and func.id == "range_constexpr": + return True + return False + + def transform(self, original_function, exec_globals): + """ + Transforms the provided function using the preprocessor. + """ + self.file_name = inspect.getsourcefile(original_function) + self.function_globals = exec_globals + transformed_tree = self.transform_function( + original_function.__name__, original_function + ) + unified_tree = ast.Module(body=transformed_tree, type_ignores=[]) + unified_tree = ast.fix_missing_locations(unified_tree) + + return unified_tree + + def analyze_region_variables(self, node: Union[ast.For, ast.If], active_symbols): + """ + Analyze variables in different code regions to identify read-only, write-only, + and active variables for DSL constructs. + """ + + # we need orderedset to keep the insertion order the same. otherwise generated IR is different each time + read_args = OrderedSet() + write_args = OrderedSet() + local_closure = self.local_closures + file_name = self.file_name + region_node = node + + class RegionAnalyzer(ast.NodeVisitor): + + def visit_Name(self, node): + """ + Mark every load as read, and every store as write. 
+ """ + if isinstance(node.ctx, ast.Load): + read_args.add(node.id) + elif isinstance(node.ctx, ast.Store): + write_args.add(node.id) + + @staticmethod + def get_call_base(func_node): + if isinstance(func_node, ast.Attribute): + # If the .value is another Attribute, keep digging + if isinstance(func_node.value, ast.Attribute): + return RegionAnalyzer.get_call_base(func_node.value) + # If the .value is a Name, that's our base + elif isinstance(func_node.value, ast.Name): + return func_node.value.id + else: + # Could be something else (lambda, call, etc.) + return None + elif isinstance(func_node, ast.Name): + return None + return None + + @staticmethod + def get_function_name(func_node: ast.Call): + if isinstance(func_node.func, ast.Name): + function_name = func_node.func.id + # Check if it's a method or attribute call + elif isinstance(func_node.func, ast.Attribute): + function_name = func_node.func.attr + else: + function_name = None + return function_name + + def visit_Call(self, node): + base_name = RegionAnalyzer.get_call_base(node.func) + + if isinstance(node.func, ast.Name): + func_name = node.func.id + if func_name in local_closure: + raise DSLAstPreprocessorError( + f"Function `{func_name}` is a closure and is not supported in for/if statements", + filename=file_name, + snippet=ast.unparse(region_node), + ) + + # Classes are mutable by default. Mark them as write. If they are + # dataclass(frozen=True), treat them as read in runtime. + if base_name is not None and base_name not in ("self"): + write_args.add(base_name) + + self.generic_visit(node) + + analyzer = RegionAnalyzer() + analyzer.visit(ast.Module(body=node)) + + # Argument can be Load and Store. We should just mark it as Store. 
+ read_args = read_args - write_args + + used_args = read_args & active_symbols + iter_args = write_args & active_symbols + flattend_args = used_args | iter_args + + return list(used_args), list(iter_args), list(flattend_args) + + def extract_range_args(self, iter_node): + args = iter_node.args + if len(args) == 1: + return ast.Constant(value=0), self.visit(args[0]), ast.Constant(value=1) + elif len(args) == 2: + return self.visit(args[0]), self.visit(args[1]), ast.Constant(value=1) + elif len(args) == 3: + return self.visit(args[0]), self.visit(args[1]), self.visit(args[2]) + else: + raise DSLAstPreprocessorError( + "Unsupported number of arguments in range", filename=self.file_name + ) + + def extract_unroll_args(self, iter_node): + keywords = {kw.arg: kw.value for kw in iter_node.keywords} + return ( + keywords.get("unroll", ast.Constant(value=-1)), + keywords.get("unroll_full", ast.Constant(value=False)), + ) + + def create_loop_function( + self, + func_name, + node, + start, + stop, + step, + unroll, + unroll_full, + used_args, + iter_args, + flattened_args, + is_loop_constexpr, + ): + """ + Creates a loop body function with the `loop_selector` decorator. 
+ """ + + func_args = [ast.arg(arg=node.target.id, annotation=None)] + func_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + + # Create the loop body + transformed_body = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + transformed_body.extend(transformed_stmt) + else: + transformed_body.append(transformed_stmt) + + # Handle the return for a single iterated argument correctly + if len(iter_args) == 0: + transformed_body.append(ast.Return()) + else: + transformed_body.append( + ast.Return( + value=ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in iter_args], + ctx=ast.Load(), + ) + ) + ) + + # Define the decorator with parameters + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_FOR_STATEMENT, ctx=ast.Load()), + args=[start, stop, step], + keywords=[ + ast.keyword(arg="unroll", value=unroll), + ast.keyword(arg="unroll_full", value=unroll_full), + ast.keyword(arg="constexpr", value=is_loop_constexpr), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="iter_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in iter_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="iter_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) for arg in iter_args], + ctx=ast.Load(), + ), + ), + ], + ), + node, + ) + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=ast.arguments( + posonlyargs=[], + args=func_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ), + body=transformed_body, + decorator_list=[decorator], + ), + node, + ) + + def create_loop_call(self, func_name, iter_args): + """ + Assigns the returned value from the loop function directly (without a tuple unpacking). 
+ """ + if len(iter_args) == 0: + return ast.Expr(value=ast.Name(id=func_name, ctx=ast.Load())) + elif len(iter_args) == 1: + return ast.Assign( + targets=[ast.Name(id=iter_args[0], ctx=ast.Store())], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + else: + return ast.Assign( + targets=[ + ast.Tuple( + elts=[ast.Name(id=var, ctx=ast.Store()) for var in iter_args], + ctx=ast.Store(), + ) + ], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + + def is_supported_range_call(self, node): + return ( + isinstance(node, ast.For) + and isinstance(node.iter, ast.Call) + and ( + ( + isinstance(node.iter.func, ast.Name) + and node.iter.func.id in self.SUPPORTED_FOR_RANGE_STATEMENTS + ) + or ( + isinstance(node.iter.func, ast.Attribute) + and node.iter.func.attr in self.SUPPORTED_FOR_RANGE_STATEMENTS + ) + ) + ) + + def get_loop_constexpr(self, node): + if not self.is_supported_range_call(node): + return None + + # Map function names to their constexpr values + constexpr_map = {"range": None, "range_dynamic": False, "range_constexpr": True} + range_name = ( + node.iter.func.id + if isinstance(node.iter.func, ast.Name) + else node.iter.func.attr + ) + return ast.Constant(value=constexpr_map[range_name]) + + def transform_for_loop(self, node, active_symbols): + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + return node + + # We only support range, range_constexpr, range_dynamic + if self.is_supported_range_call(node): + constexpr_val = self.get_loop_constexpr(node) + # Check for early exit and raise exception + self.check_early_exit(node) + start, stop, step = self.extract_range_args(node.iter) + unroll, unroll_full = self.extract_unroll_args(node.iter) + used_args, iter_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + + func_name = f"loop_body_{self.counter}" + self.counter += 1 + + func_def = self.create_loop_function( + func_name, + node, + start, + stop, + step, + unroll, + 
unroll_full, + used_args, + iter_args, + flat_args, + constexpr_val, + ) + + assign = ast.copy_location( + self.create_loop_call(func_name, iter_args), node + ) + + # This should work fine as it modifies the AST structure + return [func_def, assign] + + self.generic_visit(node) + + return node + + def visit_BoolOp(self, node): + # Visit child nodes first + self.generic_visit(node) + + # It is necessary to expand short circuit evaluation explicit here + # Although we do not support inline if-else for IR generation, this is actually evaluated in Python + # So it's fine here + # Transform "and" to "and_" + if isinstance(node.op, ast.And): + # Create an if-else statement in AST form + # if type(lhs) == bool and lhs == False: + # return lhs + # else + # return and_(lhs, rhs) + short_circuit_value = ast.Constant(value=False) + helper_func = ast.Name(id="and_", ctx=ast.Load()) + # Transform "or" to "or_" + elif isinstance(node.op, ast.Or): + # Create an if-else statement in AST form + # if type(lhs) == bool and lhs == True: + # return lhs + # else + # return or_(lhs, rhs) + short_circuit_value = ast.Constant(value=True) + helper_func = ast.Name(id="or_", ctx=ast.Load()) + else: + # BoolOp should be either And or Or + raise DSLAstPreprocessorError( + f"Unsupported boolean operation: {node.op}", + filename=self.file_name, + snippet=ast.unparse(node), + ) + + test = ast.BoolOp( + op=ast.And(), + values=[ + ast.Compare( + left=ast.Call( + func=ast.Name(id="type", ctx=ast.Load()), + args=[node.values[0]], + keywords=[], + ), + ops=[ast.Eq()], + comparators=[ast.Name(id="bool", ctx=ast.Load())], + ), + ast.Compare( + left=node.values[0], + ops=[ast.Eq()], + comparators=[short_circuit_value], + ), + ], + ) + return ast.copy_location( + ast.IfExp( + test=test, + body=node.values[0], + orelse=ast.Call( + func=helper_func, + args=node.values, + keywords=[], + ), + ), + node, + ) + + def visit_UnaryOp(self, node): + # Visit child nodes first + self.generic_visit(node) + + # 
Transform "not" to "~" as we overload __invert__ + if isinstance(node.op, ast.Not): + func_name = ast.Name(id="not_", ctx=ast.Load()) + return ast.copy_location( + ast.Call(func=func_name, args=[node.operand], keywords=[]), node + ) + + return node + + def visit_For(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + if isinstance(node.target, ast.Name): + self.scope_manager.add_to_scope(node.target.id) + + new_for_node = self.transform_for_loop(node, active_symbols) + self.scope_manager.exit_scope() + return new_for_node + + def visit_Name(self, node): + self.generic_visit(node) + return node + + def visit_Assert(self, node): + test = self.visit(node.test) + + args = [ast.keyword(arg="test", value=test)] + if node.msg: + msg = self.visit(node.msg) + args.append(ast.keyword(arg="msg", value=msg)) + + # Rewrite to assert_executor(test, msg) + new_node = ast.Expr( + ast.Call( + func=ast.Name(id=self.ASSERT_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=args, + ) + ) + + # Propagate line number from original node to new node + ast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + func = node.func + self.generic_visit(node) + + # Check if the function is 'bool' + if isinstance(func, ast.Name) and func.id == "bool": + return ast.copy_location( + ast.Call( + func=ast.Name(id=self.BOOL_CAST, ctx=ast.Load()), + args=[node.args[0]], + keywords=[], + ), + node, + ) + elif isinstance(func, ast.Attribute) and isinstance(func.value, ast.Name): + def create_downcast_call(arg): + return ast.copy_location( + ast.Call( + func=ast.Name( + id=self.IMPLICIT_DOWNCAST_NUMERIC_TYPE, ctx=ast.Load() + ), + args=[arg], + keywords=[], + ), + arg, + ) + module = self.function_globals.get(func.value.id) + if isinstance(module, ModuleType) and module.__package__.endswith( + "._mlir.dialects" + ): + # Check if argument is Numeric, if so, call ir_value() + args = [] + for arg in node.args: + 
args.append(create_downcast_call(arg)) + kwargs = [] + for kwarg in node.keywords: + kwargs.append( + ast.copy_location( + ast.keyword( + arg=kwarg.arg, + value=create_downcast_call(kwarg.value), + ), + kwarg, + ) + ) + return ast.copy_location( + ast.Call(func=func, args=args, keywords=kwargs), node + ) + + return node + + def visit_ClassDef(self, node): + self.class_name = node.name + self.generic_visit(node) + self.class_name = None + return node + + def _visit_target(self, target): + if isinstance(target, ast.Name): + self.scope_manager.add_to_scope(target.id) + elif isinstance(target, ast.Tuple): + for t in target.elts: + if isinstance(t, ast.Name): + self.scope_manager.add_to_scope(t.id) + + def visit_Assign(self, node): + for target in node.targets: + self._visit_target(target) + self.generic_visit(node) + return node + + def visit_AugAssign(self, node): + self._visit_target(node.target) + self.generic_visit(node) + return node + + def check_decorator(self, node: ast.AST) -> bool: + """ + Check if the function has the correct decorator for preprocessing. + """ + if not isinstance(node, ast.FunctionDef): + return False + decorator_list = node.decorator_list + if len(decorator_list) == 0: + return False + + for d in decorator_list: + if isinstance(d, ast.Call): + if isinstance(d.func, ast.Attribute): + if d.func.attr in ["jit", "kernel"]: + if d.keywords == []: + return True + for keyword in d.keywords: + if keyword.arg == "preprocess": + try: + if isinstance(keyword.value, ast.Constant): + return keyword.value.value + else: + return ast.literal_eval(keyword.value) + except: + pass + + elif isinstance(d, ast.Attribute): + if d.attr in ["jit", "kernel"]: + return True + + return False + + def remove_dsl_decorator(self, decorator_list): + """ + Remove .jit and .kernel decorators + The decorator can be in two forms: + - @jit(...) 
+ - @jit + """ + new_decorator_list = [] + decorator_names = ["jit", "kernel"] + for d in decorator_list: + is_jit_or_kernel = False + if isinstance(d, ast.Call): + if isinstance(d.func, ast.Attribute): + if d.func.attr in decorator_names: + is_jit_or_kernel = True + elif isinstance(d, ast.Attribute): + if d.attr in decorator_names: + is_jit_or_kernel = True + + if not is_jit_or_kernel: + new_decorator_list.append(d) + return new_decorator_list + + def visit_FunctionDef(self, node): + self.scope_manager.enter_scope() + self.function_counter += 1 + self.function_name = node.name + if self.function_depth > 0: + self.local_closures.add(node.name) + + self.function_depth += 1 + + # Add function name and arguments + self.scope_manager.add_to_scope(node.name) + for arg in node.args.args: + self.scope_manager.add_to_scope(arg.arg) + + self.generic_visit(node) + self.scope_manager.exit_scope() + + self.function_depth -= 1 + + # Remove .jit and .kernel decorators + node.decorator_list = self.remove_dsl_decorator(node.decorator_list) + return node + + def visit_With(self, node): + self.scope_manager.enter_scope() + + for item in node.items: + if isinstance(item.optional_vars, ast.Name): + self.scope_manager.add_to_scope(item.optional_vars.id) + self.generic_visit(node) + + self.scope_manager.exit_scope() + return node + + def visit_While(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + # Check for early exit and raise exception + self.check_early_exit(node) + + used_args, yield_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + func_name = f"while_region_{self.counter}" + self.counter += 1 + + func_def = self.create_while_function( + func_name, node, used_args, yield_args, flat_args + ) + assign = 
ast.copy_location(self.create_loop_call(func_name, yield_args), node) + + self.scope_manager.exit_scope() + return [func_def, assign] + + def visit_Try(self, node): + self.scope_manager.enter_scope() + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + def visit_ExceptHandler(self, node): + self.scope_manager.enter_scope() + if node.name: # Exception variable + self.scope_manager.add_to_scope(node.name) + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + def create_if_call(self, func_name, yield_args, flat_args): + """Creates the assignment statement for the if function call""" + if not yield_args: + return ast.Expr(value=ast.Name(id=func_name, ctx=ast.Load())) + elif len(yield_args) == 1: + return ast.Assign( + targets=[ast.Name(id=yield_args[0], ctx=ast.Store())], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + else: + return ast.Assign( + targets=[ + ast.Tuple( + elts=[ast.Name(id=var, ctx=ast.Store()) for var in yield_args], + ctx=ast.Store(), + ) + ], + value=ast.Name(id=func_name, ctx=ast.Load()), + ) + + def visit_IfExp(self, node): + """ + Visits an inline if-else expression (ternary operator). + This is the Python equivalent of `x if condition else y`. 
+ """ + # Check if the condition is constexpr + constexpr_val, test = self.is_constexpr(node) + + node.test = test + node.body = self.visit(node.body) + node.orelse = self.visit(node.orelse) + + # If it's a constexpr node, we don't need to transform it + if constexpr_val.value is True: + return node + + # Emit + # node if type(pred) == bool else select_(pred, body, orelse) + # so if pred is a python bool, use python to short-circuit and avoid emit arith.select + return ast.copy_location( + ast.IfExp( + test=ast.Compare( + left=ast.Call( + func=ast.Name(id="type", ctx=ast.Load()), + args=[node.test], + keywords=[], + ), + ops=[ast.Eq()], + comparators=[ast.Name(id="bool", ctx=ast.Load())], + ), + body=node, # Original ternary expression + orelse=ast.Call( + func=ast.Name(id="select_", ctx=ast.Load()), + args=[ + node.test, + node.body, + node.orelse, + ], + keywords=[], + ), + ), + node, + ) + + def visit_If(self, node): + active_symbols = self.scope_manager.get_active_symbols() + self.scope_manager.enter_scope() + + # Constexpr doesn't get preprocessed + if self.is_node_constexpr(node): + self.generic_visit(node) + self.scope_manager.exit_scope() + return node + + # Check for early exit and raise exception + self.check_early_exit(node) + + used_args, yield_args, flat_args = self.analyze_region_variables( + node, active_symbols + ) + func_name = f"if_region_{self.counter}" + self.counter += 1 + + func_def = self.create_if_function( + func_name, node, used_args, yield_args, flat_args + ) + assign = ast.copy_location( + self.create_if_call(func_name, yield_args, flat_args), node + ) + + self.scope_manager.exit_scope() + return [func_def, assign] + + def is_constexpr(self, node): + """Determines if the if condition is wrapped in const_expr or dynamic_expr""" + if isinstance(node.test, ast.Call): + func = node.test.func + + # Check if the function is 'const_expr' + if isinstance(func, ast.Name) and func.id == "const_expr": + return ast.Constant(value=True), 
node.test.args[0] + + # Check if the function is 'dynamic_expr' + elif isinstance(func, ast.Name) and func.id == "dynamic_expr": + return ast.Constant(value=False), self.visit(node.test.args[0]) + + # Check if it's an attribute access for 'const_expr' or 'dynamic_expr' + elif isinstance(func, ast.Attribute): + if func.attr == "const_expr": + return ast.Constant(value=True), node.test.args[0] + elif func.attr == "dynamic_expr": + return ast.Constant(value=False), self.visit(node.test.args[0]) + + return ast.Constant(value=None), self.visit(node.test) + + def create_if_function( + self, func_name, node, used_args, yield_args, flattened_args + ): + is_constexpr, test_expr = self.is_constexpr(node) + pred_name = self.make_func_param_name("pred", flattened_args) + func_args = [ast.arg(arg=pred_name, annotation=None)] + func_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + func_args_then_else = [ + ast.arg(arg=var, annotation=None) for var in flattened_args + ] + + then_body = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + then_body.extend(transformed_stmt) + else: + then_body.append(transformed_stmt) + + # Create common return list for all blocks + return_list = ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in yield_args], + ctx=ast.Load(), + ) + + # Create common function arguments + func_decorator_arguments = ast.arguments( + posonlyargs=[], args=func_args, kwonlyargs=[], kw_defaults=[], defaults=[] + ) + func_then_else_arguments = ast.arguments( + posonlyargs=[], + args=func_args_then_else, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + then_block_name = f"then_block_{self.counter}" + else_block_name = f"else_block_{self.counter}" + elif_region_name = f"elif_region_{self.counter}" + self.counter += 1 + + # Create then block + then_block = ast.copy_location( + ast.FunctionDef( + name=then_block_name, + 
args=func_then_else_arguments, + body=then_body + [ast.Return(value=return_list)], + decorator_list=[], + ), + node, + ) + + # Decorator keywords + decorator_keywords = [ + ast.keyword( + arg="pred", value=test_expr + ), # ast.Name(id="pred", ctx=ast.Load()) + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + + # Create decorator + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_IF_STATEMENT, ctx=ast.Load()), + args=[], + keywords=decorator_keywords, + ), + node, + ) + + # Executor keywords + execute_keywords = [ + ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="then_block", value=ast.Name(id=then_block_name, ctx=ast.Load()) + ), + ] + + # Handle different cases + if not yield_args and node.orelse == []: + # No yield_args case - only then_block needed + execute_call = ast.copy_location( + ast.Call( + func=ast.copy_location( + ast.Name(id=self.IF_EXECUTOR, ctx=ast.Load()), node + ), + args=[], + keywords=execute_keywords, + ), + node, + ) + func_body = [then_block, ast.Return(value=execute_call)] + else: + # Create else block based on node.orelse + if node.orelse: + if len(node.orelse) == 1 and isinstance(node.orelse[0], ast.If): + # Handle elif case + elif_node = node.orelse[0] + nested_if_name = elif_region_name + # Recursion 
for nested elif + nested_if = self.create_if_function( + nested_if_name, elif_node, used_args, yield_args, flattened_args + ) + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=[ + nested_if, + ast.Return( + value=ast.Name(id=nested_if_name, ctx=ast.Load()) + ), + ], + decorator_list=[], + ) + else: + + else_body = [] + for stmt in node.orelse: + transformed_stmt = self.visit( + stmt + ) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + else_body.extend(transformed_stmt) + else: + else_body.append(transformed_stmt) + + # Regular else block + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=else_body + [ast.Return(value=return_list)], + decorator_list=[], + ) + else: + # Default else block + else_block = ast.FunctionDef( + name=else_block_name, + args=func_then_else_arguments, + body=[ast.Return(value=return_list)], + decorator_list=[], + ) + + # Add else_block to execute keywords + execute_keywords.append( + ast.keyword( + arg="else_block", value=ast.Name(id=else_block_name, ctx=ast.Load()) + ) + ) + # Add constexpr + execute_keywords.append(ast.keyword(arg="constexpr", value=is_constexpr)) + + execute_call = ast.copy_location( + ast.Call( + func=ast.Name(id=self.IF_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=execute_keywords, + ), + node, + ) + func_body = [ + then_block, + ast.copy_location(else_block, node), + ast.Return(value=execute_call), + ] + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=func_decorator_arguments, + body=func_body, + decorator_list=[decorator], + ), + node, + ) + + def create_while_function( + self, func_name, node, used_args, yield_args, flattened_args + ): + """Create a while function that looks like: + + @while_selector(pred, used_args=[], yield_args=[]) + def while_region(pred, flattened_args): + def while_before_block(*used_args, *yield_args): + # Note that during eval of pred can 
possibly alter yield_args + return *pred, yield_args + def while_after_block(*used_args, yield_args): + ...loop_body_transformed... + return yield_args + return self.while_executor(pred, used_args, yield_args, + while_before_block, while_after_block, constexpr) + yield_args = while_region(pred, flattened_args) + + Which will later be executed as pseudo-code: + + # Dynamic mode: + scf.WhileOp(types(yield_args), yield_args) + with InsertionPoint(before_block): + cond, yield_args = while_before_block(*flattened_args) + scf.ConditionOp(cond, yield_args) + with InsertionPoint(after_block): + yield_args = while_after_block(yield_args) + scf.YieldOp(yield_args) + return while_op.results_ + + # Const mode: + cond, yield_args = while_before_block(yield_args) + while pred: + yield_args = body_block(yield_args) + cond, yield_args = while_before_block(yield_args) + return yield_args + """ + is_constexpr, test_expr = self.is_constexpr(node) + pred_name = self.make_func_param_name("pred", flattened_args) + + # Section: decorator construction + decorator_keywords = [ + ast.keyword(arg="pred", value=test_expr), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + decorator = ast.copy_location( + ast.Call( + func=ast.Name(id=self.DECORATOR_WHILE_STATEMENT, ctx=ast.Load()), + args=[], + keywords=decorator_keywords, + ), + node, + ) + + # Section: Shared initialization for before and after blocks + while_before_block_name = f"while_before_block_{self.counter}" + while_after_block_name = f"while_after_block_{self.counter}" + self.counter += 1 + block_args_args = [ast.arg(arg=var, annotation=None) for var in used_args] + block_args_args += [ast.arg(arg=var, annotation=None) for var in yield_args] + block_args = ast.arguments( + posonlyargs=[], + 
args=block_args_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + yield_args_ast_name_list = ast.List( + elts=[ast.Name(id=var, ctx=ast.Load()) for var in yield_args], + ctx=ast.Load(), + ) + + # Section: while_before_block FunctionDef, which contains condition + while_before_return_list = ast.List( + elts=[test_expr, yield_args_ast_name_list], + ctx=ast.Load(), + ) + while_before_stmts = [ast.Return(value=while_before_return_list)] + while_before_block = ast.copy_location( + ast.FunctionDef( + name=while_before_block_name, + args=block_args, + body=while_before_stmts, + decorator_list=[], + ), + test_expr, + ) + + # Section: while_after_block FunctionDef, which contains loop body + while_after_stmts = [] + for stmt in node.body: + transformed_stmt = self.visit(stmt) # Recursively visit inner statements + if isinstance(transformed_stmt, list): + while_after_stmts.extend(transformed_stmt) + else: + while_after_stmts.append(transformed_stmt) + while_after_stmts.append(ast.Return(value=yield_args_ast_name_list)) + + while_after_block = ast.copy_location( + ast.FunctionDef( + name=while_after_block_name, + args=block_args, + body=while_after_stmts, + decorator_list=[], + ), + node, + ) + + # Section: Execute via executor + execute_keywords = [ + ast.keyword(arg="pred", value=ast.Name(id=pred_name, ctx=ast.Load())), + ast.keyword( + arg="used_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in used_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="yield_args", + value=ast.List( + elts=[ast.Name(id=arg, ctx=ast.Load()) for arg in yield_args], + ctx=ast.Load(), + ), + ), + ast.keyword( + arg="while_before_block", + value=ast.Name(id=while_before_block_name, ctx=ast.Load()), + ), + ast.keyword( + arg="while_after_block", + value=ast.Name(id=while_after_block_name, ctx=ast.Load()), + ), + ast.keyword(arg="constexpr", value=is_constexpr), + ast.keyword( + arg="yield_arg_names", + value=ast.List( + elts=[ast.Constant(value=arg) 
for arg in yield_args], + ctx=ast.Load(), + ), + ), + ] + + execute_call = ast.Call( + func=ast.Name(id=self.WHILE_EXECUTOR, ctx=ast.Load()), + args=[], + keywords=execute_keywords, + ) + + # Putting everything together, FunctionDef for while_region + func_args_args = [ast.arg(arg=pred_name, annotation=None)] + func_args_args += [ast.arg(arg=var, annotation=None) for var in flattened_args] + func_args = ast.arguments( + posonlyargs=[], + args=func_args_args, + kwonlyargs=[], + kw_defaults=[], + defaults=[], + ) + + return ast.copy_location( + ast.FunctionDef( + name=func_name, + args=func_args, + body=[ + while_before_block, + while_after_block, + ast.Return(value=execute_call), + ], + decorator_list=[decorator], + ), + node, + ) diff --git a/python/CuTeDSL/base_dsl/cache_helpers.py b/python/CuTeDSL/base_dsl/cache_helpers.py new file mode 100644 index 00000000..8ea08874 --- /dev/null +++ b/python/CuTeDSL/base_dsl/cache_helpers.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +""" +This module provides jit cache load/dump helper functions +""" + +import os +import uuid +import random +import tempfile +import pwd +import time +from pathlib import Path +import hashlib + +from .utils.logger import log +from .jit_executor import JitExecutor + +from .._mlir import ir + +# ============================================================================= +# Jit Cache Helper functions +# ============================================================================= + + +def get_current_user(): + # Try to get the user from the environment variable first + user = os.getenv("USER") or os.getenv("USERNAME") + if not user: + # Fallback for Unix-like systems + user = pwd.getpwuid(os.getuid()).pw_name + return user + + +try: + default_generated_ir_path = f"/tmp/{get_current_user()}/cutlass_python_cache/" +except Exception as e: + # If all else fails, provide a default fallback path + default_generated_ir_path = "/tmp/cutlass_python_cache/" + print(f"Could not determine user, using default path. 
Error: {e}") + + +def load_ir(file, asBytecode=False): + """Load generated IR from a file.""" + assert "mlir" in file + func_name = file.split(".mlir")[0].split("dsl_")[-1] + with ir.Context() as ctx: + with open(file, "rb" if asBytecode else "r") as f: + module = ir.Module.parse(f.read()) + + return func_name, module + + +def make_unique_filename(fpath: Path, new_ext: str = None) -> Path: + """Generate a unique filename with an optional new extension.""" + random_part = random.randint(0, 999999) + timestamp = time.time() + hash_input = f"{fpath}_{timestamp}_{random_part}".encode() + hash_code = hashlib.md5(hash_input).hexdigest()[:16] # Shorter hash for readability + stem_with_hash = f"{fpath.stem}_{hash_code}" + return fpath.with_name(stem_with_hash).with_suffix(new_ext or fpath.suffix) + + +def save_ir( + dsl_name: str, + module: object, + fname: str, + isTemp: bool = False, + asBytecode: bool = False, +) -> str: + """Save generated IR to a file.""" + initial_name = f"{dsl_name.lower()}_{fname}.mlir" + save_path = Path(tempfile.gettempdir() if isTemp else os.getcwd()) + save_fname = save_path / initial_name + # Random ID to avoid any collisions + rnd_id = str(uuid.uuid4()) + pid = os.getpid() + # use temp dir to be robust against program interruptions + temp_dir = os.path.join(save_path, f"tmp.pid_{pid}_{rnd_id}") + # If the process exits abnormally, may leave a temporary folder. Needs to be removed manually. 
+ os.makedirs(temp_dir, exist_ok=False) + temp_fname = os.path.join(temp_dir, initial_name) + + if asBytecode: + with open(temp_fname, "wb") as f: + module.operation.write_bytecode(f) + else: + with open(temp_fname, "w") as f: + print(module, file=f) + # os.replace is guaranteed to be atomic on POSIX systems if it succeeds + # so filepath cannot see a partial write + os.replace(temp_fname, save_fname) + os.removedirs(temp_dir) + log().debug("Generated IR saved into %s", save_fname) + return save_fname + + +def check_func_name(jit_cache, func_name): + if not func_name in jit_cache: + jit_cache[func_name] = JitExecutor(None, None, None, None, None, None) + return jit_cache + + +def load_cache_from_path(dsl_name, cache_limit, path=default_generated_ir_path): + """Load cache from a directory path.""" + if not os.path.exists(path): + return dict() + files = os.listdir(path) + jit_cache = dict() + try: + for idx, file in enumerate(files): + if idx >= int(cache_limit): + break + # identify dsl prefix + if not file.startswith(f"{dsl_name.lower()}"): + continue + if ".mlir" in file: + func_name, ir_module = load_ir( + os.path.join(path, file), asBytecode=True + ) + jit_cache = check_func_name(jit_cache, func_name) + jit_cache[func_name].ir_module = ir_module + except Exception as e: + print(f"{dsl_name} failed with loading generated IR cache.", e) + jit_cache = dict() + return jit_cache + + +def dump_cache_to_path( + dsl_name, jit_cache, cache_limit, path=default_generated_ir_path +): + log().info("JIT cache : dumping [%s] items=[%s]", dsl_name, len(jit_cache)) + if not os.path.exists(path): + os.makedirs(path) + original_path = os.getcwd() + try: + os.chdir(path) + for idx, [key, value] in enumerate(jit_cache.items()): + if idx >= int(cache_limit): + break + save_ir(dsl_name, value.ir_module, key, asBytecode=True) + except Exception as e: + print(f"{dsl_name} failed with caching generated IR", e) + finally: + os.chdir(original_path) diff --git 
a/python/CuTeDSL/base_dsl/common.py b/python/CuTeDSL/base_dsl/common.py new file mode 100644 index 00000000..3cf413ed --- /dev/null +++ b/python/CuTeDSL/base_dsl/common.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import os +from typing import Any, Dict, Iterable, Optional, Union + +""" +This module provides the DSL Exception classes, usable by any Dialect. +""" + + +# Add color codes at the top of the file after imports +class Colors: + """ANSI color codes for error messages""" + + RED = "\033[91m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + GREEN = "\033[92m" + BOLD = "\033[1m" + RESET = "\033[0m" + + +# ============================================================================= +# DSL Exceptions +# ============================================================================= + + +class DSLBaseError(Exception): + """ + Base exception for DSL-related errors. + Provides optional contextual metadata to aid in debugging. 
+ """ + + def __init__( + self, + message: str, + line: Optional[int] = None, + snippet: Optional[str] = None, + filename: Optional[str] = None, + error_code: Optional[Union[str, int]] = None, + context: Optional[Union[Dict[str, Any], str]] = None, + suggestion: Optional[str] = None, + cause: Optional[BaseException] = None, + ) -> None: + self.message = message + self.line = line + self.filename = filename + self.snippet = snippet + self.error_code = error_code + self.context = context + self.suggestion = suggestion + self.cause = cause + + super().__init__(self._format_message()) + + def _format_message(self): + """ + Formats the complete error message with available metadata. + Override this in subclasses if you want to change formatting logic. + """ + parts = [f"{self.__class__.__name__}: {self.message}"] + + if self.error_code is not None: + parts.append(f"{Colors.BOLD}Error Code:{Colors.RESET} {self.error_code}\n") + + if self.line is not None: + parts.append(f" Line: {self.line}") + + if self.filename is not None: + parts.append(f" File: {self.filename}") + + if self.snippet: + # Optionally truncate long snippets for readability + parts.append(f" Snippet: \n {self.snippet}") + + if self.cause: + parts.append(f" Caused exception: {self.cause}") + + if self.context: + if isinstance(self.context, dict): + parts.append(f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET}\n") + for key, value in self.context.items(): + parts.append(f" {key}: {value}") + else: + parts.append( + f"{Colors.BLUE}🔍 Additional Context:{Colors.RESET} {self.context}" + ) + + if self.suggestion: + parts.append(f"{Colors.GREEN}💡 Suggestions:{Colors.RESET}") + if isinstance(self.suggestion, (list, tuple)): + for suggestion in self.suggestion: + parts.append(f" {Colors.GREEN}{suggestion}{Colors.RESET}") + else: + parts.append(f" {self.suggestion}") + + return "\n".join(parts) + + +class DSLRuntimeError(DSLBaseError): + """ + Raised when an error occurs during JIT-time code generation in the 
DSL. + """ + + # Inherits all logic from DSLBaseError; override methods if you need + # specialized behavior or formatting for runtime errors. + pass + + +def _get_friendly_cuda_error_message(error_code, error_name): + # Avoid circular dependency + from .runtime.cuda import get_device_info + + """Get a user-friendly error message for common CUDA errors.""" + # Strip the byte string markers if present + if isinstance(error_name, bytes): + error_name = error_name.decode("utf-8") + elif ( + isinstance(error_name, str) + and error_name.startswith("b'") + and error_name.endswith("'") + ): + error_name = error_name[2:-1] + + # Add target architecture info + target_arch = os.getenv("CUTE_DSL_ARCH", "unknown") + + error_messages = { + "CUDA_ERROR_INVALID_SOURCE": ( + f"{Colors.RED}❌ Failed to load CUDA kernel - likely architecture mismatch.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_NO_BINARY_FOR_GPU": ( + f"{Colors.RED}❌ CUDA kernel not compatible with your GPU.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_OUT_OF_MEMORY": ( + f"{Colors.RED}💾 CUDA out of memory error.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_INVALID_DEVICE": ( + f"{Colors.RED}❌ Invalid CUDA device.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_NOT_INITIALIZED": ( + f"{Colors.RED}❌ CUDA context not initialized.{Colors.RESET}\n\n" + ), + "CUDA_ERROR_INVALID_VALUE": ( + f"{Colors.RED}⚠️ Invalid parameter passed to CUDA operation.{Colors.RESET}\n\n" + f"{Colors.YELLOW}This is likely a bug - please report it with:{Colors.RESET}" + ), + } + + error_suggestions = { + "CUDA_ERROR_INVALID_SOURCE": ( + f"1. Ensure env CUTE_DSL_ARCH matches your GPU architecture", + f"2. Clear the compilation cache and regenerate the kernel", + f"3. Check CUDA toolkit installation", + ), + "CUDA_ERROR_NO_BINARY_FOR_GPU": ( + f"Set env CUTE_DSL_ARCH to match your GPU architecture", + ), + "CUDA_ERROR_OUT_OF_MEMORY": ( + f"1. Reduce batch size", + f"2. Reduce model size", + f"3. Free unused GPU memory", + ), + "CUDA_ERROR_INVALID_DEVICE": ( + f"1. 
Check if CUDA device is properly initialized", + f"2. Verify GPU is detected: nvidia-smi", + f"3. Check CUDA_VISIBLE_DEVICES environment variable", + ), + "CUDA_ERROR_NOT_INITIALIZED": ( + f"1. Check CUDA driver installation", + f"2. call `cuda.cuInit(0)` before any other CUDA operation", + f"3. Run nvidia-smi to confirm GPU status", + ), + "CUDA_ERROR_INVALID_VALUE": ( + f"1. Your GPU model", + f"2. SM ARCH setting", + f"3. Steps to reproduce", + ), + } + + message = error_messages.get( + error_name, f"{Colors.RED}Unknown CUDA error{Colors.RESET}" + ) + + # Add debug information + debug_info = f"\n- {Colors.BOLD}Error name: {error_name}\n" + debug_info += f"- CUDA_TOOLKIT_PATH: {os.getenv('CUDA_TOOLKIT_PATH', 'not set')}\n" + debug_info += ( + f"- Target SM ARCH: {os.getenv('CUTE_DSL_ARCH', 'not set')}{Colors.RESET}\n" + ) + + try: + # Get GPU information using CUDA Python API + debug_info += f"\n{Colors.BLUE}📊 GPU Information:{Colors.RESET}\n" + gpu_info = get_device_info() + debug_info += gpu_info.pretty_str() + + if target_arch and gpu_info.compatible_archs: + debug_info += f"\n{Colors.BOLD}Compatibility Check:{Colors.RESET}\n" + + if target_arch not in gpu_info.compatible_archs: + debug_info += ( + f"{Colors.RED}❌ Error: Target SM ARCH {target_arch} is not compatible\n" + f"💡 Please use one of SM ARCHs: " + f"{Colors.GREEN}{', '.join(gpu_info.compatible_archs or [])}{Colors.RESET}\n" + ) + elif target_arch != gpu_info.sm_arch: + debug_info += ( + f"{Colors.YELLOW}⚠️ Warning: Using compatible but non-optimal architecture\n" + f"• Current: {target_arch}\n" + f"• Recommended: {Colors.GREEN}{gpu_info.sm_arch}{Colors.RESET} (native)\n" + ) + else: + debug_info += f"{Colors.GREEN}✓ Using optimal architecture: {gpu_info.sm_arch}{Colors.RESET}\n" + + except Exception as e: + debug_info += ( + f"\n{Colors.YELLOW}ℹ️ Could not retrieve GPU info: {str(e)}{Colors.RESET}" + ) + + return message, debug_info, error_suggestions.get(error_name, "") + + +class 
DSLCudaRuntimeError(DSLBaseError): + """ + Raised when an error occurs during CUDA runtime code generation in the DSL. + """ + + # Inherits all logic from DSLRuntimeError; override methods if you need + # specialized behavior or formatting for runtime errors. + def __init__(self, error_code, error_name) -> None: + self._error_code = error_code + self._error_name = error_name + message, debug_info, suggestion = _get_friendly_cuda_error_message( + error_code, error_name + ) + + super().__init__( + message, error_code=error_code, context=debug_info, suggestion=suggestion + ) + + +class DSLAstPreprocessorError(DSLBaseError): + """ + Raised when an error occurs during AST preprocessing or visiting in the DSL. + """ + + # Same approach: You could override _format_message if you want + # to emphasize AST node details or anything specific to preprocessing. + pass + + +class DSLNotImplemented(DSLBaseError): + """ + Raised when a feature of the DSL is not implemented yet. + """ + + # Useful for stubs in your DSL that you plan to implement in the future. + pass diff --git a/python/CuTeDSL/base_dsl/compiler.py b/python/CuTeDSL/base_dsl/compiler.py new file mode 100644 index 00000000..2e5b75cd --- /dev/null +++ b/python/CuTeDSL/base_dsl/compiler.py @@ -0,0 +1,221 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a class that compiles generated IR using MLIR's PassManager +and executes it using MLIR's ExecutionEngine. 
+ +""" + +from typing import Sequence, Optional, Tuple +import os +import sys +import inspect +from .common import DSLRuntimeError + +_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_PATH) + +from .._mlir import ir + + +# ============================================================================= +# Compiler Class +# ============================================================================= + + +class CompilationError(RuntimeError): + """Custom error class for compilation failures""" + + # Add ANSI color codes + RED = "\033[91m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + GREEN = "\033[92m" + BOLD = "\033[1m" + RESET = "\033[0m" + + def __init__( + self, + message: str, + nvvm_error: Optional[str] = None, + ir_context: Optional[str] = None, + cuda_toolkit: Optional[str] = None, + arch: Optional[str] = None, + ): + self.nvvm_error = nvvm_error + self.ir_context = ir_context + self.cuda_toolkit = cuda_toolkit + self.arch = arch + # Call parent with formatted error to avoid showing class name + super().__init__("") # Empty string to avoid class name + # Store formatted error for str() representation + self._formatted_error = self._format_error() + + def __str__(self) -> str: + """Override string representation to avoid showing class name""" + return self._formatted_error + + def __repr__(self) -> str: + """Override repr representation to avoid showing class name""" + return self._formatted_error + + def _format_error(self) -> str: + if not self.nvvm_error: + return str(self.args[0]) + + return f"""NVVM Compilation Error: +---------------------- + +{self.BLUE}⚙️ Current Settings:{self.RESET} +{self.BOLD}- CUDA Toolkit Path: {self.cuda_toolkit or "Not Set"} +- Target Architecture: {self.arch}{self.RESET} + +IR Context (truncated): +{self.ir_context} + +{self.YELLOW}💡 Possible Solutions:{self.RESET} +{self.GREEN}1. Check if CUDA_TOOLKIT_PATH is set correctly +2. 
Verify target architecture ({self.arch}) is supported by your CUDA toolkit +3. Make sure CUDA toolkit version matches the target architecture requirements{self.RESET}""" + + +class Compiler: + """Compiler class for compiling and building MLIR modules.""" + + def __init__(self, passmanager, execution_engine): + self.passmanager = passmanager + self.execution_engine = execution_engine + + def __call__(self, module): + """Convenience application method.""" + self.compile(module) + + def _process_error(self, error_msg: str) -> Tuple[Optional[str], Optional[str]]: + """Process error message to extract NVVM error and IR context""" + nvvm_error = None + ir_msg = "" + + if "NVVM_ERROR" in error_msg: + # Extract the specific NVVM error + nvvm_error = ( + error_msg.split("libNVVM extra log:")[1].strip() + if "libNVVM extra log:" in error_msg + else error_msg + ) + + # Extract IR context + if "see current operation:" in error_msg: + # Get the IR section + ir_section = error_msg.split("see current operation:")[1].strip() + # Remove duplicate IR section + ir_section = ir_section.split("error: unknown: Failed translating")[ + 0 + ].strip() + + # Get first few lines and last few lines of the IR + ir_lines = ir_section.split("\n") + if len(ir_lines) > 10: + ir_msg = "\n".join(ir_lines[:5] + [" ..."] + ir_lines[-5:]) + else: + ir_msg = ir_section + + return nvvm_error, ir_msg + + def compile( + self, + module, + pipeline: str, + cuda_toolkit: str = "", + arch: str = "", + enable_verifier=False, + ): + """Compiles the module by invoking the pipeline.""" + try: + pm = self.passmanager.PassManager.parse(pipeline) + pm.enable_verifier(enable_verifier) + pm.run(module.operation) + except Exception as e: + error_msg = str(e) + nvvm_error, ir_msg = self._process_error(error_msg) + + if nvvm_error: + raise CompilationError( + error_msg, + nvvm_error=nvvm_error, + ir_context=ir_msg, + cuda_toolkit=cuda_toolkit, + arch=arch, + ) from e + raise e + + def jit(self, module, opt_level: int = 2, 
shared_libs: Sequence[str] = ()): + """Wraps the module in a JIT execution engine.""" + return self.execution_engine.ExecutionEngine( + module, opt_level=opt_level, shared_libs=shared_libs + ) + + def compile_and_jit( + self, + module, + pipeline: str, + shared_libs: Sequence[str] = (), + opt_level: int = 2, + cuda_toolkit: str = "", + arch: str = "", + ): + """Compiles and jits the module.""" + self.compile( + module, + pipeline, + cuda_toolkit, + arch, + ) + return self.jit(module, opt_level, shared_libs) + + +def compile(func, *args, **kwargs): + if func is None: + raise DSLRuntimeError("Function is not set or invalid.") + + if not callable(func): + raise DSLRuntimeError("Object is not callable.") + + kwargs["compile_only"] = True + kwargs["no_cache"] = True + + if inspect.isfunction(func): + # regular function + pass + elif inspect.ismethod(func): + # if it's a method, add the instance to the first argument + args = [func.__self__] + list(args) + func = func.__func__ + elif inspect.isclass(type(func)) and hasattr(func, "__call__"): + # If it's a class instance, get the class's __call__ method + args = [func] + list(args) + # Get the actual function from the class definition + func = func.__call__.__func__ + else: + raise DSLRuntimeError( + "Invalid function type, only function, method and module are supported, but got", + func, + ) + + # If it's a wrapped function created by jit decorator, get the original function + if hasattr(func, "__wrapped__"): + func = func.__wrapped__ + + if not hasattr(func, "_dsl_object"): + raise DSLRuntimeError("Function is not decorated with jit decorator.") + + fcn_ptr = func._dsl_object._preprocess_and_execute(func) + return func._dsl_object._func(fcn_ptr, *args, **kwargs) diff --git a/python/CuTeDSL/base_dsl/dsl.py b/python/CuTeDSL/base_dsl/dsl.py new file mode 100644 index 00000000..619ed4c8 --- /dev/null +++ b/python/CuTeDSL/base_dsl/dsl.py @@ -0,0 +1,1637 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & 
AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a main DSL class for any Dialect. +The DSL should be inherited as a new class, and its initialization requires dialects. +It handles most of the mechanics for the DSL in an agnostic way, +for example, it can handle various dialect-specific tasks. +""" + + +# Standard library imports +from dataclasses import dataclass, field +import atexit +import os +import io +import sys +import errno +import ctypes +import re +import inspect +import argparse +import hashlib +from functools import lru_cache, wraps +from collections import namedtuple +from abc import ABC, abstractmethod +from typing import Any, Union, Tuple, get_origin, get_args +from types import FunctionType +import warnings + +from . 
import typing as t +from .env_manager import EnvironmentVarManager + +# ============================================================================= +# CUDA Python +# ============================================================================= + +from ..base_dsl._mlir_helpers.arith import const + +# ============================================================================= +# Local module imports +# ============================================================================= + +from .cache_helpers import * +from .jit_executor import JitExecutor +from .utils.timer import timer +from .utils.logger import setup_log, log +from .utils.stacktrace import filter_exception, walk_to_top_module, filter_stackframe +from .runtime.jit_arg_adapters import is_argument_constexpr, JitArgAdapterRegistry +from .runtime.tensor_descriptor import TensorDescriptor +from .ast_preprocessor import DSLPreprocessor +from .common import * +from .typing import ( + get_c_pointers, + get_mlir_types, +) + +# ============================================================================= +# MLIR modules +# ============================================================================= + +from .._mlir import ir +from .._mlir import runtime as rt +from .._mlir.extras import types as T +from .._mlir.dialects import arith, math, func + +# ============================================================================= +# cutlass.dlpack_runtime +# ============================================================================= + +from .runtime.dlpack_runtime import dlpack_to_tensor_desc, mark_layout_dynamic + +# ============================================================================= +# Global Variables +# ============================================================================= + +MLIR_DYNAMIC = -9223372036854775808 + +# ============================================================================= +# Codegen Utils +# ============================================================================= + + 
+def _numpy_type_to_mlir_type(dtype): + if dtype == np.float64: + return T.f64() + if dtype == np.float16: + return T.f16() + if dtype == np.float32: + return T.f32() + if dtype == np.int64: + return T.i64() + if dtype == np.int32: + return T.i32() + if dtype == np.int16: + return T.i16() + if dtype == np.int8: + return T.i8() + if dtype == np.uint64: + return T.ui64() + if dtype == np.uint32: + return T.ui32() + if dtype == np.uint16: + return T.ui16() + if dtype == np.uint8: + return T.ui8() + if dtype == np.bool_: + return T.bool() + if dtype == f8E5M2: + return T.f8E5M2() + if dtype == f8E4M3FN: + return T.f8E4M3FN() + if dtype == f8E8M0FNU: + return T.f8E8M0FNU() + if dtype == f6E3M2FN: + return T.f6E3M2FN() + if dtype == f6E2M3FN: + return T.f6E2M3FN() + if dtype == f4E2M1FN: + return T.f4E2M1FN() + assert False, f"Unknown type {type}" + + +def _mlir_type_to_numpy_type(type): + if type == T.f64(): + return np.float64 + if type == T.f16(): + return np.float16 + if type == T.f32(): + return np.float32 + if type == T.i64(): + return np.int64 + if type == T.i32(): + return np.int32 + if type == T.i16(): + return np.int16 + if type == T.i8(): + return np.int8 + if type == T.ui64(): + return np.uint64 + if type == T.ui32(): + return np.uint32 + if type == T.ui16(): + return np.uint16 + if type == T.ui8(): + return np.uint8 + if type == T.bool(): + return np.bool_ + assert False, f"Unknown type {type}" + + +# ============================================================================= +# Main DSL Class +# ============================================================================= + + +def is_dynamic_expression(value): + """ + Check if the value is an MLIR's SSA value. 
+ """ + # Case 1: If the value has MLIR's SSA value, return True + # Case 2: If the value supports __extract_mlir_values__ then it's possible to get SSA value + return ( + isinstance(value, ir.Value) + or hasattr(value, "__extract_mlir_values__") + or len(extract_mlir_values(value)) > 0 + ) + + +def extract_mlir_values(obj): + """ + Given the `obj`, recursively go through it to extract all contained IR values as list of MLIR values + """ + res = [] + if hasattr(obj, "__extract_mlir_values__"): + res = obj.__extract_mlir_values__() + elif isinstance(obj, (tuple, list)): + res = sum((extract_mlir_values(x) for x in obj), []) + # Can't call is_dynamic_expression as _is_dynamic_expression depends on extract_mlir_values + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in extract_mlir_values to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + elif isinstance(obj, ir.Value): + res = [obj] + elif isinstance(obj, ir.BlockArgumentList): + res = list(obj) # type: ignore + + return res + + +def new_from_mlir_values(obj, values): + """ + Create a new python object by populating containing MLIR values with list of new values + """ + if hasattr(obj, "__new_from_mlir_values__"): + return obj.__new_from_mlir_values__(values) + elif isinstance(obj, (tuple, list)): + res = [] + for x in obj: + n_items = len(get_mlir_types(x)) + res.append(new_from_mlir_values(x, values[:n_items])) + values = values[n_items:] + obj_ty = type(obj) + return obj_ty(res) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in new_from_mlir_values to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + elif is_dynamic_expression(obj): + + if len(values) == 0: + 
return obj + + assert len(values) == 1 + return values[0] + else: + assert len(values) == 0, f"{obj} expects 0 values, but got {values}" + return obj + + +class BaseDSL: + gpu_module = None + + def __init__( + self, + name: str, + compiler_provider: Any, + pass_sm_arch_name: str, + device_compilation_only=False, + preprocess=False, + ): + """ + Constructor for initializing the class with required providers and environment settings. + + Parameters: + - name (str): Name of DSL, used for environment variables and logging. + - compiler_provider (MLIR dialect): Provider for compiler. + - pass_sm_arch_name (str): The keyword name of the SM. + - device_compilation_only (bool) : Only device code, and call it via cuda driver + - preprocess (bool): Enable AST transformation. + + This constructs a DSL instance and sets up environment management, + warning configurations, and logging functionalities. It reads + environment variables using `EnvironmentVarManager` and configures + a logger with settings from the environment. If environment warnings + are detected, they are escalated to errors to ensure strict handling. + """ + # Enforcing initialization of instance variables + if not all([name, compiler_provider, pass_sm_arch_name]): + raise DSLRuntimeError( + "All required parameters must be provided and non-empty" + ) + + self.name = name + self.compiler_provider = compiler_provider + self.pass_sm_arch_name = pass_sm_arch_name + self.frame = None + self.no_cache = False + self.device_compilation_only = device_compilation_only + self.num_kernels = 0 + # Read environment variables + self.envar = EnvironmentVarManager(self.name) + self.enable_preprocessor = preprocess + # This cache uses hash of original ir and env as key, allows dump/load to/from file. 
Enabled by default + self.jit_cache = ( + dict() + if self.envar.disable_file_caching + else load_cache_from_path(self.name, self.envar.file_caching_capacity) + ) + self.host_jit_decorator_name = f"@{BaseDSL.jit.__name__}" + self.device_jit_decorator_name = f"@{BaseDSL.kernel.__name__}" + + # set warning + if self.envar.warnings_as_errors: + warnings.filterwarnings("error") + if self.envar.warnings_ignore: + warnings.filterwarnings("ignore") + + # Initialize logger + if self.envar.log_to_console == False and self.envar.jitTimeProfiling: + self.envar.log_to_console = True + self.envar.log_level = 20 # info level + setup_log( + self.name, + self.envar.log_to_console, + self.envar.log_to_file, + f"{self.name}.log", + self.envar.log_level, + ) + + # kernel symbols are temporary symbol string variables, their values are valid until the compilation is done. + self.kernel_symbols = [] + # used to generate unique name for gpu.launch + self.launch_inner_count = 0 + + if preprocess: + self.preprocessor = DSLPreprocessor() + log().info(f"Initializing {name} DSL") + log().debug(f"Logger initialized for {self.name}") + + # Hook excepthook + if self.envar.filterStacktrace: + origin_excepthook = sys.excepthook + module_dir = walk_to_top_module(os.path.dirname(os.path.abspath(__file__))) + + def excepthook(excep_type, value, traceback): + filter_exception(value, module_dir) + if hasattr(value, "__traceback__"): + origin_excepthook(excep_type, value, value.__traceback__) + else: + origin_excepthook( + excep_type, value, filter_stackframe(traceback, module_dir) + ) + + sys.excepthook = excepthook + + # Restore original excepthook + def restore_excepthook(hook): + sys.excepthook = hook + + atexit.register(restore_excepthook, origin_excepthook) + + def dump_cache(self): + if not self.envar.disable_file_caching: + dump_cache_to_path( + self.name, self.jit_cache, self.envar.file_caching_capacity + ) + + @lru_cache(maxsize=1) + def print_warning_once(self, message): + 
log().warning(f"Warning: {message}") + warnings.warn(message, UserWarning) + + def print_warning(self, message): + log().warning(f"Warning: {message}") + warnings.warn(message, UserWarning) + + @classmethod + @lru_cache(maxsize=1) + def _get_dsl(cls): + # Instantiate the DSL Class once + main_dsl = cls() + if not main_dsl.no_cache: + # register atexit callback + atexit.register(main_dsl.dump_cache) + return main_dsl + + @staticmethod + def _can_preprocess(**dkwargs): + """ + Check if AST transformation is enabled or not for `jit` and `kernel` decorators. + """ + return dkwargs.pop("preprocess", True) + + @staticmethod + def _get_original_function(fcn_ptr, name): + """ + Get the original function from the decorated function + """ + while fcn_ptr.__name__ != name: + # If the function is wrapped with functools, get from __wrapped__ + if hasattr(fcn_ptr, "__wrapped__"): + fcn_ptr = fcn_ptr.__wrapped__ + # If the function is wrapped manually, it's the first in clousure + elif callable(fcn_ptr.__closure__[0].cell_contents): + fcn_ptr = fcn_ptr.__closure__[0].cell_contents + else: + raise DSLRuntimeError( + f"Cannot find the original function {name} in the closure chain" + ) + return fcn_ptr + + @staticmethod + def _preprocess_and_execute(func): + """ + Run ast transformation and return the materialized function pointer + """ + if hasattr(func, "_transformed_ast"): + # If the function ptr is already materialized, use the existing one + func._dsl_object.frame = func._decorator_frame + + if func._transformed_ast is None: + func._transformed_ast = func._dsl_object.run_preprocessor(func) + if func._transformed_ast is None: + del func._decorator_frame + del func._transformed_ast + return func + + fcn_ptr = func._dsl_object.get_function_ptr(func, func._transformed_ast) + # If the function is decorated, de-decorate it + fcn_ptr = BaseDSL._get_original_function(fcn_ptr, func.__name__) + return fcn_ptr + return func + + def jit_runner(self, frame, executor, *dargs, **dkwargs): + 
""" + Decorator to mark a function for JIT compilation. + """ + # Set the frame, that can be used AST preprocessor + self.frame = frame + log().info("jit_runner") + + def jit_runner_decorator(func): + func._dsl_object = self + # Run preprocessor that alters AST + if self.enable_preprocessor and BaseDSL._can_preprocess(**dkwargs): + # For an annotated function, add some DSL attributes + # When materializing the AST, we need decorator's frame + func._decorator_frame = frame + # No transformed ast at this point + func._transformed_ast = None + + @wraps(func) + def jit_wrapper(*args, **kwargs): + func_ptr = BaseDSL._preprocess_and_execute(func) + return executor(func_ptr, *args, **kwargs) + + return jit_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return jit_runner_decorator(dargs[0]) + else: + return jit_runner_decorator + + @classmethod + def jit(cls, *dargs, **dkwargs): + """ + Decorator to mark a function for JIT compilation for Host code. + """ + frame = inspect.currentframe().f_back + # Instantiate the DSL Class + main_dsl = cls._get_dsl() + return main_dsl.jit_runner(frame, main_dsl._func, *dargs, **dkwargs) + + @classmethod + def kernel(cls, *dargs, **dkwargs): + """ + Decorator to mark a function for JIT compilation for GPU. + """ + frame = inspect.currentframe().f_back + # Instantiate the DSL Class + main_dsl = cls._get_dsl() + return main_dsl.jit_runner(frame, main_dsl._kernel_helper, *dargs, **dkwargs) + + @abstractmethod + def _kernel_helper(self, func, *args, **kwargs): + """ + Helper function to handle kernel generation logic + """ + pass + + @abstractmethod + def _build_gpu_module(self, attrs): + """ + Build the module op that contains the kernels. + """ + pass + + @abstractmethod + def _get_pipeline(self, pipeline): + """ + Get the pipeline from the other configuration options. 
+ """ + if pipeline != None: + return pipeline + return None + + @staticmethod + def log_additions(func_type, operands=None, types=None, arg_attrs=None): + if operands is not None and operands != []: + log().debug( + f"Added {func_type} operands: [%s]", ", ".join(map(str, operands)) + ) + if types is not None: + log().debug( + f"Added {func_type} arg_types: [%s]", ", ".join(map(str, types)) + ) + if arg_attrs is not None: + log().debug( + f"Added {func_type} arg_attrs: [%s]", ", ".join(map(str, arg_attrs)) + ) + + def mangle_name(self, function_name, args, args_spec: inspect.FullArgSpec): + """Does simple name mangling""" + + for spec_arg, arg in zip(args_spec.args, args): + spec_ty = args_spec.annotations.get(spec_arg, None) + if spec_ty != None: + if issubclass(type(spec_ty), (t.IRValue, t.IRVariadic)): + continue + if isinstance(spec_ty, (ir.Type, ir.Value)): + continue + if isinstance(arg, (ir.Type, ir.Value, ir.OpResult)): + continue + if isinstance(type(arg), (ir.Type, ir.Value, ir.OpResult)): + continue + if self._is_tensor_descriptor(arg): + continue + if inspect.isclass(spec_ty): + class_name = str(arg).replace("class", "") + class_name = class_name.replace(" ", "") + function_name = f"{function_name}_{class_name}" + elif isinstance(arg, (list, tuple)): + function_name = f"{function_name}_{'_'.join(map(str, arg))}" + else: + function_name = f"{function_name}_{arg}" + # we would need a dedicated MR to follow up + unwanted_chars = r"'-![]#,.<>()\":{}=%?@;" + translation_table = str.maketrans("", "", unwanted_chars) + function_name = function_name.translate(translation_table) + # identify address and drop + function_name = re.sub(r"0x[a-f0-9]{8,16}", "", function_name) + function_name = re.sub(r"\s+", " ", function_name) + function_name = function_name.replace(" ", "_") + function_name = function_name.replace("\n", "_") + # max fname is 256 character, leave space + function_name = function_name[:180] + log().info(f"Final mangled function name: 
{function_name}") + return function_name + + def _generate_execution_arguments_for_known_types( + self, arg, arg_spec, arg_name, i, fop_args, iv_block_args + ): + """ + Generate MLIR arguments for known types. + + Sub-DSLs can override this method to handle types that are not + natively supported by the Base DSL. + """ + ir_arg = [] + if is_argument_constexpr(arg, arg_spec, arg_name, i, func): + ir_arg.append(arg) + + return ir_arg, iv_block_args + + def generate_execution_arguments( + self, + args, + kwargs, + fop, + args_spec: inspect.FullArgSpec, + ): + """Create list of arguments that will be passed to MLIR's func.func op""" + + def gen_exec_args(input_args, arg_names, annotations, fop_args): + assert len(input_args) == len(arg_names) + + ir_args = [] + iv_block_args = 0 + for i, arg in enumerate(input_args): + arg_name = arg_names[i] + arg_spec = annotations.get(arg_name, None) + log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, arg_spec) + + # Implicit cast to NumericMeta + if isinstance(arg_spec, t.NumericMeta): + arg = t.cast(arg, arg_spec) + + ir_arg, iv_block_args = ( + self._generate_execution_arguments_for_known_types( + arg, arg_spec, arg_name, i, fop_args, iv_block_args + ) + ) + + if not ir_arg: + # If it's not a known type, try JIT argument adapter + # to convert the argument if possible + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + arg = adapter(arg) if adapter else arg + + n_args = len(get_mlir_types(arg)) + blk_args = fop_args[iv_block_args : iv_block_args + n_args] + ir_arg.append(new_from_mlir_values(arg, blk_args)) + iv_block_args += n_args + + self.log_additions(ir_arg) + ir_args.extend(ir_arg) + + return ir_args + + fop_args = list(fop.regions[0].blocks[0].arguments) + ir_args = gen_exec_args(args, args_spec.args, args_spec.annotations, fop_args) + ir_kwargs = gen_exec_args( + [kwargs[arg] for arg in args_spec.kwonlyargs], + args_spec.kwonlyargs, + args_spec.annotations, + fop_args[len(ir_args) :], + ) + 
ir_kwargs = {k: v for k, v in zip(args_spec.kwonlyargs, ir_kwargs)} + + log().debug("execution args: %s", ", ".join(map(str, ir_args))) + log().debug("execution kwargs: %s", ", ".join(map(str, ir_kwargs))) + return ir_args, ir_kwargs + + @abstractmethod + def _generate_mlir_type_for_tensor_descriptor(self, tensor: TensorDescriptor): + """ + Generate MLIR type for the tensor descriptor. + """ + pass + + @abstractmethod + def _generate_executable_arg_for_tensor_descriptor( + self, mlir_value=None, ptr_tensor_ty=None, tensor=None + ): + """ + Generates executable value for the given tensor descriptor. + """ + pass + + @abstractmethod + def _get_globals(self): + """ + Combines global and local variables from the current context and the + caller's frame comes. This includes the current module's globals, the + global variables from the caller's frame, and the local variables from + the caller's frame. + + "self.frame" is used to fetch the caller's frame. + + AST preprocessor generates a new python code, so the resulting globals + dictionary is used to execute the python code. + """ + pass + + def _is_tensor_descriptor(self, maybe_tensor_descriptor) -> bool: + return isinstance( + maybe_tensor_descriptor, TensorDescriptor + ) or TensorDescriptor.can_transformed_to_dlpack(maybe_tensor_descriptor) + + def _handle_tensor_descriptor( + self, maybe_tensor, arg_name: str, need_gpu_memory: bool + ) -> TensorDescriptor: + if self._is_tensor_descriptor(maybe_tensor): + tensor = ( + maybe_tensor + if isinstance(maybe_tensor, TensorDescriptor) + else TensorDescriptor(maybe_tensor) + ) + if need_gpu_memory and not tensor.is_in_device: + log().info( + "FAIL name=[%s] tensor=[%s] in_gpu=[%s]", + arg_name, + tensor, + tensor.is_in_device, + ) + raise DSLRuntimeError( + f'Tensor "{arg_name}" is tensor "{tensor}" ' + "is not in the GPU memory. " + ) + + return tensor + + raise DSLRuntimeError( + f"Argument {arg_name} could not be transformed into a TensorDescriptor." 
+ ) + + def _validate_arg(self, arg, arg_index, arg_name, arg_spec): + """ + Validates if the arg is really of the annotated type for type safety. + + The default implementation is empty. Subclasses can override this method to add more validation logic. + Returns None if validation passes, otherwise returns an error derived from DSLBaseError. + """ + pass + + def _generate_jit_func_args_for_known_types( + self, + func, + arg, + arg_name, + arg_spec, + arg_index, + *, + is_host=True, + ): + """ + Generate JIT function arguments for known types. + + Sub-DSLs can override this method to handle types that are not + natively supported by the Base DSL. + """ + + jit_arg_type, jit_arg_attr, jit_exec_arg = [], [], [] + default_attr = ir.DictAttr.get({}) + + if is_argument_constexpr(arg, arg_spec, arg_name, arg_index, func): + jit_exec_arg = jit_arg_type = jit_arg_attr = None + + return jit_exec_arg, jit_arg_type, jit_arg_attr + + def _generate_jit_func_args( + self, + func, + function_name, + args, + kwargs, + args_spec: inspect.FullArgSpec, + *, + is_host=True, + ): + """Generate JIT function arguments.""" + + assert len(args) == len(args_spec.args) and len(kwargs) == len( + args_spec.kwonlyargs + ), f"Input args {len(args)=} and kwargs {len(kwargs)=} must match arg_spec.args " + f"{len(args_spec.args)=} and arg_spec.kwonlyargs {len(args_spec.kwonlyargs)=}" + + jit_arg_types, jit_arg_attrs, jit_exec_args = [], [], [] + default_attr = ir.DictAttr.get({}) + + input_args = [*args, *kwargs.values()] + input_arg_names = [*args_spec.args, *args_spec.kwonlyargs] + for i, (arg_name, arg) in enumerate(zip(input_arg_names, input_args)): + spec_ty = args_spec.annotations.get(arg_name, None) + log().debug("Processing [%d] Argument [%s : %s]", i, arg_name, spec_ty) + + # Implicitly convert into Numeric type if possible + if isinstance(spec_ty, t.NumericMeta): + arg = t.cast(arg, spec_ty) + + # Type safety check + if spec_ty is not None: + err = self._validate_arg(arg, i, arg_name, 
spec_ty) + if err is not None: + raise err + + jit_exec_arg, jit_arg_type, jit_arg_attr = ( + self._generate_jit_func_args_for_known_types( + func, + arg, + arg_name, + spec_ty, + i, + is_host=is_host, + ) + ) + + if jit_arg_type is not None and len(jit_arg_type) == 0: + # If not any known type, try JIT argument adapter + # to convert the argument + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + arg = adapter(arg) if adapter else arg + + if is_host: + jit_exec_arg.extend(get_c_pointers(arg)) + jit_arg_type.extend(get_mlir_types(arg)) + else: + dyn_vals = extract_mlir_values(arg) + jit_exec_arg.extend(dyn_vals) + jit_arg_type.extend([v.type for v in dyn_vals]) + + if not jit_arg_type or not jit_exec_arg: + if (is_host and hasattr(arg, "__c_pointers__")) or ( + not is_host + and hasattr(arg, "__extract_mlir_values__") + and hasattr(arg, "__new_from_mlir_values__") + ): + pass + else: + raise DSLRuntimeError( + f"failed to generate argument #{i+1} ({arg_name}) for JIT function '{function_name}'.", + context={ + f"Argument {arg_name}": "The DSL attempted to convert it into Dynamic Expression (aka MLIR values) but failed.", + f"Call-site argument value": arg, + f"Call-site argument type": type(arg), + }, + suggestion=f"Consider annotating the argument with `{arg_name} : Constexpr` " + "if it's a value known at compile-time. 
" + f"Otherwise, implement the {'`JitArgument`' if is_host else '`DynamicExpression`'} " + f"protocol or register a custom JIT argument adapter for type `{type(arg)}` to " + "enable dynamic value conversion at runtime.", + ) + + jit_arg_attr.extend([default_attr] * len(jit_arg_type)) + + if jit_arg_type is not None: + jit_exec_args.extend(jit_exec_arg) + jit_arg_types.extend(jit_arg_type) + jit_arg_attrs.extend(jit_arg_attr) + + return jit_exec_args, jit_arg_types, jit_arg_attrs + + def generate_mlir_function_types( + self, func, function_name, input_args, kwargs, args_spec: inspect.FullArgSpec + ): + """Convert input arguments to MLIR function signature also convert numpy arrays to memref.""" + + exe_args, types, _ = self._generate_jit_func_args( + func, function_name, input_args, kwargs, args_spec, is_host=True + ) + + log().debug("Execution Arguments: %s", ", ".join(map(str, exe_args))) + log().debug("Types: %s", ", ".join(map(str, types))) + + assert len(exe_args) == len( + types + ), "expects the same number of arguments and function parameters" + + return exe_args, types + + @dataclass + class LaunchConfig: + cluster: list = None + grid: list = field(default_factory=lambda: [1, 1, 1]) + block: list = field(default_factory=lambda: [1, 1, 1]) + smem: int = 0 + async_deps: list = field(default_factory=list) + has_cluster: bool = False + min_blocks_per_mp: int = 0 + + def __post_init__(self): + if len(self.grid) != 3: + raise DSLRuntimeError(f"Expect 3d grid!") + if len(self.block) != 3: + raise DSLRuntimeError(f"Expect 3d block!") + + self.has_cluster = self.cluster is not None + if self.cluster is None: + self.cluster = [None, None, None] + elif len(self.cluster) != 3: + raise DSLRuntimeError(f"Expect 3d cluster!") + + def diagnostic(self): + """Check command line parameters and enables diagnostic""" + # Check command line arguments "-diagnostic" + parser = argparse.ArgumentParser(description="Process diagnostic status.") + parser.add_argument( + "-diagnostic", 
+ nargs="?", + const="all", + choices=["all", "fail", "success", "info", "suggestion"], + help="Set diagnostic status (fail, success, info, suggestion).", + ) + + args, _ = parser.parse_known_args() + ctx = ir.Context.current + + def callback(d): + print(f" [{self.name} Diagnostic] : {d.message}") + + ctx.attach_diagnostic_handler(callback) + + # Early return, don't enable diagnostics + if args.diagnostic is None: + return + + # Enable MLIR Flags + ctx.emit_error_diagnostics = True + ir._GlobalDebug.flag = True + if args.diagnostic == "all": + ir._GlobalDebug.set_types("diagnostic") + else: + ir._GlobalDebug.set_types(f"diagnostic-{args.diagnostic}") + + def get_location(self): + """ + Get python location information and generate MLIR location + """ + + frame = self.frame + if frame is None: + print("Frame is None") + return None + + file_loc = ir.Location.file(frame.f_code.co_filename, frame.f_lineno, 0) + + def print_all_frames(): + for i, frame in enumerate(inspect.stack()): + print( + f"Frame {i}: {frame.function} in {frame.filename}, line {frame.lineno}" + ) + + loc = ir.Location.name(frame.f_code.co_name, childLoc=file_loc) + return loc + + def compile_and_jit(self, module, pipeline, shared_libs, function_name=""): + """ + Compile and JIT an MLIR module. + """ + + try: + self.diagnostic() + + orig_stdout = sys.stdout + orig_stderr = sys.stderr + sys.stderr = redirect_stderr = io.StringIO() + sys.stdout = redirect_stdout = io.StringIO() + + try: + kernel = self.compiler_provider.compile_and_jit( + module, + pipeline, + shared_libs=shared_libs, + cuda_toolkit=self.envar.cuda_toolkit, + arch=self.envar.arch, + ) + + finally: + sys.stdout = orig_stdout + sys.stderr = orig_stderr + ir._GlobalDebug.flag = False + + # Print captured output. 
+ print(redirect_stdout.getvalue(), file=sys.stdout, end="") + print(redirect_stderr.getvalue(), file=sys.stderr, end="") + + return kernel + + except Exception as e: + raise DSLRuntimeError("🧊🧊🧊 ICE 🧊🧊🧊", cause=e) + finally: + pass + + def preprocess_pipeline(self, pipeline, arch) -> str: + + if self.envar.cuda_toolkit is None: + self.print_warning( + "CUDA_TOOLKIT_PATH environment variable is not set. Cannot set toolkitPath." + ) + + options = { + "toolkitPath": self.envar.cuda_toolkit if self.envar.cuda_toolkit else None, + self.pass_sm_arch_name: arch, + } + + opt_str = "" + for k, v in options.items(): + if v: + opt_str += f"{k}={v} " + + if opt_str: + # Automatically append the pipeline options if any is specified through env var + pattern = re.compile(r"{(.+)}") + match = pattern.search(pipeline) + if match: + opt_str = f"{{{match[1]} {opt_str}}}" + pipeline = re.sub(r"{.+}", opt_str, pipeline) + else: + pipeline = pipeline.rstrip(")") + f"{{{opt_str}}})" + log().debug(f"Using pipeline = {pipeline}") + return pipeline + + def get_shared_libs(self) -> list: + shared_libs = [] + support_libs = self.envar.shared_libs + if support_libs is not None: + _libs = support_libs.split(":") + for lib in _libs: + if not os.path.exists(lib): + raise FileNotFoundError( + errno.ENOENT, os.strerror(errno.ENOENT), lib + ) + shared_libs.append(lib) + else: + self.print_warning(f"{self.name}_LIBS environment variable is not set") + + return shared_libs + + @lru_cache(maxsize=1) + def get_version(self): + version_hash = hashlib.sha256() + + return version_hash + + def get_module_hash(self, module, function_name): + s = io.BytesIO() + module.operation.write_bytecode(s) + for attr, value in self.envar.__dict__.items(): + if value is not None: + s.write(str(value).encode()) + module_hash = self.get_version().copy() + module_hash.update(s.getvalue()) + module_hash = module_hash.hexdigest() + + log().debug("Bytecode=[%s]", s.getvalue().hex()) + log().debug("Version=[%s]", 
self.get_version().hexdigest()) + log().info( + "Function=[%s] Computed module_hash=[%s]", function_name, module_hash + ) + return module_hash + + def build_module(self, module, function_name: str): + """ + Build the MLIR module, verify and return the module + """ + + # Save IR in a file + if self.envar.keepIR: + save_ir(self.name, module, function_name) + + if self.envar.printIR: + print("\n//===--- ------ Generated IR ------ ---====\n") + module.operation.print( + enable_debug_info=self.envar.generate_source_location + ) + print("\n//===--- --- End of Generated IR -- ---====\n") + + # Verify the module + try: + module.operation.verify() + except Exception as e: + raise DSLRuntimeError(f"🧊🧊🧊 ICE IR Verification Failed 🧊🧊🧊", cause=e) + + return module + + def generate_original_ir( + self, + ir, + func, + funcBody, + kwargs, + function_name, + func_types, + gpu_module_attrs, + args, + args_spec, + ): + # This location is set to None for now; otherwise, calls to the same + # function on different lines would produce different line numbers, + # which would break the cache. + loc = None # self.get_location() + + def build_ir_module(): + module = ir.Module.create(loc=loc) + unit_attr = ir.UnitAttr.get() + module.operation.attributes["gpu.container_module"] = unit_attr + + with ir.InsertionPoint(module.body): + # Always generate gpu module. It's canonicalized by the compiler when it's not used. 
+ self._build_gpu_module(gpu_module_attrs) + + fop = func.FuncOp(function_name, (func_types, []), loc=loc) + fop.attributes["llvm.emit_c_interface"] = ir.UnitAttr.get() + log().debug("Generated Function OP [%s]", fop) + with ir.InsertionPoint(fop.add_entry_block()): + ir_args, ir_kwargs = self.generate_execution_arguments( + args, kwargs, fop, args_spec + ) + # Call user function body + try: + result = funcBody(*ir_args, **ir_kwargs) + func.ReturnOp([]) + except DSLAstPreprocessorError as pp_error: + raise pp_error + except NameError as name_error: + raise DSLRuntimeError( + f"💥💥💥 Error during runtime code generation for function `{funcBody.__name__}` 💥💥💥", + cause=name_error, + suggestion="Using variables defined in dynamic control flow is not supported. Please give an initial value before control flow.", + ) + except DSLRuntimeError as dsl_error: + # Throw it's already a DSL error + raise dsl_error + except Exception as general_e: + # Transform internal error to a DSL error + raise DSLRuntimeError( + f"💥💥💥 Error during runtime code generation for function `{funcBody.__name__}` 💥💥💥" + ) from general_e + return module, result + + # Build IR module + profiler = timer(enable=self.envar.jitTimeProfiling) + module, result = profiler(build_ir_module)() + module_hash = self.get_module_hash(module, function_name) + + module = self.build_module(module, function_name) + + return module, module_hash, result + + def compile_and_cache( + self, module, module_hash, function_name, pipeline, args_spec, no_cache + ): + arch = self.envar.arch + pipeline = self.preprocess_pipeline(self._get_pipeline(pipeline), arch) + shared_libs = self.get_shared_libs() + profiler = timer(enable=self.envar.jitTimeProfiling) + if ( + no_cache + or module_hash not in self.jit_cache + or self.jit_cache[module_hash].ir_module is None + ): + log().info( + "JIT cache miss function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + # Compile and JIT MLIR module + engine = 
profiler(self.compile_and_jit)( + module, pipeline, shared_libs, function_name=function_name + ) + else: + log().info( + "JIT cache hit IN-FILE function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + module = self.jit_cache[module_hash].ir_module + engine = self.compiler_provider.jit(module, shared_libs=shared_libs) + capi_func = profiler(engine.lookup)(function_name) + jit_executor = JitExecutor( + self, + engine, + capi_func, + module, + args_spec, + function_name, + jit_time_profiling=self.envar.jitTimeProfiling, + ) + jit_executor = jit_executor.update_jit_cuda_modules(self.kernel_symbols) + + if not no_cache: + # module stored in cache is compiled. + self.jit_cache[module_hash] = jit_executor + + return jit_executor + + def post_compilation_cleanup(self): + """Clean up some internal state after one compilation is completed.""" + # clear the kernel symbols after the compilation is done. + self.kernel_symbols = [] + self.launch_inner_count = 0 + # reset num_kernels to 0 for next compilation. + self.num_kernels = 0 + + def generate_mlir( + self, + funcBody, + kwargs, + function_name, + gpu_module_attrs, + args, + args_spec, + pipeline, + no_cache, + compile_only, + loc=None, + ): + """Generate MLIR module and compile iself.T_provider.""" + with ir.Context(), ir.Location.unknown(): + # Convert input arguments to MLIR arguments + exe_args, func_types = self.generate_mlir_function_types( + funcBody, function_name, args, kwargs, args_spec + ) + + # Generate original ir module and its hash value. 
+ module, module_hash, result = self.generate_original_ir( + ir, + func, + funcBody, + kwargs, + function_name, + func_types, + gpu_module_attrs, + args, + args_spec, + ) + + # dryrun is used to only generate IR + if self.envar.dryrun: + return result + + if ( + no_cache + or module_hash not in self.jit_cache + or self.jit_cache[module_hash].capi_func is None + ): + # no cache or cache miss, do ir generation/compilation/jit engine + jit_executor = self.compile_and_cache( + module, module_hash, function_name, pipeline, args_spec, no_cache + ) + else: + # cache hit + log().info( + "JIT cache hit IN-MEMORY function=[%s] module_hash=[%s]", + function_name, + module_hash, + ) + jit_executor = self.jit_cache[module_hash] + + self.post_compilation_cleanup() + # If compile_only is set, bypass execution return the jit_executor directly + if compile_only: + return jit_executor + # Run the compiled program + jit_executor.run_compiled_program(exe_args) + + return result + + def run_preprocessor(self, funcBody): + if not hasattr(funcBody, "_preprocessed"): + function_name = funcBody.__name__ + self.funcBody = funcBody + log().info("Started preprocessing [%s]", function_name) + exec_globals = self._get_globals() + transformed_ast = self.preprocessor.transform(funcBody, exec_globals) + if self.envar.print_after_preprocessor: + log().info( + f"# Printing unparsed AST after preprocess of func=`{function_name}` id=`{id(funcBody)}`" + ) + DSLPreprocessor.print_ast(transformed_ast) + funcBody._preprocessed = True + return transformed_ast + return None + + def get_function_ptr(self, original_function, transformed_ast): + file_name = inspect.getsourcefile(original_function) + code_object = compile(transformed_ast, filename=file_name, mode="exec") + return self.preprocessor.exec( + original_function.__name__, + original_function, + code_object, + self._get_globals(), + ) + + @lru_cache(maxsize=None) + def _get_function_signature(self, func): + return inspect.signature(func) + + def 
_get_function_bound_args(self, sig, func_name, *args, **kwargs): + """ + Binds provided arguments to a function's signature and applies default values. + + E.g. given a function signature `def foo(a, b=2, c=3)`, and at call-site if we do + `foo(a=1, c=4)`, the returned BoundArguments object will have args = `[1]` + and kwargs = `{'b': 2, 'c': 4}` + + An exception will be raised if binding fails. + """ + try: + bound_args = sig.bind_partial(*args, **kwargs) + bound_args.apply_defaults() + except Exception as e: + raise DSLRuntimeError( + f"Failed to bind arguments to function `{func_name}` with signature `{sig}`", + cause=e, + ) + return bound_args + + def _canonicalize_args(self, *args, **kwargs): + """ + Canonicalize the input arguments so that returned args only contain + positional arguments and kwargs only contain keyword arguments. + """ + sig = self._get_function_signature(self.funcBody) + function_name = self.funcBody.__name__ + bound_args = self._get_function_bound_args(sig, function_name, *args, **kwargs) + canonicalized_args = bound_args.args + canonicalized_kwargs = bound_args.kwargs + return canonicalized_args, canonicalized_kwargs + + def _check_arg_count(self, *args, **kwargs): + if not self.funcBody: + raise DSLRuntimeError("Function body is not set.") + + # Pass the actual function object to _get_function_signature. + sig = self._get_function_signature(self.funcBody) + function_name = self.funcBody.__name__ + + bound_args = self._get_function_bound_args(sig, function_name, *args, **kwargs) + + # Check if all non-default arguments are provided + for param in sig.parameters.values(): + if ( + param.default is inspect.Parameter.empty + and param.name not in bound_args.arguments + ): + raise DSLRuntimeError( + f"Missing required argument in `{function_name}`: '{param.name}'" + ) + + def _func(self, funcBody, *args, **kwargs): + """Decorator for MLIR functions. + It cuts the boilerplate code, does the following: + 1. Generates `func.func` + 2. 
Types translation (numpy arrays -> cute.memref, Python float -> MLIR float, etc.)
+ assert kernel_name is not None, "kernel name is empty" + pass + + @abstractmethod + def generate_func_ret_op(self): + pass + + @abstractmethod + def generate_launch_op(self, *args, **kwargs): + pass + + @abstractmethod + def get_func_body_start(self): + pass + + @abstractmethod + def enter_gpu_module(module): + """Compute the insertion point into the given module.""" + pass + + @lru_cache(maxsize=1) + def _get_default_stream(self): + """Returns the default stream 0""" + from .runtime import cuda as cuda_helpers + + return cuda_helpers.stream_create() + + def _execute_cuda( + self, fname_cubin, kernel_name, grid_size, block_size, stream=None + ): + """ + Executes a specified CUDA kernel from a cubin file, handling module loading, + kernel retrieval, stream creation, kernel launch, and synchronization. + """ + from .runtime import cuda as cuda_helpers + + # Step 1. Load CUDA Module + module = cuda_helpers.load_cubin_module(fname_cubin) + # Step 2. Find CUDA function + kernel_ptr = cuda_helpers.get_kernel_function(module, kernel_name) + + sync_execution_default = False + if stream is None: + stream = self._get_default_stream() + sync_execution_default = True + + # Step 4. Launch the kernel + cuda_helpers.launch_kernel( + kernel_ptr, + grid_size, + block_size, + stream, + smem_size=16000, + kernel_args=self.exe_args, + ) + + if sync_execution_default: + # Step 5. Optional Sync cuda stream + cuda_helpers.stream_sync(stream) + + def _execute_by_cuda_driver( + self, kernel_generator, generate_cubin, grid_size, block_size, stream=None + ): + """ + This function builds IR and execute the module using cuda driver. + It doesn't use mlir's cuda runtime + """ + ret = None + + # Step 1. 
Build IR + with ir.Context(), ir.Location.unknown(): + loc = self.get_location() + module = ir.Module.create(loc=loc) + unit_attr = ir.UnitAttr.get() + module.operation.attributes["gpu.container_module"] = unit_attr + with ir.InsertionPoint(module.body): + self._build_gpu_module() + ret, kernel_name = kernel_generator() + log().debug( + f"Kernel generator returned: ret={ret}, kernel_name={kernel_name}" + ) + + module = self.build_module(module, kernel_name) + + # dryrun is used to only generate IR + if self.envar.dryrun: + return ret + + # Generate cubin + fname_cubin = generate_cubin(module, kernel_name) + + # Execute a cuda kernel from cubin + if block_size is None: + # The TileIR driver should set this automatically. + block_size = self.block_size + self._execute_cuda(fname_cubin, kernel_name, grid_size, block_size, stream) + + return ret + + def generate_kernel_operands_and_types( + self, kernel_func, kernel_name, args_spec, args, kwargs + ): + """ + Generate the operands and types for the kernel function + """ + + kernel_operands, kernel_arg_types, kernel_arg_attrs = [], [], [] + + log().debug( + "Processing GPU kernel call in [%s] mode", + ( + f"Only {self.device_jit_decorator_name}" + if self.device_compilation_only + else f"{self.host_jit_decorator_name} + {self.device_jit_decorator_name}" + ), + ) + + if self.device_compilation_only: + return kernel_operands, kernel_arg_types, kernel_arg_attrs + + kernel_operands, kernel_arg_types, kernel_arg_attrs = ( + self._generate_jit_func_args( + kernel_func, kernel_name, args, kwargs, args_spec, is_host=False + ) + ) + + log().debug("Final kernel_operands: %s", ", ".join(map(str, kernel_operands))) + log().debug("Final kernel_arg_types: %s", ", ".join(map(str, kernel_arg_types))) + log().debug("Final kernel_arg_attrs: %s", ", ".join(map(str, kernel_arg_attrs))) + + assert ( + len(kernel_operands) == len(kernel_arg_types) == len(kernel_arg_attrs) + ), "Size of kernel_operands, kernel_arg_types and kernel_arg_attrs 
must be equal" + + return kernel_operands, kernel_arg_types, kernel_arg_attrs + + def kernel_launcher(self, *dargs, **dkwargs): + def decorator(funcBody): + @wraps(funcBody) + def kernel_wrapper(*args, **kwargs): + """ + Base decorator for generating kernel function + + This decorator provides a template for kernel function generation + including kernel function header/body and kernel launch op at call site + + Optional arguments (with default value in <>): + - requiredArgs <[]>: specifies the mandatory arguments that must present in kernel function signature + the args will be validated and collected as a namedtuple + - optionalArgs <[]>: specifies the optional arguments that might present in kernel function signature + the args will be collected (if present) as a namedtuple + - unitAttrNames <[]>: specifies the name(s) of ir.UnitAttr to be set for kernel function op + - valueAttrDict <{}>: specifies the name(s) and value(s) of ir.Attribute to be set for kernel function op + - kernelGenHelper : specifies the mandatory customized kernel generation helper class (derived from _KernelGenHelper) + + Return value: + A namedtuple "KernelReturns" is returned with following fields: + - kernel_func_ret: the return of the kernel function + - launch_op_ret: the return of the launch op + """ + + requiredArgs = dkwargs.get("requiredArgs", []) + optionalArgs = dkwargs.get("optionalArgs", []) + unitAttrNames = dkwargs.get("unitAttrNames", []) + valueAttrDict = dkwargs.get("valueAttrDict", {}) + kernelGenHelper = dkwargs.get("kernelGenHelper", None) + + kernel_name = funcBody.__name__ + args_spec = inspect.getfullargspec(funcBody) + self.funcBody = funcBody + + # Give each kernel a unique name. (The same kernel may be + # called multiple times, resulting in multiple kernel traces.) + # The mangled name of Python function is part of the name to + # improve readability. 
+ kernel_name = f"kernel_{self.mangle_name(kernel_name, args, args_spec)}_{self.num_kernels}" + self.num_kernels += 1 + + # Step 0. Preprocess the arguments + def extract_args(argNames, assertIfNone=False) -> list: + extracted = [] + for name in argNames: + value = kwargs.pop(name, None) + if assertIfNone and value is None: + raise DSLRuntimeError( + f"{name} is required for {kernel_name}" + ) + extracted.append(value) + + return extracted + + RequiredArgs = namedtuple("RequiredArgs", requiredArgs) + req_args = ( + RequiredArgs._make(extract_args(requiredArgs, assertIfNone=True)) + if requiredArgs + else None + ) + OptionalArgs = namedtuple("OptionalArgs", optionalArgs) + opt_args = ( + OptionalArgs._make(extract_args(optionalArgs)) + if optionalArgs + else None + ) + assert ( + kernelGenHelper is not None + ), "kernelGenHelper should be explicitly specified!" + + # check arguments + self._check_arg_count(*args, **kwargs) + + # Canonicalize the input arguments + canonicalized_args, canonicalized_kwargs = self._canonicalize_args( + *args, **kwargs + ) + + kernel_operands, kernel_types, kernel_arg_attrs = ( + self.generate_kernel_operands_and_types( + funcBody, + kernel_name, + args_spec, + canonicalized_args, + canonicalized_kwargs, + ) + ) + + with self._enter_gpu_module(): + log().debug("Generating device kernel") + if self.device_compilation_only: + log().debug("Generating cuda-python arguments") + # Convert input arguments to MLIR arguments + self.exe_args, kernel_types = self.generate_mlir_function_types( + funcBody, + kernel_name, + canonicalized_args, + canonicalized_kwargs, + args_spec, + ) + + helper = kernelGenHelper() + loc = self.get_location() + fop = helper.generate_func_op( + kernel_types, kernel_arg_attrs, kernel_name, loc + ) + log().debug(f"Kernel function op: {fop}") + for attr in unitAttrNames: + fop.attributes[attr] = ir.UnitAttr.get() + for key, val in valueAttrDict.items(): + fop.attributes[key] = val + + fop.sym_visibility = 
ir.StringAttr.get("public") + with ir.InsertionPoint(helper.get_func_body_start()): + ir_args, ir_kwargs = self.generate_execution_arguments( + canonicalized_args, canonicalized_kwargs, fop, args_spec + ) + log().debug( + f"IR arguments - args: {ir_args} ; kwargs: {ir_kwargs}" + ) + # Call user function body + kernel_ret = funcBody(*ir_args, **ir_kwargs) + helper.generate_func_ret_op() + + # Step 3. Generate call site `launch_func` + kernel_sym = ir.SymbolRefAttr.get(["kernels", kernel_name]) + launch_ret = helper.generate_launch_op( + kernelSym=kernel_sym, + kernelOperands=kernel_operands, + requiredArgs=req_args, + optionalArgs=opt_args, + ) + + KernelReturns = namedtuple( + "KernelReturns", ["kernel_func_ret", "launch_op_ret"] + ) + result = KernelReturns( + kernel_func_ret=kernel_ret, launch_op_ret=launch_ret + ) + log().debug(f"Kernel result: {result}, kernel name: {kernel_name}") + return result, kernel_name + + return kernel_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return decorator(dargs[0]) + else: + return decorator diff --git a/python/CuTeDSL/base_dsl/env_manager.py b/python/CuTeDSL/base_dsl/env_manager.py new file mode 100644 index 00000000..ef1fea7a --- /dev/null +++ b/python/CuTeDSL/base_dsl/env_manager.py @@ -0,0 +1,303 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides utilities for the environment variables setup. 
+ +It provides an EnvironmentVarManager, which reads environment variables for the DSL +and caches them for efficient access. + +It also provides utilities to automatically setup a subset of environment variables +based on heuristics. +""" + +import os +import sys +import shutil +import glob +from pathlib import Path +from functools import lru_cache +from typing import Any + +from ..base_dsl.runtime.cuda import get_compute_capability_major_minor +from .utils.logger import log + +IS_WINDOWS = sys.platform == "win32" +CLIB_EXT = ".dll" if IS_WINDOWS else ".so" + +# ============================================================================= +# Environment Variable Helpers +# ============================================================================= + + +@lru_cache(maxsize=None) +def get_str_env_var(var_name, default_value=None): + value = os.getenv(var_name) + return value if value is not None else default_value + + +@lru_cache(maxsize=None) +def get_bool_env_var(var_name, default_value=False): + value = get_str_env_var(var_name) + if value is None: + return default_value + return value not in {"False", "0", ""} + + +@lru_cache(maxsize=None) +def get_int_env_var(var_name, default_value=0): + value = get_str_env_var(var_name) + return int(value) if value and value.isdigit() else default_value + + +def detect_gpu_arch(prefix): + """ + Attempts to detect the machine's GPU architecture. + + Returns: + A string representing the GPU architecture (e.g. "70" for compute capability 7.0), + or a default value(e.g. "sm_100") if the GPU architecture cannot be determined. 
+ """ + arch = (None, None) + try: + arch = get_compute_capability_major_minor() + except Exception as e: + log().info(f"Failed to get CUDA compute capability: {e}") + + if arch == (None, None): + # default to sm_100 + arch = (10, 0) + + major, minor = arch + suffix = "" + if major >= 9 and minor >= 0: + suffix = "a" + elif minor != 0: + # e.g sm_86, belong with sm_80 family + minor = 0 + return f"sm_{major}{minor}{suffix}" + + +def find_libs_in_ancestors(start, target_libs, lib_folder_guesses): + """ + Search ancestor directories for a candidate library folder containing all required libraries. + + Starting from the given path, this function traverses up through each parent directory. + For every ancestor, it checks candidate subdirectories (specified by lib_folder_guesses) + for files that match the required library extension (CLIB_EXT). Library file names are + canonicalized by removing the "lib" prefix from their stem. If a candidate directory contains + all of the required libraries (as specified in target_libs), the function returns a list of + absolute paths to these library files. + + Parameters: + start (str or Path): The starting directory from which to begin the search. + target_libs (iterable of str): A collection of required library names (without the "lib" prefix). + lib_folder_guesses (iterable of str): Relative paths from an ancestor directory that may contain the libraries. + + Returns: + list[str] or None: A list of resolved paths to the required library files if found; otherwise, None. + """ + # Traverse through all parent directories of the resolved starting path. + for ancestor in Path(start).resolve().parents: + # Iterate over each candidate relative directory path. + for rel_path in lib_folder_guesses: + target_dir = ancestor / rel_path + # Skip if the candidate directory does not exist. + if not target_dir.is_dir(): + continue + + # Initialize a list to hold the resolved paths of matching library files. 
+ libs_cand = [] + # Create a set of the remaining libraries we need to find. + remaining_libs = set(target_libs) + + # Iterate over all items in the candidate directory. + for p in target_dir.iterdir(): + # Consider only files with the expected library extension. + if p.suffix == CLIB_EXT: + # Canonicalize the library name by removing the "lib" prefix. + lib_name = p.stem.removeprefix("lib") + # If this library is required, add its resolved path and mark it as found. + if lib_name in remaining_libs: + libs_cand.append(str(p.resolve())) + remaining_libs.remove(lib_name) + + # If all required libraries have been found, return the list of library paths. + if len(remaining_libs) == 0: + return libs_cand + + # Return None if no candidate directory contains all required libraries. + return None + + +def _find_cuda_home(): + """Find the CUDA installation path using a series of heuristic methods. + Methods below are checked in order, and the function returns on first match: + 1. Checking the environment variables CUDA_HOME and CUDA_PATH. + 2. Searching for the 'nvcc' compiler in the system PATH and deriving the path of cuda. + 3. Scanning common installation directories based on the operating system. + - On Windows systems (when IS_WINDOWS is True), it searches in: + C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.* + - On Unix-like systems, it searches in: + /usr/local/cuda* + + Returns: + Optional[str]: The absolute CUDA installation path if found; otherwise, None. + + Note: + The variable IS_WINDOWS is defined in the module scope. 
+ """ + # Guess #1 + cuda_home = get_str_env_var("CUDA_HOME") or get_str_env_var("CUDA_PATH") + if cuda_home is None: + # Guess #2 + nvcc_path = shutil.which("nvcc") + if nvcc_path is not None: + cuda_home = os.path.dirname(os.path.dirname(nvcc_path)) + else: + # Guess #3 + if IS_WINDOWS: + glob_pat = "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*" + else: + glob_pat = "/usr/local/cuda*" + cuda_homes = glob.glob(glob_pat) + if len(cuda_homes) == 0: + cuda_home = "" + else: + cuda_home = cuda_homes[0] + if not os.path.exists(cuda_home): + cuda_home = None + return cuda_home + + +def get_cuda_toolkit_path(): + """ + Get cuda_toolkit_path. It returns get_str_env_var('CUDA_TOOLKIT_PATH') if + set. Otherwise, attempts to discover a valid CUDA toolkit location and + return. If not found, return None. + """ + # Check if the environment variable is already set, if so, return it immediately. + try: + cuda_toolkit_path_existing = get_str_env_var("CUDA_TOOLKIT_PATH") + if cuda_toolkit_path_existing: + return cuda_toolkit_path_existing + + found_cuda_home = _find_cuda_home() + if found_cuda_home: + return found_cuda_home + except Exception as e: + log().info("default_env: exception on get_cuda_toolkit_path", e) + return None + + +def get_prefix_dsl_libs(prefix: str): + """ + Returns get_str_env_var('{prefix}_LIBS') if set. + Otherwise, attempts to discover libs based on heuristics and return + If not found, return None. + """ + # Check if the environment variable is already set, if so, return it immediately. 
+ try: + prefix_libs_existing = get_str_env_var(f"{prefix}_LIBS") + if prefix_libs_existing: + return prefix_libs_existing + + def get_libs_cand(start): + target_libs = { + "mlir_c_runner_utils", + "mlir_runner_utils", + "mlir_cuda_runtime", + } + lib_folder_guesses = [ + "lib", + ] + + libs_cand = find_libs_in_ancestors(start, target_libs, lib_folder_guesses) + if libs_cand: + dsl_libs = ":".join(libs_cand) + return dsl_libs + + return None + + # find from install folder + dsl_libs = get_libs_cand(__file__) + + if not dsl_libs: + # try to find from build folder structure + dsl_libs = get_libs_cand(Path(__file__).parent.parent.resolve()) + + return dsl_libs + + except Exception as e: + log().info(f"default_env: exception on get_prefix_dsl_libs", e) + return None + + +class EnvironmentVarManager: + """Manages environment variables for configuration options. + + Printing options: + - [DSL_NAME]_LOG_TO_CONSOLE: Print logging to stderr (default: False) + - [DSL_NAME]_PRINT_AFTER_PREPROCESSOR: Print after preprocess (default: False) + - [DSL_NAME]_PRINT_IR: Print generated IR (default: False) + - [DSL_NAME]_FILTER_STACKTRACE: Filter internal stacktrace (default: True) + File options: + - [DSL_NAME]_KEEP_IR: Save generated IR in a file (default: False) + - [DSL_NAME]_LOG_TO_FILE: Store all logging into a file, excluding COMPILE_LOGS (default: False) + Other options: + - [DSL_NAME]_LOG_LEVEL: Logging level to set, for LOG_TO_CONSOLE or LOG_TO_FILE (default: 1). 
+ - [DSL_NAME]_DRYRUN: Generates IR only (default: False) + - [DSL_NAME]_ARCH: GPU architecture (default: "sm_100") + - [DSL_NAME]_WARNINGS_AS_ERRORS: Enable warnings as error (default: False) + - [DSL_NAME]_WARNINGS_IGNORE: Ignore warnings (default: False) + - [DSL_NAME]_JIT_TIME_PROFILING: Whether or not to profile the IR generation/compilation/execution time (default: False) + - [DSL_NAME]_DISABLE_FILE_CACHING: Disable file caching (default: False) + - [DSL_NAME]_FILE_CACHING_CAPACITY: Limits the number of the cache save/load files (default: 1000) + - [DSL_NAME]_LIBS: Path to dependent shared libraries (default: None) + - [DSL_NAME]_NO_SOURCE_LOCATION: Generate source location (default: False) + """ + + def __init__(self, prefix="DSL"): + self.prefix = prefix # change if needed + + # Printing options + self.log_to_console = get_bool_env_var(f"{prefix}_LOG_TO_CONSOLE", False) + self.print_after_preprocessor = get_bool_env_var( + f"{prefix}_PRINT_AFTER_PREPROCESSOR", False + ) + self.printIR = get_bool_env_var(f"{prefix}_PRINT_IR", False) + self.filterStacktrace = get_bool_env_var(f"{prefix}_FILTER_STACKTRACE", True) + # File options + self.keepIR = get_bool_env_var(f"{prefix}_KEEP_IR", False) + self.log_to_file = get_bool_env_var(f"{prefix}_LOG_TO_FILE", False) + # Other options + self.log_level = get_int_env_var(f"{prefix}_LOG_LEVEL", 1) + self.dryrun = get_bool_env_var(f"{prefix}_DRYRUN", False) + self.arch = get_str_env_var(f"{prefix}_ARCH", detect_gpu_arch(prefix)) + self.warnings_as_errors = get_bool_env_var( + f"{prefix}_WARNINGS_AS_ERRORS", False + ) + self.warnings_ignore = get_bool_env_var(f"{prefix}_WARNINGS_IGNORE", False) + self.jitTimeProfiling = get_bool_env_var(f"{prefix}_JIT_TIME_PROFILING", False) + self.disable_file_caching = get_bool_env_var( + f"{prefix}_DISABLE_FILE_CACHING", False + ) + self.file_caching_capacity = get_int_env_var( + f"{prefix}_FILE_CACHING_CAPACITY", 1000 + ) + self.generate_source_location = not get_bool_env_var( + 
f"{prefix}_NO_SOURCE_LOCATION", False + ) + # set cuda + self.cuda_toolkit = get_cuda_toolkit_path() + + # set mlir shared libraries + self.shared_libs = get_prefix_dsl_libs(prefix) diff --git a/python/CuTeDSL/base_dsl/jit_executor.py b/python/CuTeDSL/base_dsl/jit_executor.py new file mode 100644 index 00000000..2c997be3 --- /dev/null +++ b/python/CuTeDSL/base_dsl/jit_executor.py @@ -0,0 +1,301 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides jit executor related classes +""" +import io +import inspect +import ctypes +import numpy as np +from typing import get_origin + +# Local modules imports +from .utils.timer import timer +from .utils.logger import log +from .common import DSLRuntimeError +from .runtime import cuda as cuda_helpers +from .runtime.jit_arg_adapters import JitArgAdapterRegistry, is_arg_spec_constexpr +from .typing import get_c_pointers +from . 
import typing as t + +# MLIR modules imports +from .._mlir import ir + + +class CudaSingleModule: + def __init__(self, cuda_module, kernel_ptr): + self.cuda_module = cuda_module + self.kernel_ptr = kernel_ptr + + +class CudaModules: + def __init__(self, modules, args): + # list of CudaSingleModule + self.modules = modules + # extra kernel ptr arguments for launch + self.args = args + + +class JitExecutor: + def __init__( + self, + dsl, + engine, + capi_func, + ir_module, + args_spec, + function_name, + cuda_modules: CudaModules = None, + jit_time_profiling=False, + ): + self.dsl = dsl + self.engine = engine + self.capi_func = capi_func + self.ir_module = ir_module + self.args_spec = args_spec + self.function_name = function_name + if args_spec is not None: + self.args_spec = self.filter_runtime_arg_spec(args_spec) + # cuda kernels + self.cuda_modules = cuda_modules + self.jit_time_profiling = jit_time_profiling + + def filter_runtime_arg_spec(self, arg_spec: inspect.FullArgSpec): + runtime_args = [] + runtime_annotations = {} + runtime_defaults = [] + + # Calculate the offset where defaults start in the original args + if arg_spec.defaults: + defaults_start_idx = len(arg_spec.args) - len(arg_spec.defaults) + else: + defaults_start_idx = len(arg_spec.args) + + # Filter arguments and maintain their properties + for i, arg_name in enumerate(arg_spec.args): + arg_type = arg_spec.annotations.get(arg_name, None) + + # Skip compile-time arguments + if is_arg_spec_constexpr(arg_type, arg_name, i, self.function_name): + continue + + # Keep runtime arguments + runtime_args.append(arg_name) + if arg_name in arg_spec.annotations: + runtime_annotations[arg_name] = arg_type + + # Keep corresponding default if it exists + if i >= defaults_start_idx: + default_idx = i - defaults_start_idx + runtime_defaults.append(arg_spec.defaults[default_idx]) + + # Filter kwonlyargs and their defaults + runtime_kwonlyargs = [] + runtime_kwonlydefaults = {} + + if arg_spec.kwonlyargs: + for 
kwarg in arg_spec.kwonlyargs: + arg_type = arg_spec.annotations.get(kwarg, None) + + # Apply same filtering logic + if is_arg_spec_constexpr(arg_type, kwarg, i, self.function_name): + continue + + runtime_kwonlyargs.append(kwarg) + if kwarg in arg_spec.annotations: + runtime_annotations[kwarg] = arg_type + if arg_spec.kwonlydefaults and kwarg in arg_spec.kwonlydefaults: + runtime_kwonlydefaults[kwarg] = arg_spec.kwonlydefaults[kwarg] + + # Convert runtime_defaults to tuple if not empty (as expected by FullArgSpec) + runtime_defaults = tuple(runtime_defaults) if runtime_defaults else None + + return inspect.FullArgSpec( + args=runtime_args, + varargs=arg_spec.varargs, # Keep original varargs + varkw=arg_spec.varkw, # Keep original varkw + defaults=runtime_defaults, + kwonlyargs=runtime_kwonlyargs, + kwonlydefaults=runtime_kwonlydefaults if runtime_kwonlydefaults else None, + annotations=runtime_annotations, + ) + + def __del__(self): + if self.cuda_modules: + cuda_modules = [module.cuda_module for module in self.cuda_modules.modules] + for module in set(cuda_modules): + cuda_helpers.unload_cubin_module(module) + + def generate_execution_args(self, args, kwargs, args_spec: inspect.FullArgSpec): + """ + This function is the prune version of `generate_mlir_function_types` which only generates execution args + to get rid of mlir context. 
+ """ + + # args/kwargs must match arg_specs + # No canonicalization of args/kwargs to avoid extra latency + if len(args) != len(args_spec.args) or len(kwargs) != len(args_spec.kwonlyargs): + raise DSLRuntimeError( + "input args/kwargs length does not match runtime function signature!", + context={ + "input args length": len(args), + "input kwargs length": len(kwargs), + "function signature args length": len(args_spec.args), + "function signature kwonlyargs length": len(args_spec.kwonlyargs), + }, + ) + + exe_args = [] + input_args = [*args, *kwargs.values()] + input_arg_names = [*args_spec.args, *args_spec.kwonlyargs] + for i, arg in enumerate(input_args): + arg_type = args_spec.annotations.get(input_arg_names[i], None) + + # Implicit cast to NumericMeta + if isinstance(arg_type, t.NumericMeta): + arg = t.cast(arg, arg_type) + + # If not any known type, try registered adapter to do the conversion + adapter = JitArgAdapterRegistry.get_registered_adapter(type(arg)) + adapted_arg = adapter(arg) if adapter else arg + exe_args.extend(get_c_pointers(adapted_arg)) + + return exe_args + + def __call__(self, *args, **kwargs): + exe_args = self.generate_execution_args(args, kwargs, self.args_spec) + + self.run_compiled_program(exe_args) + + # Assume each execution args has type `c_void_p` to reduce the overhead of `ctypes.cast`. 
+ def get_invoke_packed_args(self, exe_args): + if self.cuda_modules: + exe_args += self.cuda_modules.args + packed_args = (ctypes.c_void_p * len(exe_args))() + for argNum in range(len(exe_args)): + packed_args[argNum] = exe_args[argNum] + return packed_args + + def run_compiled_program(self, exe_args): + if self.jit_time_profiling: + profiler = timer(enable=True) + try: + packed_args = profiler(self.get_invoke_packed_args)(exe_args) + profiler(self.capi_func)(packed_args) + except Exception as e: + raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e) + else: + try: + packed_args = self.get_invoke_packed_args(exe_args) + self.capi_func(packed_args) + except Exception as e: + raise DSLRuntimeError(f"💥💥💥 Runtime Crash 💥💥💥", cause=e) + + def update_jit_cuda_modules(self, kernel_symbols): + # preload cuda module from compiled cubin in ir and store to jit_executor.kernels. + if len(kernel_symbols) > 0: + extra_args = [] + module = self.ir_module + cuda_kernel_cache = dict() + cuda_driver_version = cuda_helpers.get_driver_version() + for sym in kernel_symbols: + if sym not in cuda_kernel_cache: + log().debug(f"Loading CUDA module for symbol: {sym}") + + # load cuda module/get function pointer from module and cache + def walk_callback(sym, func_sym, cubin_data): + cubin_module = cuda_helpers.load_cubin_module_data(cubin_data) + kernel_ptr = cuda_helpers.get_kernel_function( + cubin_module, func_sym + ) + # Enable non-portable cluster size for CUDA version 11.8 or higher. + if cuda_driver_version >= 11080: + cuda_helpers.set_kernel_attribute( + kernel_ptr, + cuda_helpers.cuda.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1, + ) + cuda_kernel_cache[sym] = CudaSingleModule( + cubin_module, kernel_ptr + ) + + self.walk_module_and_get_cubin_data(module, sym, walk_callback) + else: + log().debug(f"Symbol {sym} already in cache") + # check if kernel is empty. 
+ if sym in cuda_kernel_cache: + extra_args.append( + ctypes.c_void_p(cuda_kernel_cache[sym].kernel_ptr.getPtr()) + ) + # store to the jit result if jit result is cached. + self.cuda_modules = CudaModules(cuda_kernel_cache.values(), extra_args) + + return self + + def _get_escaped_cubin_bytes(self, cubin_data): + """This function escapes cubin data from mlir raw bytecode to executable binary bytes""" + + def ishex(inp): + return ( + inp in range(0x30, 0x3A) + or inp in range(0x61, 0x67) + or inp in range(0x41, 0x47) + ) + + converted = bytearray() + idx = 0 + while idx < len(cubin_data): + # escape the original bytes + if cubin_data[idx] == 0x5C: + # if data of idx is b'\\' + if ishex(cubin_data[idx + 1]) and ishex(cubin_data[idx + 2]): + converted += bytearray.fromhex( + cubin_data[idx + 1 : idx + 3].decode() + ) + idx += 3 + elif cubin_data[idx + 1] == 0x5C: + converted.append(cubin_data[idx]) + idx += 2 + else: + # no escape, directly write + converted.append(cubin_data[idx]) + idx += 1 + return bytes(converted) + + def walk_module_and_get_cubin_data(self, module, sym, callback): + """This function is used to walk gpu binary op, extract the cubin inside, and process cubin data with callback.""" + + def walk_gpu_binary_op(op): + if op.name != "gpu.binary": + return ir.WalkResult.ADVANCE + s = io.BytesIO() + op.write_bytecode(s) + cubin_data = s.getvalue() + if sym.encode() not in cubin_data: + return ir.WalkResult.ADVANCE + + if ( + "kernels" != op.opview.sym_name.value + and sym != op.opview.sym_name.value + ): + return ir.WalkResult.ADVANCE + # function symbol of kernel(gpu.launch_func) is equal to sym name in mlir + func_sym = sym + if sym == op.opview.sym_name.value and not sym.endswith("_kernel"): + func_sym = sym.rsplit("_", 1)[0] + + cubin_data = cubin_data.split(b'bin = "')[1].split(b'">')[0] + cubin_data = self._get_escaped_cubin_bytes(cubin_data) + callback(sym, func_sym, cubin_data) + return ir.WalkResult.ADVANCE + + 
module.operation.walk(walk_gpu_binary_op) diff --git a/python/CuTeDSL/base_dsl/runtime/__init__.py b/python/CuTeDSL/base_dsl/runtime/__init__.py new file mode 100644 index 00000000..6f8e2feb --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a runtime utility functions that are needed for +the DSL. +""" + +from . import device_tensor +from . import dlpack_types +from . import cuda +from . import tensor_descriptor +from . import jit_arg_adapters + +__all__ = [ + "device_tensor", + "dlpack_types", + "cuda", + "tensor_descriptor", + "jit_arg_adapters", +] diff --git a/python/CuTeDSL/base_dsl/runtime/cuda.py b/python/CuTeDSL/base_dsl/runtime/cuda.py new file mode 100644 index 00000000..c4f88b58 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/cuda.py @@ -0,0 +1,470 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +""" +This module provides CUDA Python helper functions +""" + + +from functools import lru_cache +from dataclasses import dataclass +from typing import List, Optional +import numpy as np +import os +import ctypes + +import cuda.bindings.driver as cuda +import cuda.bindings.nvrtc as nvrtc + +# MLIR imports +from ..._mlir import ir +from ..._mlir.dialects import gpu + +# Local module imports +from ..utils.logger import log as _log +from ..common import * +from .jit_arg_adapters import JitArgAdapterRegistry + + +# ============================================================================= +# Utils +# ============================================================================= + + +def _cudaGetErrorEnum(error): + if isinstance(error, cuda.CUresult): + err, name = cuda.cuGetErrorName(error) + return name if err == cuda.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise DSLRuntimeError("Unknown error type: {}".format(error)) + + +def _get_gpu_arch_info(major, minor): + """Get GPU architecture information and compatibility details.""" + gpu_arch_map = { + (7, 0): ("Volta", "sm_70", ["sm_70"]), # V100 + (7, 5): ("Turing", "sm_75", ["sm_75"]), # RTX 20 Series, Quadro RTX + (8, 0): ("Ampere", "sm_80", ["sm_80"]), # A100 + (8, 6): ("Ampere", "sm_86", ["sm_86", "sm_80"]), # RTX 30 Series + (8, 9): ("Ada", "sm_89", ["sm_89", "sm_86"]), # RTX 40 Series + (8, 7): ("Ampere", "sm_87", ["sm_87", "sm_86", "sm_80"]), # A10, A40 + (9, 0): ("Hopper", "sm_90a", ["sm_90a"]), # H100 + (10, 0): ("Blackwell", "sm_100a", ["sm_100a"]), # B200 + } + return gpu_arch_map.get( + (major, minor), ("Unknown", f"sm_{major}{minor}", [f"sm_{major}{minor}"]) + ) + + +def get_compute_capability_major_minor(device_id: int = 0): + """ + Returns the compute capability of the CUDA device as a tuple of (major, minor). + For example: (8, 0) for Ampere, (9, 0) for Hopper, (10, 0) for Blackwell. + Returns None on failure. 
+ """ + try: + checkCudaErrors(cuda.cuInit(0)) + device = checkCudaErrors(cuda.cuDeviceGet(device_id)) + major = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device, + ) + ) + minor = checkCudaErrors( + cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device, + ) + ) + return major, minor + except RuntimeError as e: + _log().info(f"Failed to get CUDA compute capability: {e}") + return None, None + + +@dataclass +class DeviceInfo: + """Data class to store CUDA device information.""" + + device_count: int = 0 + current_device: int = 0 + device_name: Optional[str] = None + major_version: Optional[int] = None + minor_version: Optional[int] = None + arch_name: Optional[str] = None + sm_arch: Optional[str] = None + compatible_archs: Optional[List[str]] = None + memory_gb: Optional[float] = None + target_arch: Optional[str] = None + error_message: Optional[str] = None + initialization_failed: bool = False + + def pretty_str(self) -> str: + """ + Convert DeviceInfo to a formatted string for display. 
+ """ + info = "" + + if self.initialization_failed: + return f"{Colors.BOLD}- CUDA initialization failed{Colors.RESET}" + + if self.error_message: + return f"{Colors.BOLD}- Failed to get GPU info: {self.error_message}{Colors.RESET}" + + if self.device_count > 0: + info += f"{Colors.BOLD}- CUDA devices available: {self.device_count} (current: {self.current_device})\n" + + if self.major_version is not None and self.minor_version is not None: + info += f"- Architecture: {Colors.BLUE}{self.arch_name}{Colors.RESET} ({Colors.GREEN}{self.sm_arch}{Colors.RESET})\n" + info += f"- Compatible SM archs: {Colors.GREEN}{', '.join(self.compatible_archs or [])}{Colors.RESET}\n" + + if self.memory_gb is not None: + info += f"- Total Memory: {Colors.BLUE}{self.memory_gb:.2f} GB{Colors.RESET}\n" + + else: + info += f"- Compute capability: unknown\n" + info += f"- SM arch: unknown{Colors.RESET}\n" + else: + info += f"- No devices available\n" + + return info + + +def get_device_info() -> DeviceInfo: + """ + Get detailed information about CUDA devices. + Returns a DeviceInfo dataclass with device information. 
+ """ + device_info = DeviceInfo() + + # Initialize CUDA if not already initialized + try: + result = cuda.cuInit(0) + if result[0].value: # Check for error + device_info.initialization_failed = True + return device_info + except: + pass + + try: + # Get device count + result = cuda.cuDeviceGetCount() + device_info.device_count = result[1] if result[0].value == 0 else 0 + + if device_info.device_count > 0: + # Get current device + try: + result = cuda.cuCtxGetDevice() + if result[0].value == 0: + device_info.current_device = result[1] + except: + pass + + # Get device name + try: + name_result = cuda.cuDeviceGetName(100, device_info.current_device) + if name_result[0].value == 0: + device_info.device_name = name_result[1] + except: + pass + + # Get compute capability and architecture info + try: + major, minor = get_compute_capability_major_minor( + device_info.current_device + ) + + # Check if we successfully got the compute capability + if major is not None and minor is not None: + device_info.major_version = major + device_info.minor_version = minor + + arch_name, sm_arch, compatible_archs = _get_gpu_arch_info( + device_info.major_version, device_info.minor_version + ) + + device_info.arch_name = arch_name + device_info.sm_arch = sm_arch + device_info.compatible_archs = compatible_archs + + # Get memory info + try: + total_mem = cuda.cuDeviceGetAttribute( + cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TOTAL_MEMORY, + device_info.current_device, + ) + if total_mem[0].value == 0: + device_info.memory_gb = total_mem[1] / ( + 1024 * 1024 * 1024 + ) # Convert to GB + except: + pass + + except Exception as e: + pass # Compute capability info will remain None + + except Exception as e: + device_info.error_message = str(e) + + return device_info + + +def checkCudaErrors(result): + """Check CUDA errors and provide detailed error messages.""" + if result[0].value: + error_code = result[0].value + error_name = _cudaGetErrorEnum(result[0]) + + raise 
DSLCudaRuntimeError(error_code, error_name) + + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + +# ============================================================================= +# Driver Helpers +# ============================================================================= + + +@lru_cache(maxsize=1) +def initialize_cuda_context(device_id: int = 0, flags: int = 0): + """ + Initializes the CUDA context for a specified device. + """ + # Initialize CUDA Driver API + _log().info(f"cuInit {flags}") + checkCudaErrors(cuda.cuInit(flags)) + # Retrieve handle for device + _log().info(f"cuDeviceGet {device_id}") + cuDevice = checkCudaErrors(cuda.cuDeviceGet(device_id)) + _log().info(f"{cuDevice} <-- cuDeviceGet") + # Create context + _log().info(f"cuCtxCreate {0} {cuDevice}") + context = checkCudaErrors(cuda.cuCtxCreate(0, cuDevice)) + _log().info(f"{context} <-- cuCtxCreate") + + return context + + +def load_cubin_module(cubin_file): + """ + Loads a CUBIN file and returns the module. + """ + # Load CUBIN file as binary data + _log().info(f"read cubin {cubin_file}") + with open(cubin_file, "rb") as f: + cubin_data = f.read() + # Load module data + _log().info(f"cuModuleLoadData {np.char.array(cubin_data).ctypes.data}") + module = checkCudaErrors( + cuda.cuModuleLoadData(np.char.array(cubin_data).ctypes.data) + ) + return module + + +def unload_cubin_module(module): + """ + Unloads a CUBIN module. + """ + _log().info(f"cuModuleUnload {module}") + checkCudaErrors(cuda.cuModuleUnload(module)) + + +def load_cubin_module_data(cubin_data): + """ + Loads a CUBIN from data and returns the module. + """ + # Load module data + _log().info(f"cuModuleLoadData {np.char.array(cubin_data).ctypes.data}") + module = checkCudaErrors( + cuda.cuModuleLoadData(np.char.array(cubin_data).ctypes.data) + ) + return module + + +def get_kernel_function(module, kernel_name): + """ + Retrieves the kernel function from the module. 
+ """ + _log().info(f"cuModuleGetFunction {module} {kernel_name}") + kernel = checkCudaErrors( + cuda.cuModuleGetFunction(module, bytes(kernel_name, "utf-8")) + ) + _log().info(f"{kernel} <-- cuModuleGetFunction") + return kernel + + +def launch_kernel(kernel, grid_dims, block_dims, stream, smem_size=0, kernel_args=None): + """ + Launches the CUDA kernel. + """ + _log().info( + f"cuLaunchKernel {kernel} grid={grid_dims} blocks={block_dims} smem_size={smem_size} stream={stream} {kernel_args}" + ) + checkCudaErrors( + cuda.cuLaunchKernel( + kernel, + grid_dims[0], + grid_dims[1], + grid_dims[2], + block_dims[0], + block_dims[1], + block_dims[2], + smem_size, # Shared memory size + stream, + kernel_args, + 0, # Extra parameters + ) + ) + + +def stream_sync(stream): + """ + Synchronizes the CUDA stream. + """ + _log().info(f"cuStreamSynchronize {stream}") + checkCudaErrors(cuda.cuStreamSynchronize(stream)) + + +def stream_create(id=0): + """ + Creates the CUDA stream. + """ + _log().info(f"cuStreamCreate {id}") + stream = checkCudaErrors(cuda.cuStreamCreate(id)) + _log().info(f"{stream} <-- cuStreamCreate") + return stream + + +def stream_destroy(stream): + """ + Destroys the CUDA stream. + """ + _log().info(f"cuStreamDestroy {stream}") + checkCudaErrors(cuda.cuStreamDestroy(stream)) + + +def context_destroy(context): + """ + Destroys the CUDA context. + """ + _log().info(f"cuCtxDestroy {context}") + checkCudaErrors(cuda.cuCtxDestroy(context)) + + +def allocate(size_in_bytes: int, stream=None): + """ + Allocate device memory based on numpy host array size. 
+ """ + _log().info("Allocate size_in_bytes=[%s] stream=[%s]", size_in_bytes, stream) + if stream is None: + device_memory = checkCudaErrors(cuda.cuMemAlloc(size_in_bytes)) + else: + device_memory = checkCudaErrors(cuda.cuMemAllocAsync(size_in_bytes, stream)) + _log().info("Allocated [%s]", device_memory) + return device_memory + + +def deallocate(device_pointer, stream=None): + """ + Deallocate the specified device memory pointer. + """ + _log().info( + "Deallocate device_pointer=[%s] stream=[%s]", hex(int(device_pointer)), stream + ) + if stream is None: + checkCudaErrors(cuda.cuMemFree(device_pointer)) + else: + checkCudaErrors(cuda.cuMemFreeAsync(device_pointer, stream)) + + +def memcpy_h2d(host_pointer, device_pointer, size_in_bytes, stream=None): + """ + Copy data from host to device memory. + """ + _log().info( + "Copy host-to-device host_pointer[%s] device_ptr=[%s] size_in_bytes=[%s] stream=[%s]", + hex(host_pointer), + hex(int(device_pointer)), + size_in_bytes, + stream, + ) + if stream is None: + checkCudaErrors(cuda.cuMemcpyHtoD(device_pointer, host_pointer, size_in_bytes)) + else: + checkCudaErrors( + cuda.cuMemcpyHtoDAsync(device_pointer, host_pointer, size_in_bytes, stream) + ) + + +def memcpy_d2h(host_pointer, device_pointer, size_in_bytes, stream=None): + """ + Copy data from device to host memory. + """ + _log().info( + "Copy device-host-to device_pointer=[%s] host_pointer[%s] size_in_bytes=[%s] stream=[%s]", + hex(int(device_pointer)), + hex(host_pointer), + size_in_bytes, + stream, + ) + if stream is None: + checkCudaErrors(cuda.cuMemcpyDtoH(host_pointer, device_pointer, size_in_bytes)) + else: + checkCudaErrors( + cuda.cuMemcpyDtoHAsync(host_pointer, device_pointer, size_in_bytes, stream) + ) + + +def default_stream(): + return cuda.CUstream(0) + + +def get_driver_version(): + """ + Returns the CUDA driver version. 
+ """ + return checkCudaErrors(cuda.cuDriverGetVersion()) + + +def set_kernel_attribute(kernel, attribute, value): + """ + Sets a CUDA kernel attribute. + """ + return checkCudaErrors(cuda.cuFuncSetAttribute(kernel, attribute, value)) + + +@JitArgAdapterRegistry.register_jit_arg_adapter(cuda.CUstream) +class StreamAdapter: + """ + Convert a CUDA stream to a stream representation for JIT arg generation. + """ + + def __init__(self, arg): + self._arg = arg + self._c_pointer = ctypes.cast(self._arg.getPtr(), ctypes.c_void_p) + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + return values[0] + + def __c_pointers__(self): + return [self._c_pointer] + + def __get_mlir_types__(self): + return [gpu.AsyncTokenType.get()] diff --git a/python/CuTeDSL/base_dsl/runtime/device_tensor.py b/python/CuTeDSL/base_dsl/runtime/device_tensor.py new file mode 100644 index 00000000..5addb275 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/device_tensor.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import copy + +from . import cuda as cuda_helpers +from .tensor_descriptor import * +from ..common import * + + +def allocate(tensor: TensorDescriptor, stream=None): + """ + Allocates GPU memory + """ + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." 
+ ) + if not tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is already allocated on the device.") + + tensor.device_pointer = cuda_helpers.allocate(tensor.size_in_bytes, stream) + + log().info("Allocate done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + + +def deallocate(tensor: TensorDescriptor, stream=None): + """ + Deallocates GPU memory + """ + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." + ) + if tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is not allocated on the device.") + + log().info( + "Deallocating done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer + ) + + cuda_helpers.deallocate(tensor.device_pointer, stream) + tensor.device_pointer = None + + +def copy_to_gpu(tensor: TensorDescriptor, do_allocate=True, stream=None): + """ + Copies data from host memory to the GPU memory. + If do_allocate is True, it first calls allocate + """ + log().info("copyin tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + if do_allocate: + allocate(tensor, stream) + cuda_helpers.memcpy_h2d( + tensor.data_ptr, tensor.device_pointer, tensor.size_in_bytes, stream + ) + log().info("copyin done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + return tensor + + +def copy_from_gpu(tensor: TensorDescriptor, do_deallocate=True, stream=None): + """ + Copies data from GPU memory back to the host. + If do_deallocate is True, it calls deallocate + """ + log().info("copyout tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + if tensor._check_is_managed_by_framework(): + raise DSLRuntimeError( + "GPU tensors are managed by the framework and cannot be modified." 
+ ) + if tensor.device_pointer is None: + raise DSLRuntimeError("Tensor is not allocated on the device.") + + cuda_helpers.memcpy_d2h( + tensor.data_ptr, tensor.device_pointer, tensor.size_in_bytes, stream + ) + if do_deallocate: + deallocate(tensor, stream) + log().info("copyout done tensor=[%s] dev_ptr=[%s]", tensor, tensor.device_pointer) + + +def to_gpu(tensor, stream=None) -> TensorDescriptor: + """ + Copies the tensor to the GPU memory from Host memory + """ + if isinstance(tensor, TensorDescriptor): + new_tensor = copy.copy(tensor) + copy_to_gpu(new_tensor, stream=stream) + return new_tensor + + if TensorDescriptor.can_transformed_to_dlpack(tensor): + new_tensor = TensorDescriptor(tensor) + copy_to_gpu(new_tensor, stream=stream) + return new_tensor + + raise DSLRuntimeError("Unsupported type") + + +def from_gpu(tensor, stream=None) -> TensorDescriptor: + """ + Copies the tensor to the GPU memory from Host memory + """ + if isinstance(tensor, TensorDescriptor): + new_tensor = copy.copy(tensor) + copy_from_gpu(new_tensor, stream=stream) + return new_tensor + + if TensorDescriptor.can_transformed_to_dlpack(tensor): + new_tensor = TensorDescriptor(tensor) + copy_from_gpu(new_tensor, stream=stream) + return new_tensor + + raise DSLRuntimeError("Unsupported type") diff --git a/python/CuTeDSL/base_dsl/runtime/dlpack_types.py b/python/CuTeDSL/base_dsl/runtime/dlpack_types.py new file mode 100644 index 00000000..168c2a99 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/dlpack_types.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides helper structs for dlpack. +DLPack is an open standard for in-memory tensor structures, enabling +seamless sharing of tensors across different frameworks. +Learn more at: https://github.com/dmlc/dlpack +""" + +import ctypes +import enum + + +class DLDeviceType(enum.IntEnum): + """Enums for device types based on the DLPack specification.""" + + kDLCPU = 1 + kDLGPU = 2 + kDLCPUPinned = 3 + + +class DLDataTypeCode: + """Enums for data type codes based on the DLPack specification. + + see https://github.com/dmlc/dlpack/blob/main/include/dlpack/dlpack.h + """ + + kDLInt = 0 + kDLUInt = 1 + kDLFloat = 2 + kDLOpaqueHandle = 3 + kDLBfloat = 4 + kDLComplex = 5 + kDLBool = 6 + + +class DLDevice(ctypes.Structure): + """Structure representing the device information in DLPack.""" + + _fields_ = [ + ("device_type", ctypes.c_int), # kDLCPU, kDLGPU, etc. 
+ ("device_id", ctypes.c_int), # Device ID (e.g., GPU ID) + ] + + +class DLDataType(ctypes.Structure): + """Structure representing the data type in DLPack.""" + + _fields_ = [ + ("code", ctypes.c_uint8), # Data type code (e.g., kDLFloat) + ("bits", ctypes.c_uint8), # Number of bits per value + ("lanes", ctypes.c_uint16), # Number of lanes + ] + + +class DLTensor(ctypes.Structure): + """Structure representing the DLTensor in DLPack.""" + + _fields_ = [ + ("data", ctypes.c_void_p), # Pointer to tensor data + ("device", DLDevice), # Device info + ("ndim", ctypes.c_int), # Number of dimensions + ("dtype", DLDataType), # Data type + ("shape", ctypes.POINTER(ctypes.c_int64)), # Shape of tensor + ("strides", ctypes.POINTER(ctypes.c_int64)), # Strides of tensor + ("byte_offset", ctypes.c_uint64), # Byte offset to tensor data + ] diff --git a/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py b/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py new file mode 100644 index 00000000..eb998d16 --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/jit_arg_adapters.py @@ -0,0 +1,188 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides runtime utilities for JIT argument conversion in DSL. 
+""" + +from functools import wraps +from typing import get_origin + +# Local modules imports +from ..common import DSLRuntimeError +from ..typing import ( + Constexpr, + Int32, + Float32, + Boolean, +) + + +def is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func): + """ + Check if the argument spec is a constexpr. + """ + + def _is_reserved_python_func_arg(arg_index, arg_name, func): + """ + Check if the argument is a reserved python function argument. + """ + + if arg_index != 0: + return False + + if arg_name == "self": + return True + + is_classmethod = isinstance(func, classmethod) or ( + hasattr(func, "__func__") and isinstance(func.__func__, classmethod) + ) + return arg_name == "cls" and is_classmethod + + return ( + _is_reserved_python_func_arg(arg_index, arg_name, owning_func) + or (isinstance(arg_spec, type) and issubclass(arg_spec, Constexpr)) + or (get_origin(arg_spec) is Constexpr) + ) + + +def is_argument_constexpr(arg, arg_spec, arg_name, arg_index, owning_func): + """ + Check if the argument is a constexpr. + """ + + def _is_type_argument(arg, arg_annotation): + """ + Check if the argument is a type argument like Type[X] + """ + + return isinstance(arg, type) and ( + arg_annotation is None or get_origin(arg_annotation) is type + ) + + return ( + is_arg_spec_constexpr(arg_spec, arg_name, arg_index, owning_func) + or _is_type_argument(arg, arg_spec) + or arg is None + ) + + +class JitArgAdapterRegistry: + """ + A registry to keep track of the JIT argument adapters. + + An adapter is a callable that converts a Python type to a type with following protocols supported: + - JitArgument + - DynamicExpression + The converted type can then be further processed by DSL to generate arguments for JIT functions. 
+ """ + + # A dictionary with key=type and value=callable + jit_arg_adapter_registry = {} + + @classmethod + def register_jit_arg_adapter(cls, *dargs, **dkwargs): + """ + Register a JIT argument adapter callable + + This can be used as a decorator on any callable like: + + @register_jit_arg_adapter(my_py_type) + def my_adapter_for_my_py_type(arg): + ... + + @register_jit_arg_adapter(my_py_type) + class MyAdapterForMyPythonType: + ... + + The adapters are registered per type. If a type is already registerd, an error will be raised. + """ + + def decorator(*dargs, **dkwargs): + darg_python_ty = dargs[0] + + @wraps(darg_python_ty) + def wrapper(*args, **kwargs): + if len(args) != 1 or not callable(args[0]): + raise DSLRuntimeError( + "a callable must be provided for registering JIT argument adapter" + ) + adapter = args[0] + + if darg_python_ty in cls.jit_arg_adapter_registry: + raise DSLRuntimeError( + f"JIT argument adapter for {darg_python_ty} is already registered!", + context={ + "Registered adapter": cls.jit_arg_adapter_registry[ + darg_python_ty + ], + "Adapter to be registered": adapter, + }, + ) + cls.jit_arg_adapter_registry[darg_python_ty] = adapter + return adapter + + return wrapper + + if len(dargs) > 0: + return decorator(*dargs, **dkwargs) + else: + raise DSLRuntimeError( + "a Python type must be provided for registering JIT argument adapter" + ) + + @classmethod + def get_registered_adapter(cls, ty): + """ + Get the registered JIT argument adapter for the given type. 
+ """ + return cls.jit_arg_adapter_registry.get(ty, None) + + +# ============================================================================= +# JIT Argument Adapters +# ============================================================================= + + +@JitArgAdapterRegistry.register_jit_arg_adapter(int) +@JitArgAdapterRegistry.register_jit_arg_adapter(float) +@JitArgAdapterRegistry.register_jit_arg_adapter(bool) +def _convert_python_scalar(arg): + """ + Convert a Python scalar to a DSL type. + """ + conversion_map = { + int: Int32, + float: Float32, + bool: Boolean, + } + return conversion_map.get(type(arg))(arg) + + +@JitArgAdapterRegistry.register_jit_arg_adapter(tuple) +@JitArgAdapterRegistry.register_jit_arg_adapter(list) +def _convert_python_sequence(arg): + """ + Go through each element in the sequence and convert it to a type that can be + further processed by DSL to generate the corresponding JIT argument(s). + """ + adapted_arg = [] + for elem in arg: + adapter = JitArgAdapterRegistry.get_registered_adapter(type(elem)) + if adapter is not None: + converted_elem = adapter(elem) + adapted_arg.append(converted_elem) + else: + # If no registered adapter is found, just return the original element + adapted_arg.append(elem) + + assert len(adapted_arg) == len(arg) + return type(arg)(adapted_arg) diff --git a/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py b/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py new file mode 100644 index 00000000..b09d2fcb --- /dev/null +++ b/python/CuTeDSL/base_dsl/runtime/tensor_descriptor.py @@ -0,0 +1,201 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +# Helpers +import itertools, operator +import ctypes +from . import dlpack_types as _dpack +from .dlpack_runtime import ( + dlpack_to_tensor_desc, + get_tensor_desc_data_ptr, + get_tensor_desc_is_in_device, + get_tensor_desc_element_type, + get_tensor_desc_shape, + get_tensor_desc_stride, + get_tensor_desc_element_size_in_bytes, + get_tensor_desc_ndim, + get_tensor_desc_dtype_code, + get_tensor_desc_dtype_bits, + get_tensor_desc_device_type, + get_tensor_desc_device_id, +) + +from ..utils.logger import log +from ..common import * +from ..typing import ( + Boolean, + Float8E5M2, + Int64, + Int32, + Int16, + Int8, + Uint64, + Uint32, + Uint16, + Uint8, + Float64, + Float32, + Float16, + BFloat16, +) + + +class TensorDescriptor: + def __init__(self, tensor): + """Initialize with a tensor that supports the DLPack protocol. 
+ + Args: + tensor: Any tensor object that implements __dlpack__ and __dlpack_device__ + """ + + self.tensor = tensor + self._capsule = dlpack_to_tensor_desc(tensor) + + self.data_ptr = get_tensor_desc_data_ptr(self._capsule) + self.device_type = get_tensor_desc_device_type(self._capsule) + self.device_type = _dpack.DLDeviceType(self.device_type) + + if self.device_type == _dpack.DLDeviceType.kDLGPU: + self.device_pointer = self.data_ptr + elif self.device_type == _dpack.DLDeviceType.kDLCPU: + self.device_pointer = None + else: + raise DSLRuntimeError( + f"DLPack device type is not supported {self.dl_tensor.device.device_type}" + ) + + log().info("TensorDescriptor is created = [%s]", self) + + @staticmethod + def can_transformed_to_dlpack(dl_tensor): + if not hasattr(dl_tensor, "__dlpack__") or not hasattr( + dl_tensor, "__dlpack_device__" + ): + return False + return True + + @property + def is_in_device(self): + """Check if the tensor is stored on a device.""" + return not self.device_pointer is None + + @property + def device_id(self): + """Return device id where tensor resides.""" + if self.is_in_device: + return get_tensor_desc_device_id(self._capsule) + return -1 + + @property + def element_type(self): + """Return the corresponding Python type based on DLPack dtype metadata.""" + str_element_type = get_tensor_desc_element_type(self._capsule) + dtype_map = { + # bool is 8bit from numpy and torch + "Bool": Boolean, + "Int64": Int64, + "Int32": Int32, + "Int16": Int16, + "Int8": Int8, + "UInt64": Uint64, + "UInt32": Uint32, + "UInt16": Uint16, + "UInt8": Uint8, + "Float64": Float64, + "Float32": Float32, + "Float16": Float16, + "BFloat16": BFloat16, + "Float8E5M2": Float8E5M2, + } + + if str_element_type not in dtype_map: + raise KeyError( + f"Unsupported element type in dlpack: '{str_element_type}'. 
Supported types are: {list(dtype_map.keys())}" + ) + + return dtype_map[str_element_type] + + @property + def shape(self): + """Return the shape of the tensor.""" + return get_tensor_desc_shape(self._capsule) + + @property + def rank(self): + """Return the rank of the tensor.""" + return get_tensor_desc_ndim(self._capsule) + + @property + def strides(self): + """Return the rank of the tensor.""" + return get_tensor_desc_stride(self._capsule) + + @property + def element_size_in_bytes(self): + """Calculate the element size in bytes of the DLPack tensor.""" + return get_tensor_desc_element_size_in_bytes(self._capsule) + + @property + def size_in_bytes(self): + """Calculate the total size in bytes of the DLPack tensor.""" + # Calculate the number of elements using the shape + ndim = get_tensor_desc_ndim(self._capsule) + shape = get_tensor_desc_shape(self._capsule) + num_elements = 1 + for i in range(ndim): + num_elements *= shape[i] + + # Total bytes + total_bytes = self.element_size_in_bytes * num_elements + return total_bytes + + def __str__(self): + """Return a compact string representation of the device_tensor with a tensor prefix.""" + # Extract shape + shape = "x".join(map(str, self.shape)) + + # Extract dtype + dtype_code = get_tensor_desc_dtype_code(self._capsule) + dtype_bits = get_tensor_desc_dtype_bits(self._capsule) + dtype = ( + f"i{dtype_bits}" + if dtype_code == _dpack.DLDataTypeCode.kDLInt + else f"f{dtype_bits}" + ) + + # Extract device + device_type = "cpu" if not self.is_in_device else "gpu" + + return f"tensor<{shape}x{dtype}>_{device_type}" + + def _check_is_managed_by_framework(self): + """ + Ensure the tensor is not managed by the framework (e.g., GPU tensor). + Raises an exception if the tensor is framework-managed. 
+ """ + return self.device_type == _dpack.DLDeviceType.kDLGPU + + +def from_tensor(tensor) -> TensorDescriptor: + """Create a TensorDescriptor from a tensor object.""" + return TensorDescriptor(tensor) + + +def to_tensor(tensor_descriptor: TensorDescriptor): + """Return tensor object from tensor descriptor.""" + return tensor_descriptor.tensor + + +def is_tensor_descriptor(maybe_tensor_descriptor) -> bool: + """Check if the object is a TensorDescriptor.""" + return isinstance( + maybe_tensor_descriptor, TensorDescriptor + ) or TensorDescriptor.can_transformed_to_dlpack(maybe_tensor_descriptor) diff --git a/python/CuTeDSL/base_dsl/typing.py b/python/CuTeDSL/base_dsl/typing.py new file mode 100644 index 00000000..7fc2b4d7 --- /dev/null +++ b/python/CuTeDSL/base_dsl/typing.py @@ -0,0 +1,1897 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import ctypes +import numpy as np +import operator +from typing_extensions import deprecated +from functools import reduce +from typing import ( + Generic, + Protocol, + Union, + Any, + List, + Type, + TypeVar, + overload, + runtime_checkable, + get_origin, +) +from types import FunctionType +from dataclasses import dataclass +from abc import ABC, abstractmethod + +from .common import * +from .ast_helpers import const_expr +from ._mlir_helpers import arith as arith_helper, lru_cache_ir +from ._mlir_helpers.arith import ArithValue + +from .._mlir import ir +from .._mlir.extras import types as T +from .._mlir.dialects import arith, math + +# ============================================================================= +# Dynamic Expression Protocol +# ============================================================================= + + +@runtime_checkable +class DynamicExpression(Protocol): + """ + This is a protocol class that provides a common interface + to generate user-defined dynamic expressions. + + The DSL checks this protocol to determine if a class is a dynamic expression (SSA value) or not. + """ + + def __extract_mlir_values__(self): + """ + Generate a dynamic expression for the current object. + + :return: List of MLIR values + :rtype: List[ir.Value] + """ + raise NotImplementedError + + def __new_from_mlir_values__(self, values): + """ + Create a new object from MLIR values. + + :param values: List of MLIR values + :type values: List[ir.Value] + :return: A new instance of the class that implements this protocol + :rtype: Any + """ + raise NotImplementedError + + +@runtime_checkable +class JitArgument(Protocol): + """ + This is a protocol class that provides a common interface + for JIT function arguments generation for Python to call JIT functions. + + The DSL checks this protocol to determine if a class is capable of providing information + needed for generating JIT function arguments. 
+ + See breakdowns below for JitArgument protocol based JIT function calls. + + .. code-block:: python + + @jit + def foo(x: CustomData): + return x.int_value + 1 + + # Emit: `%c0 = arith.constant(1, i32)` + c1 = const(1, Int32) + # `c1` tracks `%c0` defined outside of function body of `foo` + # `%c0` can't be used directly in function body of `foo` + x = CustomData(c1, ...) + + When called like ``y = foo(x)``, the following steps occur: + + 1. JIT compiler generates MLIR function definition using ``__get_mlir_types__``: + + .. code-block:: mlir + + func @foo(%arg0: i32, ...) -> i32 { + ... + } + + 2. Function is traced in Python, wrapping MLIR values with ``__new_from_mlir_values__``: + + .. code-block:: python + + # Implementation of IR tracing + new_x = CustomData(ir.Value(%arg0), ...) + y = foo(new_x) + # `x.int_value` is %arg0 rather than `c1` defined outside + + 3. For Python runtime execution, JIT engine invokes compiled function using ``__c_pointers__``: + + .. code-block:: python + + jit_engine.invoke(foo, concat([x.__c_pointers__(), ...])) + """ + + def __c_pointers__(self): + """ + Generate a list of ctypes pointers for the current object. + + :return: List of ctypes pointers + :rtype: List[ctypes.c_void_p] + """ + raise NotImplementedError + + def __get_mlir_types__(self): + """ + Generate a list of MLIR types for the current object. + + :return: List of MLIR types + :rtype: List[ir.Type] + """ + raise NotImplementedError + + def __new_from_mlir_values__(self, values): + """ + Create a new object from MLIR values. 
+ + :param values: List of MLIR values + :type values: List[ir.Value] + :return: A new object that represents the given MLIR values + :rtype: Any + """ + raise NotImplementedError + + +def get_c_pointers(obj): + """ + Given the `obj`, recursively go through it to extract all contained C pointers + """ + if hasattr(obj, "__c_pointers__"): + return obj.__c_pointers__() + elif isinstance(obj, (tuple, list)): + return sum((get_c_pointers(x) for x in obj), []) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in get_c_pointers to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + return [] + + +def get_mlir_types(obj): + """ + Given the `obj`, recursively go through it to extract all contained MLIR types + """ + if hasattr(obj, "__get_mlir_types__"): + return obj.__get_mlir_types__() + elif hasattr(obj, "__extract_mlir_values__"): + return [v.type for v in obj.__extract_mlir_values__()] + elif isinstance(obj, ir.Value): + return [obj.type] + elif isinstance(obj, (tuple, list)): + return sum((get_mlir_types(x) for x in obj), []) + elif isinstance(obj, set): + raise DSLRuntimeError( + "Sets are not supported in get_mlir_types to ensure order preservation", + context="The DSL attempted to generate JIT function argument(s) for an argument of type set but failed.", + suggestion="Consider using a list or tuple instead", + ) + return [] + + +class DslType(type): + """Metaclass for all DSL types in the system. + + This metaclass provides type system infrastructure for DSL types, handling MLIR + type mappings and NumPy type conversions. 
+ + All data types in DSL must provide the following methods: + + :param mlir_type: Corresponding MLIR type for this DSL type + :type mlir_type: Any, optional + :param is_abstract: Whether this type is abstract, defaults to False + :type is_abstract: bool, optional + + **Required Methods** + + * ``__str__`` (classmethod): Return string representation of the type + * ``__c_pointers__`` (optional): Return list of ctypes pointers of data used to invoke JIT function + * ``__get_mlir_types__``: Return list of MLIR types of the MLIR values contained in the instance + * ``__extract_mlir_values__``: Return list of MLIR values contained in the instance + * ``__new_from_mlir_values__``: Return a new instance from list of MLIR values + + **Attributes** + + :ivar _ir: MLIR provider + :vartype _ir: Any + :ivar _T: MLIR Type system provider + :vartype _T: Any + + **Properties** + + :property mlir_type: Returns the corresponding MLIR type for this DSL type + :type mlir_type: Any + + **Examples** + + Define a custom data type: + + .. code-block:: python + + class CustomData(metaclass=DslType, ...): + def __init__(self, int_value, ...): + self.int_value = int_value + ... + + def __str__(cls): + return "CustomData[int, ...]" + + def __c_pointers__(self): + return [ctypes.pointer(ctypes.c_int32(self.int_value)), ...] + + def __get_mlir_types__(self): + return [_T.i32(), ...] + + def __extract_mlir_values__(self): + return [self.int_value, ...] + + def __new_from_mlir_values__(self, values): + return CustomData(values[0], ...) + + For JIT function calls, MLIR values are extracted with ``__extract_mlir_values__``: + + .. code-block:: python + + @jit + def caller(): + x = CustomData(1, ...) + return foo(x) + + .. code-block:: mlir + + func @caller() -> i32 { + %0 = func.call @foo(%arg0, ...) : (i32, ...) 
-> i32 + return %0 : i32 + } + """ + + _is_abstract: bool + + def __new__(cls, name, bases, attrs, is_abstract=False, **kwargs): + new_cls = super().__new__(cls, name, bases, attrs) + + new_cls._is_abstract = is_abstract + + return new_cls + + @property + def is_abstract(cls): + return cls._is_abstract + + +class NumericMeta(DslType): + """Metaclass for numeric types providing width and numpy dtype information. + + :param width: Bit width of the numeric type, defaults to 8 + :type width: int + :param np_dtype: Corresponding NumPy dtype + :type np_dtype: numpy.dtype, optional + :param mlir_type: Corresponding MLIR type + :type mlir_type: Any, optional + :param is_abstract: Whether the type is abstract, defaults to False + :type is_abstract: bool, optional + + :ivar width: Bit width of the numeric type + :type width: int + :ivar _np_dtype: Corresponding NumPy dtype + :type _np_dtype: Union[numpy.dtype, None] + + :property numpy_dtype: Returns the corresponding NumPy dtype + :rtype numpy_dtype: numpy.dtype + """ + + width: int + + # Placeholder type + _mlir_type = Any + _np_dtype: Union[np.dtype, None] + + def __new__( + cls, + name, + bases, + attrs, + width=8, + np_dtype=None, + mlir_type=None, + is_abstract=False, + **kwargs, + ): + def _extract_mlir_values(self): + return [self.ir_value()] + + def _new_from_mlir_values(self, values: list) -> "Numeric": + res_ty = type(self) + return res_ty(values[0]) + + new_attrs = { + "__extract_mlir_values__": _extract_mlir_values, + "__new_from_mlir_values__": _new_from_mlir_values, + } + new_cls = super().__new__( + cls, + name, + bases, + new_attrs | attrs, + is_abstract=is_abstract, + **kwargs, + ) + + if mlir_type is not None: + new_cls._mlir_type = staticmethod(mlir_type) + + new_cls.width = width + new_cls._np_dtype = np_dtype + return new_cls + + @property + def numpy_dtype(cls): + return cls._np_dtype + + @property + def is_integer(cls) -> bool: ... + + @property + def is_float(cls) -> bool: ... 
+ + def is_same_kind(cls, other: Type) -> bool: + return cls.is_integer == other.is_integer or cls.is_float == other.is_float + + @staticmethod + def from_python(value: Any) -> Type["Numeric"]: + """ + Deduce the DSL type from a Python value. + """ + if isinstance(value, int): + return Int32 + elif isinstance(value, float): + return Float32 + elif isinstance(value, bool): + return Boolean + raise DSLRuntimeError( + f"Could not deduce Type[Numeric] from python value: {value} :{type(value)}" + ) + + @property + def mlir_type(cls): + return cls._mlir_type() # type: ignore + + +Value = TypeVar("Value") + + +def cast(obj: Union[bool, int, float, Value], type_: Type["Numeric"]) -> "Numeric": + """Cast an object to the specified numeric type. + + :param obj: Object to be cast + :type obj: Union[bool, int, float, Value] + :param type_: Target numeric type + :type type_: Type[Numeric] + :raises TypeError: If casting to an abstract type or unsupported type conversion + :return: Object cast to the target numeric type + :rtype: Numeric + + Example:: + >>> x = cast(5, Int32) # Cast integer to Int32 + >>> y = cast(3.14, Float32) # Cast float to Float32 + """ + if type_.is_abstract: + if not isinstance(obj, type_): + raise TypeError( + f"can't cast {obj} to {type_}. Pass in concrete type instead, " + "e.g. Int32, Float32, etc." + ) + # If target_type is abstract, and value is instance of target_type, + # then we can return value as is + else: + # Implicit cast based on using annotation type + obj = type_(obj) + return obj + + +# Option 1: use ir.Value as base +# class IntegerMeta(DslType, type(ir.Value)): +class IntegerMeta(NumericMeta): + """Metaclass for integer types providing signedness information. 
+ + :param width: Bit width of the integer type, defaults to 32 + :type width: int + :param signed: Whether the integer type is signed, defaults to True + :type signed: bool + :param mlir_type: Corresponding MLIR type, defaults to None + :type mlir_type: Any, optional + + :ivar signed: Whether the integer type is signed + :vartype signed: bool + :ivar arith: Arithmetic operations interface + :vartype arith: Any + """ + + signed: bool + + def __new__( + cls, + name, + bases, + attrs, + width=32, + signed=True, + mlir_type=None, + is_abstract=False, + ): + if width == 1: + np_dtype = np.bool_ + elif width == 128: + np_dtype = None + elif signed: + np_dtype = getattr(np, f"int{width}") + else: + np_dtype = getattr(np, f"uint{width}") + + def _c_pointers(self): + if width == 1: + c_value = ctypes.c_bool(self.value) + elif signed: + c_value = getattr(ctypes, f"c_int{width}")(self.value) + else: + c_value = getattr(ctypes, f"c_uint{width}")(self.value) + + return [ctypes.cast(ctypes.pointer(c_value), ctypes.c_void_p)] + + new_attrs = { + "__c_pointers__": _c_pointers, + } + new_cls = super().__new__( + cls, name, bases, attrs | new_attrs, width, np_dtype, mlir_type, is_abstract + ) + new_cls.signed = signed + return new_cls + + def __str__(cls): + return f"{cls.__name__}" + + @property + def is_integer(cls) -> bool: + return True + + @property + def is_float(cls) -> bool: + return False + + @property + def zero(cls) -> int: + return 0 + + @property + def min(cls) -> int: + if cls.signed: + return -(2 ** (cls.width - 1)) + else: + return 0 + + @property + def max(cls) -> int: + if cls.signed: + return 2 ** (cls.width - 1) - 1 + else: + return 2**cls.width - 1 + + def recast_width(cls, width): + return eval(f"Int{width}") + + +class FloatMeta(NumericMeta): + """Metaclass for floating-point types. + + This metaclass provides type system infrastructure for floating-point types in the DSL, + handling MLIR type mappings and NumPy type conversions. 
+ + :param width: Bit width of the float type, defaults to 32 + :type width: int + :param mlir_type: Corresponding MLIR type, defaults to None + :type mlir_type: Any, optional + :param is_abstract: Whether this is an abstract base class, defaults to False + :type is_abstract: bool, optional + + :ivar _arith: Arithmetic operations interface + :vartype _arith: Any + """ + + _exponent_width: int + _mantissa_width: int + + def __new__(cls, name, bases, attrs, width=32, mlir_type=None, is_abstract=False): + np_dtype = getattr(np, name.lower(), None) + new_cls = super().__new__( + cls, name, bases, attrs, width, np_dtype, mlir_type, is_abstract + ) + # Extract exponent and mantissa bits from class name if it follows Float pattern + # For example: Float8E4M3 -> exponent_width=4, mantissa_width=3 + import re + + if not is_abstract: + match = re.match(r"Float(\d+)E(\d+)M(\d+)(?:.*)", name) + if match: + exp_bits = int(match.group(2)) + mant_bits = int(match.group(3)) + + # Store extracted values as class attributes + new_cls._exponent_width = exp_bits + new_cls._mantissa_width = mant_bits + # Don't have 1-to-1 mapping of narrow precision types like bfloat16, tfloat32, etc. 
+ return new_cls + + def __str__(cls): + return f"{cls.__name__}" + + @property + def is_integer(cls) -> bool: + return False + + @property + def is_float(cls) -> bool: + return True + + @property + def zero(cls) -> float: + return 0.0 + + @property + def inf(cls) -> float: + return float("inf") + + @property + def nan(cls) -> float: + return float("nan") + + @property + def exponent_width(cls) -> int: + return cls._exponent_width + + @property + def mantissa_width(cls) -> int: + return cls._mantissa_width + + def recast_width(cls, width): + return eval(f"Float{width}") + + +def _arith_signless_to_int(a, target_type): + # is_signed: sign of result type + if target_type.width > a.type.width: + # arith dialect consider `1` in `i1` as `-1`, treat it as unsigned for DSL + if target_type.signed and a.type.width > 1: + return arith.extsi(target_type.mlir_type, a) + else: + return arith.extui(target_type.mlir_type, a) + elif target_type.width < a.type.width: + return arith.trunci(target_type.mlir_type, a) + else: + return a + + +def _binary_op_type_promote(a, b, promote_bool: bool = False): + """Promote two numeric operands following type promotion rules. + + :param a: First numeric operand + :type a: Numeric + :param b: Second numeric operand + :type b: Numeric + :param promote_bool: Whether to promote boolean types to Int32 for arithmetic operations, defaults to False + :type promote_bool: bool, optional + :raises ValueError: If implicit float promotion is not supported between the given types + :return: Tuple containing promoted operands and their resulting type + :rtype: tuple[Numeric, Numeric, Type[Numeric]] + + Type promotion rules: + 1. If operands are same type and not bools needing promotion: + - No promotion needed, return original types + 2. If either operand is float: + a. If one is float and one is int: + - Convert int to the float type + b. 
If both are float: + - Promote to higher precision float if width >= 16 + - For same width, promote to more general type (Float32 over TFloat32) + - Otherwise raise ValueError for unsupported promotion + 3. Otherwise, both operands are integers. Integer promotion rules: + a. If promote_bool is True and either operand is bool: + - Promote bool to Int32 for arithmetic operations + + Exceptions for numpy dtype casting: + - array(dtype=np.bool_) + array(dtype=np.bool_) -> array(dtype=np.bool_) + + What is not supported: + - promotion with narrow precision float types which requires explicit cast by user + """ + a_type = a.dtype + b_type = b.dtype + + # Early return for same types (except when they're bools that need promotion) + if a_type == b_type and not (promote_bool and a_type.width == 1): + return a, b, a_type + + # Handle floating point promotions + if a_type.is_float or b_type.is_float: + # Get highest precision float type based on bitwidth + a_width = getattr(a_type, "width", 0) + b_width = getattr(b_type, "width", 0) + + # If one type is integer, convert it to the float type + if a_type.is_float and not b_type.is_float: + b_type = a_type.recast_width(max(a_width, b_width)) + elif b_type.is_float and not a_type.is_float: + a_type = b_type.recast_width(max(a_width, b_width)) + + # Both are float types - handle precision promotion + if a_width > b_width and a_width >= 16: + res_type = a_type + elif b_width > a_width and b_width >= 16: + res_type = b_type + elif a_width == b_width: + # Same bitwidth - handle special cases like TFloat32 -> Float32 and BFloat16 -> Float16 + if a_type is Float64 or b_type is Float64: + res_type = Float64 + elif a_type is Float32 or b_type is Float32: + res_type = Float32 + elif a_type is Float16 or b_type is Float16: + res_type = Float16 + else: + raise ValueError( + f"implicit float promotion of {a_type} or {b_type} is not supported, cast explicitly" + ) + else: + raise ValueError( + f"implicit float promotion of {a_type} or 
{b_type} is not supported, cast explicitly" + ) + + # Only convert if type is different + new_a = a.to(res_type) if a.dtype != res_type else a + new_b = b.to(res_type) if b.dtype != res_type else b + return new_a, new_b, res_type + + # Handle bool promotion for arithmetic operations + if promote_bool: + if a_type is Boolean and b_type is Boolean: + # Only promote to Int32 when both are bool + a = a.to(Int32) + b = b.to(Int32) + a_type = b_type = a.dtype + + # If both were bools, they're now same type (Int32) + if a_type == b_type: + return a, b, a_type + + # Same type, no promotion needed + if a_type == b_type: + return a, b, a_type + + a_signed = a_type.signed + b_signed = b_type.signed + a_width = a_type.width + b_width = b_type.width + + # Mixed signedness case + if a_signed != b_signed: + unsigned_type = a_type if not a_signed else b_type + signed_type = a_type if a_signed else b_type + unsigned_width = a_width if not a_signed else b_width + + if unsigned_width >= signed_type.width: + # Promote both to unsigned of larger width + res_type = unsigned_type + else: + # Promote both to signed of larger width + res_type = signed_type + + new_a = a.to(res_type) if a.dtype != res_type else a + new_b = b.to(res_type) if b.dtype != res_type else b + return new_a, new_b, res_type + + # Same signedness, different width - promote to larger width + if a_width >= b_width: + return a, b.to(a.dtype), a.dtype + else: + return a.to(b.dtype), b, b.dtype + + +def _binary_op(op, promote_operand=True, promote_bool=False, flip=False): + """Wrapper for binary operations on Numeric types. + + This wrapper handles type promotion, operation execution, and result type determination + for binary operations between Numeric types. 
+ + :param op: The binary operation to perform (e.g., operator.add, operator.sub) + :type op: callable + :param emitter: Function that emits the MLIR operation for dynamic values + :type emitter: callable + :param promote_operand: Whether to promote operands to the same type, defaults to True + :type promote_operand: bool, optional + :param promote_bool: Whether to promote boolean results to Boolean type, defaults to False + :type promote_bool: bool, optional + :param flip: Whether to flip the operands when calling the operation, defaults to False + :type flip: bool, optional + + :raises TypeError: When an unsupported operation is attempted on specific numeric types + + .. note:: + Not all operations are supported for all numeric types. In particular: + + - Subtraction is not fully supported for Integer types + - Multiplication, floor division, and modulo operations may have limited support + - Division (truediv) with integer types is not fully supported and converts to Float32 + """ + + def wrapper(lhs, rhs, *, loc=None, ip=None): + orig_lhs_type = type(lhs) + orig_rhs_type = type(rhs) + + # When called directly with self and other + ty = type(lhs) + # Canonicalize to Numeric type for promotion + if not isinstance(rhs, Numeric): + if not isinstance(rhs, (ArithValue, int, float, bool)): + # This allows rhs class to implement __rmul__ + return NotImplemented + + if isinstance(rhs, ArithValue): + if isinstance(rhs.type, ir.VectorType): + return NotImplemented + + rhs = as_numeric(rhs) + + # default result type to left-hand-side + res_type = ty + + if promote_operand: + lhs, rhs, res_type = _binary_op_type_promote(lhs, rhs, promote_bool) + else: + rhs = ty(rhs) + + if op in ( + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.eq, + operator.ne, + ): + res_type = Boolean + elif op == operator.truediv and isinstance(lhs, Integer): + res_type = Float32 + elif promote_bool and orig_lhs_type == Boolean and orig_rhs_type == Boolean: + res_type = Boolean 
+ + if isinstance(lhs.value, ArithValue) and isinstance(lhs, Integer): + lhs_val = lhs.value.with_signedness(lhs.signed) + else: + lhs_val = lhs.value + + if isinstance(rhs.value, ArithValue) and isinstance(rhs, Integer): + rhs_val = rhs.value.with_signedness(rhs.signed) + else: + rhs_val = rhs.value + + if flip: + lhs_val, rhs_val = rhs_val, lhs_val + + # Check if the operation is supported by the operands + res_val = op(lhs_val, rhs_val) + return res_type(res_val, loc=loc, ip=ip) + + return wrapper + + +class Numeric(metaclass=NumericMeta, is_abstract=True): + """Base class for all numeric types in the DSL. + + This class provides the foundation for both Integer and Float types, + implementing basic arithmetic operations. + + :param value: The value to store in the numeric type + :type value: Union[bool, int, float, Value] + + :ivar value: The stored numeric value + :vartype value: Union[bool, int, float, Value] + """ + + def __init__(self, value: Union[bool, int, float, Value], *, loc=None, ip=None): + self.value = value + + def __str__(self) -> str: + # Use member's pretty-str method if member object has method. + # This can be extended in future to have better support for IDE, jupyter notebook, etc. + pretty_str = getattr(self.value, "pretty_str", None) + if pretty_str is not None: + return pretty_str() + else: + return "?" + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({repr(self.value)})" + + def __hash__(self): + return hash(type(self).__class__) ^ hash(self.value) + + @property + def dtype(self) -> Type["Numeric"]: + return type(self) + + @overload + def to(self, dtype: Type["Numeric"], *, loc=None, ip=None) -> "Numeric": ... + + @overload + def to(self, dtype: Type[int], *, loc=None, ip=None) -> int: ... + + @overload + def to(self, dtype: Type[float], *, loc=None, ip=None) -> float: ... + + @overload + def to(self, dtype: Type[bool], *, loc=None, ip=None) -> bool: ... 
+ + @overload + def to(self, dtype: Type[ir.Value], *, loc=None, ip=None) -> ir.Value: ... + + def to(self, dtype: Type, *, loc=None, ip=None): + """Convert this numeric value to another numeric type. + + If the target type is the same as the current type, returns self. + Otherwise, creates a new instance of the target type with the same value. + + :param dtype: The target numeric type to convert to + :type dtype: Union[Type["Numeric"], Type[int], Type[float], Type[bool]] + :return: A new instance of the target type, or self if types match + :rtype: Numeric + :raises TypeError: If trying to convert an MLIR value to a static Python type + :raises TypeError: If trying to convert to unsupported float types like Float8E4M3, + Float8E4M3B11FNUZ, Float4E2M1FN, Float6E3M2FN, or Float6E2M3FN + + .. note:: + + Unsupported destination float types: + - Float8E4M3 + - Float8E4M3B11FNUZ + - Float4E2M1FN + - Float6E3M2FN + - Float6E2M3FN + + Example:: + + .. code-block:: python + + # Convert between DSL numeric types + x = Int32(5) + y = x.to(Float32) # Converts to Float32(5.0) + + # Convert to Python primitive types + # They are considered as static values at JIT time + z = x.to(int) # Returns Python int 5 + w = y.to(float) # Returns Python float 5.0 + + # This will raise a ValueError + mlir_val = arith.constant(T.i32(), 42) + num = Int32(mlir_val) + num.to(int) # ValueError: unable to convert MLIR value to static type: + """ + if dtype in _unsupported_dst_float_types: + raise TypeError(f"Unsupported destination float type: {dtype}") + + if isinstance(dtype, type(self)): + return self + elif isinstance(dtype, NumericMeta): + return dtype(self) + elif dtype is ir.Value: + if isinstance(self.value, (int, float, bool)): + res = arith_helper.const( + self.value, self.dtype.mlir_type, loc=loc, ip=ip + ) + elif isinstance(self.value, ir.Value): + res = self.value + else: + raise ValueError( + f"cannot convert {type(self)} to {dtype}, " + f"self.value is {self.value.type}" + ) + + if 
not isinstance(res, ArithValue): + raise ValueError(f"Expected ArithValue, got {type(res)} as {res.type}") + + return res.with_signedness(getattr(type(self), "signed", None)) + elif dtype in (int, float, bool): + if isinstance(self.value, ir.Value): + raise ValueError( + f"unable to convert {self.value} to static type: {dtype}" + ) + return dtype(self.value) + else: + raise ValueError(f"unable to convert {type(self)} to {dtype}") + + def ir_value(self, *, loc=None, ip=None) -> ir.Value: + return self.to(ir.Value, loc=loc, ip=ip) + + @property + def zero(self) -> "Numeric": ... + + def __dsl_not__(self, *, loc=None, ip=None): + """DSL implementation of Python's `not` operator. + + Returns True if the value is equal to zero, False otherwise. + This matches Python's behavior where any non-zero number is considered True. + + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical not operation + :rtype: Boolean + """ + ty = type(self) + zero_val = arith.constant(ty.mlir_type, ty.zero) + return self.__eq__(ty(zero_val), loc=loc, ip=ip) + + def __dsl_and__(self, other, *, loc=None, ip=None): + """DSL implementation of Python's `and` operator. + + Returns the second operand if the first is truthy, otherwise returns the first operand. + A numeric value is considered truthy if it is non-zero. + + :param other: The right-hand operand + :type other: Numeric + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical and operation + :rtype: Boolean + + Example:: + + 5 and 3 -> 3 + 0 and 3 -> 0 + 3 and 0 and ... 
-> 0 + """ + is_true = self.__dsl_bool__(loc=loc, ip=ip) + + def and_op(lhs, rhs): + if isinstance(lhs, (int, float, bool)): + if isinstance(rhs, (int, float, bool)): + return lhs and rhs + else: + lhs = arith.constant(rhs.type, lhs) + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + else: + if isinstance(rhs, (int, float, bool)): + rhs = arith.constant(lhs.type, rhs) + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + else: + return arith.select(is_true.ir_value(), rhs, lhs, loc=loc, ip=ip) + + return _binary_op(and_op, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __dsl_or__(self, other, *, loc=None, ip=None): + """DSL implementation of Python's `or` operator. + + Returns the first operand if it is truthy, otherwise returns the second operand. + A numeric value is considered truthy if it is non-zero. + + :param other: The right-hand operand + :type other: Numeric + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: The result of the logical or operation + :rtype: Boolean + + Example:: + + 5 or 3 -> 5 + 0 or 3 -> 3 + 3 or 0 -> 3 + """ + is_true = self.__dsl_bool__(loc=loc, ip=ip) + + def or_op(lhs, rhs): + if isinstance(lhs, (int, float, bool)): + if isinstance(rhs, (int, float, bool)): + return lhs or rhs + else: + lhs = arith.constant(rhs.type, lhs) + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + else: + if isinstance(rhs, (int, float, bool)): + rhs = arith.constant(lhs.type, rhs) + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + else: + return arith.select(is_true.ir_value(), lhs, rhs, loc=loc, ip=ip) + + return _binary_op(or_op, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __dsl_bool__(self, *, loc=None, ip=None) -> "Boolean": + """DSL implementation of Python's __bool__ method. 
+ + Returns a Boolean indicating whether this value is considered truthy. + For numeric types, returns True if the value is non-zero. + + :param loc: The source location information, defaults to None + :type loc: Optional[Location] + :param ip: The insertion point for the operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: True if this value is truthy (non-zero), False otherwise + :rtype: Boolean + """ + zero = type(self).zero + return self.__ne__(zero, loc=loc, ip=ip) + + def __bool__(self): + if isinstance(self.value, (int, float, bool)): + return bool(self.value) + else: + raise DSLRuntimeError( + f"Unable to convert dynamic `{type(self).__name__}` value to bool at compile time.", + suggestion=[ + "Decorate the parent function with `jit` decorator and with `preprocess` enabled.", + "Ensure not using patterns that DSL does not support.", + "Otherwise, please file a bug report.", + ], + ) + + def __neg__(self, *, loc=None, ip=None): + if isinstance(self, (bool, int, float)): + return type(self)(-self.value) # type: ignore + else: + return type(self)(-self.value, loc=loc, ip=ip) # type: ignore + + @staticmethod + def _from_python_value(value): + if isinstance(value, Numeric): + return value + + if isinstance(value, bool): + res_type = Boolean + elif isinstance(value, int): + res_type = Int32 + elif isinstance(value, float): + res_type = Float32 + elif isinstance(value, ArithValue): + res_type = Numeric.from_mlir_type(value.type) + else: + raise ValueError( + f"unable to convert {value} in type {type(value)} to Numeric" + ) + return res_type(value) + + def __add__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.add, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __sub__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.sub, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __mul__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mul, 
promote_bool=True)(self, other, loc=loc, ip=ip) + + def __floordiv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.floordiv, promote_bool=True)( + self, other, loc=loc, ip=ip + ) + + def __truediv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.truediv, promote_bool=True)( + self, other, loc=loc, ip=ip + ) + + def __mod__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mod, promote_bool=True)(self, other, loc=loc, ip=ip) + + def __radd__(self, other, *, loc=None, ip=None) -> "Numeric": + return self.__add__(other, loc=loc, ip=ip) + + def __rsub__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.sub, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rmul__(self, other, *, loc=None, ip=None) -> "Numeric": + return self.__mul__(other, loc=loc, ip=ip) + + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.floordiv, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rtruediv__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.truediv, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __rmod__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.mod, promote_bool=True, flip=True)( + self, other, loc=loc, ip=ip + ) + + def __eq__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.eq)(self, other, loc=loc, ip=ip) # type: ignore + + def __ne__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.ne)(self, other, loc=loc, ip=ip) # type: ignore + + def __lt__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.lt)(self, other, loc=loc, ip=ip) # type: ignore + + def __le__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.le)(self, other, loc=loc, ip=ip) # type: ignore + + def 
__gt__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.gt)(self, other, loc=loc, ip=ip) # type: ignore + + def __ge__(self, other, *, loc=None, ip=None) -> "Boolean": + return _binary_op(operator.ge)(self, other, loc=loc, ip=ip) # type: ignore + + def __pow__(self, other, *, loc=None, ip=None) -> "Numeric": + return _binary_op(operator.pow)(self, other, loc=loc, ip=ip) # type: ignore + + def __c_pointers__(self): + raise ValueError( + f"only support built-in types: bool, (u)int{8, 16, 32, 64}, float{32, 64}, but got {type(self)}" + ) + + def __get_mlir_types__(self): + return [type(self).mlir_type] + + @staticmethod + def from_mlir_type(mlir_type): + type_map = { + T.bool(): Boolean, + T.f64(): Float64, + T.f32(): Float32, + T.tf32(): TFloat32, + T.f16(): Float16, + T.bf16(): BFloat16, + T.i(128): Int128, + T.i64(): Int64, + T.i32(): Int32, + T.i16(): Int16, + T.i8(): Int8, + T.si(128): Int128, + T.si64(): Int64, + T.si32(): Int32, + T.si16(): Int16, + T.si8(): Int8, + T.ui(128): Uint128, + T.ui64(): Uint64, + T.ui32(): Uint32, + T.ui16(): Uint16, + T.ui8(): Uint8, + T.f8E5M2(): Float8E5M2, + T.f8E4M3(): Float8E4M3, + T.f8E4M3FN(): Float8E4M3FN, + T.f8E4M3B11FNUZ(): Float8E4M3B11FNUZ, + T.f4E2M1FN(): Float4E2M1FN, + T.f6E2M3FN(): Float6E2M3FN, + T.f6E3M2FN(): Float6E3M2FN, + T.f8E8M0FNU(): Float8E8M0FNU, + } + if mlir_type not in type_map: + raise DSLRuntimeError(f"Unsupported DSL type: {mlir_type}") + return type_map[mlir_type] + + +def as_numeric(obj: Union[bool, int, float, ir.Value, Numeric]) -> Numeric: + """Convert a Python primitive value to a Numeric type. + + :param obj: Python primitive value to convert + :type obj: Union[bool, int, float] + :return: The converted Numeric object + :rtype: Numeric + + Example:: + + .. 
code-block:: python + + x = as_numeric(5) # Converts to Int32 + y = as_numeric(3.14) # Converts to Float32 + z = as_numeric(True) # Converts to Boolean + """ + if isinstance(obj, Numeric): + return obj + return Numeric._from_python_value(obj) + + +class Integer(Numeric, metaclass=IntegerMeta, mlir_type=T.i32, is_abstract=True): + """A class representing integer values with specific width and signedness. + + This class provides functionality to create and manipulate integer values with + configurable width and signedness. It supports conversion from various input types + including Python scalars, MLIR Values, and other numeric types. + + :param x: The input value to convert to this integer type + :type x: Union[bool, int, float, ir.Value, Integer, Float] + + :return: A new Integer instance with the converted value + :rtype: Integer + + :raises AssertionError: If the type's numpy_dtype is None + :raises NotImplementedError: If converting between different Integer types + :raises ValueError: If the input type is not supported for conversion + :raises OverflowError: If converting float infinity to integer + + Type conversion behavior: + + * Python scalars (bool, int, float): + * Converted through numpy dtype casting + * NaN and infinity values are rejected + * Example: Int8(256) -> -256 (overflow behavior) + + * MLIR Value with IntegerType: + * Width differences handled by signless to signed/unsigned conversion + * Example: i8 -> i8/ui8 depending on target type + + * MLIR Value with FloatType: + * Uses MLIR float-to-int conversion + * NaN and infinity values is undefined behavior + * Example: f32 -> i32/ui32 depending on target type + + * Integer: + * Uses MLIR float-to-int conversion or numpy dtype casting + * Example: Int32(Int32(5)) => 5 + + * Float: + * Uses MLIR float-to-int conversion + * Example: Int32(Float(5.7)) -> 5 + + Example usage: + + .. 
code-block:: python + + x = Int32(5) # From integer + y = Int32(True) # From boolean + z = Int32(3.7) # From float (truncates) + w = Int32(x) # From same Integer type + c5 = arith.constant(5, T.i32()) + a = Int32(c5) # Treat c5 as int32 bitwise + """ + + def __init__(self, x, *, loc=None, ip=None): + ty = type(self) + + if isinstance(x, (bool, int, float)): + # Add check for NaN before numpy conversion + if isinstance(x, float): + if np.isnan(x): + raise ValueError("Cannot convert float NaN to integer") + elif np.isinf(x): + raise OverflowError("Cannot convert float infinity to integer") + + np_dtype = ty.numpy_dtype + assert np_dtype is not None, f"expects numpy.dtype, but got {np_dtype}" + x_val = int(np.array(x).astype(np_dtype)) + elif type(x) == ty: + x_val = x.value + elif isinstance(x, ir.Value): # type: ignore + x_val = x + if isinstance(x.type, ir.IntegerType): # type: ignore + if x.type.width != ty.width: + # signless -> (u)int + x_val = _arith_signless_to_int(x, ty) + elif isinstance(x.type, ir.FloatType): # type: ignore + # float -> (u)int + x_val = arith_helper.fptoi(x, ty.signed, ty.mlir_type, loc=loc, ip=ip) + elif isinstance(x, Integer): + if isinstance(x.value, ir.Value): + x_val = arith_helper.int_to_int(x.ir_value(), ty) + else: + # For non-MLIR values, use numpy casting + src_val = np.array(x.value, dtype=type(x).numpy_dtype) + x_val = int(src_val.astype(ty.numpy_dtype)) + elif isinstance(x, Float): + # float -> int is handled by Integer.__init__ recursively + Integer.__init__(self, x.value) + return + else: + raise DSLRuntimeError(f"{x} to integer conversion is not supported") + + super().__init__(x_val) + + def __invert__(self, *, loc=None, ip=None): + res_type = type(self) + # Create a constant of -1 (all bits set to 1) of the same type as value + all_ones = arith.constant(res_type.mlir_type, -1) + # XOR with -1 gives us bitwise NOT + return res_type(arith.xori(self.ir_value(), all_ones, loc=loc, ip=ip)) + + def __lshift__(self, other, *, 
loc=None, ip=None): + return _binary_op(operator.lshift)(self, other, loc=loc, ip=ip) + + def __rlshift__(self, other, *, loc=None, ip=None): + other_ = as_numeric(other) + if not isinstance(other_, Integer): + raise ValueError(f"Cannot left shift {other_} with {self}") + return other_.__lshift__(self, loc=loc, ip=ip) + + def __rshift__(self, other, *, loc=None, ip=None): + return _binary_op(operator.rshift)(self, other, loc=loc, ip=ip) + + def __rrshift__(self, other, *, loc=None, ip=None): + other_ = as_numeric(other) + if not isinstance(other_, Integer): + raise ValueError(f"Cannot right shift {other_} with {self}") + return other_.__rshift__(self, loc=loc, ip=ip) + + def __and__(self, other, *, loc=None, ip=None): + return _binary_op(operator.and_)(self, other, loc=loc, ip=ip) + + def __rand__(self, other, *, loc=None, ip=None): + return self.__and__(other, loc=loc, ip=ip) + + def __or__(self, other, *, loc=None, ip=None): + return _binary_op(operator.or_)(self, other, loc=loc, ip=ip) + + def __ror__(self, other, *, loc=None, ip=None): + return self.__or__(other, loc=loc, ip=ip) + + def __xor__(self, other, *, loc=None, ip=None): + return _binary_op(operator.xor)(self, other, loc=loc, ip=ip) + + def __rxor__(self, other, *, loc=None, ip=None): + return self.__xor__(other, loc=loc, ip=ip) + + +class Float(Numeric, metaclass=FloatMeta, mlir_type=T.f32, is_abstract=True): + """A class representing floating-point values. + + :param x: The input value to convert to this float type. + :type x: Union[bool, int, float, ir.Value, Integer, Float] + + Type conversion behavior: + + 1. Python scalars (bool, int, float): + - Converted through numpy dtype casting + - Example: Float32(1.7) -> 1.7 + + 2. MLIR Value with FloatType: + - If width differs: converts between float types + - Example: f16 -> f32 + + 3. MLIR Value with IntegerType: + - Not supported, raises ValueError + + 4. 
Integer: + - Converts using MLIR int-to-float operation + - Example: Float32(Int32(5)) -> 5.0 + + 5. Float: + - Direct conversion between float types + - Example: Float32(Float32(1.5)) -> 1.5 + + .. note:: + The following narrow precision types are only supported in device code: + + 8-bit float types: + - Float8E5M2 + - Float8E4M3 + - Float8E4M3FN + - Float8E8M0FNU + - Float8E4M3B11FNUZ + + 6-bit float types: + - Float6E3M2FN + - Float6E2M3FN + + 4-bit float types: + - Float4E2M1FN + + Narrow precision types and special floating-point formats support matrix on device: + + :raises AssertionError: If the type's numpy_dtype is None + :raises ValueError: If conversion from the input type is not supported + """ + + def __init__(self, x, *, loc=None, ip=None): + ty = type(self) + + if isinstance(x, (bool, int, float)): # type: ignore + # Why we need to convert x to with numpy? + # np_dtype = ty.numpy_dtype + # assert np_dtype is not None, f"expects numpy.dtype, but got {np_dtype}" + # x = float(np.array(x).astype(np_dtype)) + super().__init__(float(x)) + elif isinstance(x, ir.Value): # type: ignore + if isinstance(x.type, ir.IntegerType): # type: ignore + raise DSLRuntimeError("signless to float conversion is not implemented") + elif isinstance(x.type, ir.FloatType): # type: ignore + if x.type != ty.mlir_type: + x = arith_helper.cvtf(x, ty.mlir_type, loc=loc, ip=ip) + super().__init__(x) + elif isinstance(x, Integer): + if isinstance(x.value, ir.Value): # type: ignore + x = arith_helper.itofp( + x.value, type(x).signed, ty.mlir_type, loc=loc, ip=ip + ) + else: + x = float(x.value) + super().__init__(x) + elif isinstance(x, Float): + Float.__init__(self, x.value) + else: + raise DSLRuntimeError(f"{x} to Float conversion is not supported") + + +class Boolean(Integer, metaclass=IntegerMeta, width=1, signed=True, mlir_type=T.bool): + """Boolean type representation in the DSL. + + This class represents boolean values in the DSL, with a width of 1 bit. 
+ It supports conversion from various types to boolean values. + + :param a: Value to convert to Boolean + :type a: Union[bool, int, float, "Value", Numeric] + :param loc: Source location information, defaults to None + :type loc: Optional[Location], optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: Optional[InsertionPoint], optional + :raises DSLRuntimeError: If the input value cannot be converted to Boolean + + Conversion rules: + + 1. Python bool/int/float: + - Converted using Python's bool() function + - Example: Boolean(1) -> True, Boolean(0) -> False + + 2. Boolean: + - Direct value assignment + - Example: Boolean(Boolean(True)) -> True + + 3. Numeric: + - Uses the __dsl_bool__ method of the Numeric type + + 4. MLIR Value with IntegerType: + - If width is 1: Direct assignment + - Otherwise: Compares with 0 using arith.cmpi + + 5. MLIR Value with FloatType: + - Compares with 0.0 using arith.cmpf + - Uses unordered comparison to handle NaN values + """ + + def __init__( + self, a: Union[bool, int, float, ir.Value, Numeric], *, loc=None, ip=None + ): + value = None + if isinstance(a, (bool, int, float)): + value = bool(a) + elif isinstance(a, Boolean): + value = a.value + elif isinstance(a, Numeric): + value = a.__dsl_bool__(loc=loc, ip=ip) + elif isinstance(a, ArithValue): + if a.type == T.bool(): + value = a + else: + value = a != arith_helper.const(0, a.type) + + if value is None: + raise DSLRuntimeError(f"Cannot convert {a} to Boolean") + super().__init__(value, loc=loc, ip=ip) + + def __neg__(self, *, loc=None, ip=None): + """Negation operator is not supported for boolean type. 
+ + :param loc: Source location information, defaults to None + :type loc: Optional[Location], optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: Optional[InsertionPoint], optional + :raises TypeError: Always raises this error as negation is not supported + """ + raise TypeError("Negation, the operator `-` is not supported for boolean type") + + +class Int8(Integer, metaclass=IntegerMeta, width=8, signed=True, mlir_type=T.i8): ... + + +class Int16(Integer, metaclass=IntegerMeta, width=16, signed=True, mlir_type=T.i16): ... + + +class Int32(Integer, metaclass=IntegerMeta, width=32, signed=True, mlir_type=T.i32): ... + + +class Int64(Integer, metaclass=IntegerMeta, width=64, signed=True, mlir_type=T.i64): ... + + +class Int128( + Integer, metaclass=IntegerMeta, width=128, signed=True, mlir_type=lambda: T.i(128) +): ... + + +class Uint8(Integer, metaclass=IntegerMeta, width=8, signed=False, mlir_type=T.i8): ... + + +class Uint16( + Integer, metaclass=IntegerMeta, width=16, signed=False, mlir_type=T.i16 +): ... + + +class Uint32( + Integer, metaclass=IntegerMeta, width=32, signed=False, mlir_type=T.i32 +): ... + + +class Uint64( + Integer, metaclass=IntegerMeta, width=64, signed=False, mlir_type=T.i64 +): ... + + +class Uint128( + Integer, metaclass=IntegerMeta, width=128, signed=False, mlir_type=lambda: T.i(128) +): ... 
class Float64(Float, metaclass=FloatMeta, width=64, mlir_type=T.f64):
    """64-bit IEEE-754 floating point DSL type."""

    def __c_pointers__(self):
        # Only a host-side Python float can be marshalled through ctypes.
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        boxed = ctypes.c_double(self.value)
        return [ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)]


class Float32(Float, metaclass=FloatMeta, width=32, mlir_type=T.f32):
    """32-bit IEEE-754 floating point DSL type."""

    @staticmethod
    def _get_c_pointer(value: float):
        # Box the value as a C float and hand back an opaque pointer to it.
        boxed = ctypes.c_float(value)
        return ctypes.cast(ctypes.pointer(boxed), ctypes.c_void_p)

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        return [Float32._get_c_pointer(self.value)]


class TFloat32(Float, metaclass=FloatMeta, width=32, mlir_type=T.tf32):
    """TensorFloat-32 DSL type (32-bit storage; marshalled like Float32 on host)."""

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        # TF32 occupies 32 bits on the host side, so Float32 marshalling applies.
        return [Float32._get_c_pointer(self.value)]


class Float16(Float, metaclass=FloatMeta, width=16, mlir_type=T.f16):
    """16-bit IEEE-754 (binary16) floating point DSL type."""

    @staticmethod
    def _get_c_pointer(value: float):
        # Round to half precision via numpy, then reinterpret the result as
        # its raw 16-bit pattern so the exact bits cross the FFI boundary.
        raw_bits = np.float16(value).view(np.uint16)
        # A 16-bit ctypes integer carries the bit pattern (sign bit included;
        # ctypes masks the value, so patterns >= 0x8000 are preserved).
        holder = ctypes.c_short(raw_bits)
        return ctypes.cast(ctypes.pointer(holder), ctypes.c_void_p)

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        return [Float16._get_c_pointer(self.value)]


class BFloat16(Float, metaclass=FloatMeta, width=16, mlir_type=T.bf16):
    """bfloat16 DSL type. Host-side ctypes marshalling is not implemented;
    it defers to the base class, which rejects the request."""

    def __c_pointers__(self):
        if not isinstance(self.value, float):
            raise ValueError("only float is supported")

        # Delegates to the Numeric base implementation (raises ValueError).
        return Float.__c_pointers__(self)


class Float8E5M2(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E5M2): ...


class Float8E4M3FN(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3FN): ...
class Float8E4M3B11FNUZ(
    Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3B11FNUZ
): ...


# Added missing float types
class Float8E4M3(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E4M3): ...


class Float8E8M0FNU(Float, metaclass=FloatMeta, width=8, mlir_type=T.f8E8M0FNU): ...


class Float4E2M1FN(Float, metaclass=FloatMeta, width=4, mlir_type=T.f4E2M1FN): ...


class Float6E3M2FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E3M2FN): ...


class Float6E2M3FN(Float, metaclass=FloatMeta, width=6, mlir_type=T.f6E2M3FN): ...


# Float types that are not supported as conversion destinations.
_unsupported_dst_float_types = [
    Float8E4M3,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E3M2FN,
    Float6E2M3FN,
]


ALL_DTYPES = {
    Int8,
    Int16,
    Int32,
    Int64,
    Int128,
    Uint8,
    Uint16,
    Uint32,
    Uint64,
    Uint128,
    BFloat16,
    Float16,
    Float32,
    TFloat32,
    Float64,
    Float8E5M2,
    Float8E4M3,
    Float8E4M3FN,
    Float8E8M0FNU,
    Float8E4M3B11FNUZ,
    Float4E2M1FN,
    Float6E2M3FN,
    Float6E3M2FN,
}
# Maps the class name (e.g. "Float32") to the Numeric subclass itself.
__STR_TO_DTYPE__ = {dt.__name__: dt for dt in ALL_DTYPES}


def dtype(dtype_) -> Type[Numeric]:
    """Interpret ``dtype_`` as a DSL data type.

    :param dtype_: The name of a DSL data type, e.g. ``"Float32"``.
    :raises TypeError: If ``dtype_`` is not the name of a known data type.
    :return: The matching ``Numeric`` subclass.
    """
    t = None
    if const_expr(isinstance(dtype_, str) and dtype_ in __STR_TO_DTYPE__):
        t = __STR_TO_DTYPE__[dtype_]
    else:
        raise TypeError(f"can't interpret {dtype_} as data type")

    return t


##############################################################
# Tensor
##############################################################


class TensorMeta(DslType):
    """
    Examples:
    >>> Tensor[Int32, (3,)]
    >>> Tensor[Float32, (3, 4)]
    >>> T = TypeVar("T")
    >>> Tensor[T, (3, 4, 5)]
    """

    # Class-level defaults; overwritten per-specialization in __new__.
    _element_type = Any
    _shape = Any

    def __new__(cls, name, bases, attrs, element_type=Any, shape=Any):
        new_cls = super().__new__(cls, name, bases, attrs)
        new_cls._element_type = element_type
        new_cls._shape = shape
        return new_cls


# Generic type
TY = TypeVar("TY")


class Constexpr(Generic[TY]):
    """Value is passed and computed by python interpreter"""

    pass


class align:
    """Power-of-two alignment marker, used as the second ``Pointer`` type parameter."""

    def __init__(self, value: int):
        # value & (value - 1) clears the lowest set bit: zero iff power of two.
        if value <= 0 or (value & (value - 1)) != 0:
            raise DSLRuntimeError("expects align be power of 2 as positive value")
        self._value = value

    def __str__(self):
        return f"align({self._value})"


class PointerMeta(DslType):
    def __new__(cls, name, bases, attrs, value_type=Int32, align_=align(1)):
        new_cls = super().__new__(
            cls,
            name,
            bases,
            attrs,
            mlir_type=lambda: getattr(ir, "UnrankedMemRefType").get(
                value_type.mlir_type, getattr(ir, "Attribute").parse("0")
            ),
        )
        new_cls._value_type = value_type
        new_cls._align = align_
        return new_cls

    def __eq__(cls, other):
        if not isinstance(other, PointerMeta):
            return False
        return (
            cls._value_type == other._value_type
            and cls._align._value == other._align._value
        )  # Compare alignment values

    def __hash__(cls):
        return hash((cls._value_type, cls._align._value))  # Hash alignment value

    def __getitem__(cls, params) -> Type["Pointer"]:
        value_type, align_ = params

        if not isinstance(align_, align):
            raise DSLRuntimeError(f"expects align but got {align_}")

        # Create new class with proper name and parameters
        new_cls = type(
            f"Pointer[{value_type.__name__}, {align_}]",
            (Pointer,),
            {},
            value_type=value_type,
            align_=align_,  # Pass alignment to __new__
        )
        return new_cls

    def __str__(cls):
        return f"ptr<{cls._value_type}, {cls._align}>"


class Pointer(metaclass=PointerMeta):
    """
    A pointer to a memory location.

    Examples:

        def foo(a: Pointer[Int32, align(8)]):
            ...
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        return f"{self.value} : {type(self)}"


class IRConst(Generic[TY]):
    """Value is passed as MLIR constant value for (arith.constant)."""

    def __init__(self, ty: TY):
        self.ty = ty


class IRValue(Generic[TY]):
    """Value is passed as MLIR dynamic value."""

    def __init__(self, ty: TY):
        self.ty = ty


class IRVariadic:
    """
    A helper class to pass a variadic number of arguments to a function.
    """

    def __init__(self, operands):
        """
        Create a list of variadic operands. `operands` must be SSA values.
        """
        self.operands = operands

    def block_arg_types(self):
        """
        Return the list of block args types.
        """
        return [operand.type for operand in self.operands]

    def set_func_args(self, block_args):
        """
        This function is called after entering a function. `block_args` are the
        block arguments that correspond to the passed operands. Derived classes
        may implement this function to provide convenience getters for block
        arguments.
        """
        pass

    def __len__(self):
        """
        Return the length of variadic operands.
        """
        return len(self.operands)


class FuncArgWithAttr(IRValue):
    """
    This derived class is specifically for func op arg with attr
    """

    def __init__(self, ty, attr_name, attr_ty, attr_value=None):
        super().__init__(ty)
        assert attr_name is not None and (
            attr_ty is not None or attr_value is not None
        ), "Invalid attr_name and/or attr_ty and/or attr_value for FuncArgWithAttr"
        self.attr_name = attr_name
        self.attr_ty = attr_ty
        self.attr_value = attr_value


def implicitDowncastNumericType(value):
    """Unwrap a ``Numeric`` into its underlying MLIR value; pass anything else through."""
    if isinstance(value, Numeric):
        return value.ir_value()
    return value


# NOTE(review): the original list contained "Float8E4M3" twice; deduplicated.
__all__ = [
    "DslType",
    "Numeric",
    "NumericMeta",
    "IntegerMeta",
    "FloatMeta",
    "Boolean",
    "Integer",
    "Int16",
    "Int32",
    "Int64",
    "Int128",
    "Int8",
    "Uint8",
    "Uint16",
    "Uint32",
    "Uint64",
    "Uint128",
    "Float",
    "Float16",
    "BFloat16",
    "TFloat32",
    "Float32",
    "Float64",
    "Float8E5M2",
    "Float8E4M3",
    "Float8E4M3FN",
    "Float8E4M3B11FNUZ",
    "Float8E8M0FNU",
    "Float4E2M1FN",
    "Float6E2M3FN",
    "Float6E3M2FN",
    "as_numeric",
    "align",
    "Pointer",
    "dtype",
    "Constexpr",
    "IRConst",
    "IRValue",
    "IRVariadic",
    "implicitDowncastNumericType",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/__init__.py (new file, index c4bfb2b7)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from . import stacktrace
from . import logger
from . import timer

__all__ = [
    "logger",
    "timer",
    "stacktrace",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/logger.py (new file, index b239f346)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
This module provides logging helper functions
"""

import logging

# Module-level logger instance; replaced by setup_log() and read via log().
logger = None


def log():
    """Return the module-level logger configured by :func:`setup_log`."""
    return logger


def setup_log(
    name, log_to_console=False, log_to_file=False, log_file_path=None, log_level=1
):
    """Set up and configure a logger with console and/or file handlers.

    :param name: Name of the logger to create
    :type name: str
    :param log_to_console: Whether to enable logging to console, defaults to False
    :type log_to_console: bool, optional
    :param log_to_file: Whether to enable logging to file, defaults to False
    :type log_to_file: bool, optional
    :param log_file_path: Path to the log file, required if log_to_file is True
    :type log_file_path: str, optional
    :param log_level: Logging level to set, defaults to 1
    :type log_level: int, optional
    :raises ValueError: If log_to_file is True but log_file_path is not provided
    :return: Configured logger instance
    :rtype: logging.Logger
    """
    # Create a custom logger and publish it as the module-level instance.
    global logger
    logger = logging.getLogger(name)
    if log_to_console or log_to_file:
        logger.setLevel(log_level)
    else:
        # No sink requested: leave the logger effectively disabled.
        logger.setLevel(logging.NOTSET)

    # Clear existing handlers to prevent duplicate logs
    if logger.hasHandlers():
        logger.handlers.clear()

    # Define formatter (plain string: there is nothing to interpolate here).
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - [%(funcName)s] - %(message)s"
    )

    # Add console handler if enabled
    if log_to_console:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(log_level)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # Add file handler if enabled
    if log_to_file:
        if not log_file_path:
            # Fixed message: the flag is named log_to_file, not enable_file.
            raise ValueError("log_file_path must be provided when log_to_file is True")
        file_handler = logging.FileHandler(log_file_path)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


# Default module logger; disabled (NOTSET, no handlers) until reconfigured.
logger = setup_log("generic")

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/stacktrace.py (new file, index d2091098)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

"""
    This module provides stacktrace helper functions
"""

import os
import re


def walk_to_top_module(start_path):
    """
    Walk up from the start_path to find the top-level Python module.

    :param start_path: The path to start from.
    :return: The path of the top-level module, or None if start_path is not
        inside a module.
    """
    current_path = start_path

    while True:
        # Check if we are at the root directory
        if os.path.dirname(current_path) == current_path:
            break

        # Check for __init__.py
        init_file_path = os.path.join(current_path, "__init__.py")
        if os.path.isfile(init_file_path):
            # If __init__.py exists, move up one level
            current_path = os.path.dirname(current_path)
        else:
            # If no __init__.py, we are not in a module; stop
            break

    # If we reached the root without finding a module, return None
    if os.path.dirname(current_path) == current_path and not os.path.isfile(
        os.path.join(current_path, "__init__.py")
    ):
        return None

    # Return the path of the top-level module
    return current_path


def _filter_internal_frames(traceback, internal_path):
    """
    Unlink, in place, every traceback frame whose source file lives under
    ``internal_path``, hiding internal implementation frames from users.

    :param traceback: The traceback object to filter (mutated in place).
    :param internal_path: Absolute path prefix identifying internal frames.
    :return: The (possibly new) head of the filtered traceback chain.
    """
    iter_prev = None
    iter_tb = traceback
    while iter_tb is not None:
        if os.path.abspath(iter_tb.tb_frame.f_code.co_filename).startswith(
            internal_path
        ):
            # Internal frame: unlink it unless it is the last frame in the
            # chain (the raising frame is always kept).
            if iter_tb.tb_next:
                if iter_prev:
                    iter_prev.tb_next = iter_tb.tb_next
                else:
                    traceback = iter_tb.tb_next
        else:
            # User frame: it becomes the predecessor for future unlinks.
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next
    return traceback


# Names of functions synthesized by the DSL's AST pre-processor; frames with
# these names are duplicates of the user's source line.
_generated_function_names = re.compile(
    r"^(loop_body|while_region|while_before_block|while_after_block|if_region|then_block|else_block|elif_region)_\d+$"
)


def _filter_duplicated_frames(traceback):
    """
    Filter out duplicated stack frames from the traceback.
    The function filters out consecutive frames that are in the same file and have the same line number.
    In a sequence of consecutive frames, the logic prefers to keep the non-generated frame or the last frame.
    """
    iter_prev = None
    iter_tb = traceback
    while iter_tb is not None:
        skip_current = False
        skip_next = False
        if iter_tb.tb_next:
            current_filename = os.path.abspath(iter_tb.tb_frame.f_code.co_filename)
            next_filename = os.path.abspath(iter_tb.tb_next.tb_frame.f_code.co_filename)
            # if in the same file, check if the line number is the same
            if current_filename == next_filename:
                current_lineno = iter_tb.tb_lineno
                next_lineno = iter_tb.tb_next.tb_lineno
                if current_lineno == next_lineno:
                    # Same file and line number, check name, if current is
                    # generated, skip current, otherwise skip next
                    name = iter_tb.tb_frame.f_code.co_name
                    is_generated = bool(_generated_function_names.match(name))
                    if is_generated:
                        # Skip current
                        skip_current = True
                    else:
                        # Skip next if it's generated, otherwise keep both
                        next_name = iter_tb.tb_next.tb_frame.f_code.co_name
                        skip_next = bool(_generated_function_names.match(next_name))
        if skip_current:
            if iter_prev:
                iter_prev.tb_next = iter_tb.tb_next
            else:
                traceback = iter_tb.tb_next
        elif skip_next:
            # if next is last frame, don't skip
            if iter_tb.tb_next.tb_next:
                iter_tb.tb_next = iter_tb.tb_next.tb_next
            iter_prev = iter_tb
        else:
            iter_prev = iter_tb
        iter_tb = iter_tb.tb_next

    return traceback


def filter_stackframe(traceback, prefix_path):
    """
    Filter out stack frames from the traceback that belong to the specified module path.

    This function removes stack frames from the traceback whose file paths start with
    the given prefix_path, effectively hiding internal implementation details from
    the error traceback shown to users.

    :param traceback: The traceback object to filter.
    :param prefix_path: The path prefix to filter out from the traceback.
    :return: The filtered traceback with internal frames removed.
    """
    # Step 1: filter internal frames
    traceback = _filter_internal_frames(traceback, prefix_path)

    # Step 2: consolidate duplicated frames
    return _filter_duplicated_frames(traceback)


def filter_exception(value, module_dir):
    """
    Filter out internal implementation details from exception traceback.

    This function recursively processes an exception and its cause chain,
    removing stack frames that belong to the specified module directory.
    This helps to present cleaner error messages to users by hiding
    implementation details.

    :param value: The exception object to filter (mutated in place).
    :param module_dir: The module directory path to filter out from tracebacks.
    :return: The filtered exception with internal frames removed.
    """
    if hasattr(value, "__cause__") and value.__cause__:
        filter_exception(value.__cause__, module_dir)

    if hasattr(value, "__traceback__"):
        filter_stackframe(value.__traceback__, module_dir)

    # Return the exception so the documented contract holds; filtering itself
    # happens in place on the traceback chain.
    return value

# ===========================================================================
# diff --git a/python/CuTeDSL/base_dsl/utils/timer.py (new file, index f41d3f74)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a timing helper functions +""" +from functools import wraps + +from .logger import log + + +# TODO: revisit this part when mlir timing manager is ready for pybind. +def timer(*dargs, **kwargs): + enable = kwargs.get("enable", True) + + def decorator(func): + @wraps(func) + def func_wrapper(*args, **kwargs): + if not enable: + return func(*args, **kwargs) + from time import time + + start = time() + result = func(*args, **kwargs) + end = time() + + # Convert time from seconds to us + spend_us = (end - start) * 1e6 + + # Determine the function type and format the log message + if hasattr(func, "__name__"): + func_name = func.__name__ + log_message = f"[JIT-TIMER] Function: {func_name} | Execution Time: {spend_us:.2f} µs" + elif "CFunctionType" in str(type(func)): + log_message = f"[JIT-TIMER] C API Function: {str(func)} | Execution Time: {spend_us:.2f} µs" + else: + log_message = f"[JIT-TIMER] Anonymous Function | Execution Time: {spend_us:.2f} µs" + + log().info(log_message) + + return result + + return func_wrapper + + if len(dargs) == 1 and callable(dargs[0]): + return decorator(dargs[0]) + else: + return decorator diff --git a/python/CuTeDSL/cutlass/__init__.py b/python/CuTeDSL/cutlass/__init__.py new file mode 100644 index 00000000..d0e7c93b --- /dev/null +++ b/python/CuTeDSL/cutlass/__init__.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from .cutlass_dsl import ( + Constexpr, + as_numeric, + min, + max, + and_, + or_, + all_, + any_, + not_, + all_, + any_, + select_, + # Control-flow without AST pre-processor + if_generate, + for_generate, + LoopUnroll, + while_generate, + yield_out, + # Control-flow with AST pre-processor + range_constexpr, + range_dynamic, + const_expr, + dynamic_expr, + # Data types + dtype, # Provides conversions to types inheriting from NumericType + DSLRuntimeError, + JitArgAdapterRegistry, + # Construction utilities for user-defined classes + extract_mlir_values, + new_from_mlir_values, +) + +from .cute.typing import * + +# Utilities not belonging to CuTe +from . import utils as utils + +# Used as internal symbol +from . import cutlass_dsl as _dsl + +# Aliases +LaunchConfig = _dsl.BaseDSL.LaunchConfig +register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter +gpu = _dsl.cutlass_gpu +cuda = _dsl.cuda_helpers diff --git a/python/CuTeDSL/cutlass/cute/__init__.py b/python/CuTeDSL/cutlass/cute/__init__.py new file mode 100644 index 00000000..11496402 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/__init__.py @@ -0,0 +1,310 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

# Use the auto-generated enum AddressSpace
from cutlass._mlir.dialects.cute import AddressSpace

# The types below are imported by name (in addition to the wildcard import
# further down) so that Sphinx can resolve them: Sphinx processes each module
# in isolation and cannot follow symbols re-exported via `from .typing import *`
# the way the Python runtime does.
from .typing import (
    Shape,
    Stride,
    IntTuple,
    Coord,
    Tile,
    XTuple,
    Tiler,
    Layout,
    Pointer,
    Tensor,
)

# Everything else from the typing module.
from .typing import *

from .core import (
    assume,
    is_integer,
    is_int_tuple,
    is_static,
    size,
    has_underscore,
    slice_,
    make_ptr,
    make_layout,
    recast_layout,
    make_fragment_like,
    depth,
    rank,
    flatten_to_tuple,
    flatten,
    unflatten,
    product,
    product_like,
    shape,
    size_in_bytes,
    make_identity_layout,
    make_ordered_layout,
    make_composed_layout,
    make_layout_tv,
    make_swizzle,
    recast_ptr,
    make_tensor,
    make_identity_tensor,
    make_fragment,
    recast_tensor,
    get,
    select,
    front,
    is_major,
    find,
    coalesce,
    group_modes,
    cosize,
    dice,
    product_each,
    prepend,
    append,
    prepend_ones,
    append_ones,
    ceil_div,
    slice_and_offset,
    crd2idx,
    domain_offset,
    elem_less,
    transform_leaf,
    filter_zeros,
    filter,
    tile_to_shape,
    shape_div,
    composition,
    complement,
    right_inverse,
    left_inverse,
    max_common_layout,
    max_common_vector,
    logical_product,
    zipped_product,
    tiled_product,
    flat_product,
    raked_product,
    blocked_product,
    flat_divide,
    logical_divide,
    zipped_divide,
    tiled_divide,
    local_partition,
    local_tile,
    printf,
    print_tensor,
    # tiled mma/tiled copy
    make_mma_atom,
    make_tiled_mma,
    make_copy_atom,
    make_tiled_copy_tv,
    make_tiled_copy,
    make_tiled_copy_S,
    make_tiled_copy_D,
    make_tiled_copy_C_atom,
    basic_copy,
    basic_copy_if,
    autovec_copy,
    copy,
    gemm,
    # Wrapper classes
    ComposedLayout,
    Swizzle,
    E,
    Atom,
    MmaAtom,
    CopyAtom,
    TiledCopy,
    TiledMma,
    TensorSSA,
    ReductionOp,
    full,
    full_like,
    empty_like,
    ones_like,
    zeros_like,
    where,
    any_,
    all_,
    # User defined struct
    struct,
    pretty_str,
    make_layout_image_mask,
    repeat_like,
    round_up,
    is_congruent,
    is_weakly_congruent,
    ScaledBasis,
    get_divisibility,
    Ratio,
)

# Submodules re-exported as part of the cute namespace.
from . import arch
from . import nvgpu
from . import testing
from . import runtime

# Re-export every math op at top level (usable without the "math." prefix).
from .math import *

# Internal-only symbol.
from .. import cutlass_dsl as _dsl
from .. import cutlass_dsl as _dsl

# Aliases
jit = _dsl.CuTeDSL.jit
kernel = _dsl.CuTeDSL.kernel
register_jit_arg_adapter = _dsl.JitArgAdapterRegistry.register_jit_arg_adapter
compile = _dsl.compile  # NOTE: shadows the builtin `compile` inside this package

# Explicitly export all symbols for documentation generation.
# NOTE(review): added previously missing-but-imported names so the list matches
# the `from .core import (...)` block above: unflatten, make_swizzle,
# transform_leaf, make_tiled_copy_S/D, ReductionOp, get_divisibility, Ratio.
__all__ = [
    # Core types
    "AddressSpace",
    "Tensor",
    "Layout",
    "ComposedLayout",
    "Swizzle",
    "E",
    "Atom",
    "MmaAtom",
    "CopyAtom",
    "TiledCopy",
    "TiledMma",
    "TensorSSA",
    "ReductionOp",
    # Basic utility functions
    "assume",
    "is_integer",
    "is_int_tuple",
    "is_static",
    "size",
    "has_underscore",
    "slice_",
    "depth",
    "rank",
    "shape",
    "printf",
    "print_tensor",
    "pretty_str",
    # Layout functions
    "make_layout",
    "recast_layout",
    "make_identity_layout",
    "make_ordered_layout",
    "make_composed_layout",
    "make_layout_tv",
    "make_layout_image_mask",
    "make_swizzle",
    # Tensor functions
    "make_ptr",
    "make_tensor",
    "make_identity_tensor",
    "make_fragment",
    "make_fragment_like",
    "recast_ptr",
    "recast_tensor",
    # Tensor manipulation
    "get",
    "select",
    "front",
    "is_major",
    "find",
    "coalesce",
    "group_modes",
    "cosize",
    "size_in_bytes",
    # Tuple operations
    "flatten_to_tuple",
    "flatten",
    "unflatten",
    "product",
    "product_like",
    "product_each",
    "prepend",
    "append",
    "prepend_ones",
    "append_ones",
    # Math operations
    "ceil_div",
    "round_up",
    # Layout operations
    "slice_and_offset",
    "crd2idx",
    "domain_offset",
    "elem_less",
    "transform_leaf",
    "filter_zeros",
    "filter",
    "tile_to_shape",
    "shape_div",
    "dice",
    # Layout algebra
    "composition",
    "complement",
    "right_inverse",
    "left_inverse",
    "max_common_layout",
    "max_common_vector",
    "is_congruent",
    "is_weakly_congruent",
    # Product operations
    "logical_product",
    "zipped_product",
    "tiled_product",
    "flat_product",
    "raked_product",
    "blocked_product",
    # Division operations
    "flat_divide",
    "logical_divide",
    "zipped_divide",
    "tiled_divide",
    "local_partition",
    "local_tile",
    # MMA and Copy operations
    "make_mma_atom",
    "make_tiled_mma",
    "make_copy_atom",
    "make_tiled_copy_tv",
    "make_tiled_copy",
    "make_tiled_copy_S",
    "make_tiled_copy_D",
    "make_tiled_copy_C_atom",
    "basic_copy",
    "basic_copy_if",
    "autovec_copy",
    "copy",
    "gemm",
    # Tensor creation
    "full",
    "full_like",
    "empty_like",
    "ones_like",
    "zeros_like",
    "where",
    "any_",
    "all_",
    "repeat_like",
    "ScaledBasis",
    "get_divisibility",
    "Ratio",
    # User defined struct
    "struct",
    # Modules
    "arch",
    "nvgpu",
    "testing",
    "runtime",
    # Decorators and code generation
    "jit",
    "kernel",
    "register_jit_arg_adapter",
    "compile",
]

# ===========================================================================
# diff --git a/python/CuTeDSL/cutlass/cute/arch/__init__.py (new file, index 5114b97f)
# ===========================================================================
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.
+ +from .elect import * +from .mbar import * +from .nvvm_wrappers import * +from .smem import * +from .tmem import * + +# __all__ is required here for documentation generation +__all__ = [ + # + # elect.py + # + "make_warp_uniform", + "elect_one", + # + # mbar.py + # + "mbarrier_init_arrive_cnt", + "mbarrier_init_fence", + "mbarrier_init_tx_bytes", + "mbarrier_wait", + "mbarrier_try_wait", + "conditional_mbarrier_try_wait", + "mbarrier_arrive", + # + # nvvm_wrappers.py + # + "lane_idx", + "warp_idx", + "thread_idx", + "block_dim", + "block_idx", + "grid_dim", + "cluster_idx", + "cluster_dim", + "block_in_cluster_idx", + "block_in_cluster_dim", + "block_idx_in_cluster", + "shuffle_sync", + "shuffle_sync_up", + "shuffle_sync_down", + "shuffle_sync_bfly", + "barrier", + "sync_threads", + "sync_warp", + "fence_acq_rel_cta", + "fence_acq_rel_cluster", + "fence_acq_rel_gpu", + "fence_acq_rel_sys", + "cp_async_commit_group", + "cp_async_wait_group", + "cp_async_bulk_commit_group", + "cp_async_bulk_wait_group", + "cluster_wait", + "cluster_arrive", + "cluster_arrive_relaxed", + "fence_proxy", + "vote_ballot_sync", + "popc", + "fence_view_async_tmem_load", + "fence_view_async_tmem_store", + "warpgroup_reg_alloc", + "warpgroup_reg_dealloc", + "fma_packed_f32x2", + "mul_packed_f32x2", + "add_packed_f32x2", + "fmax", + "rcp_approx", + "exp2", + # Constants + "WARP_SIZE", + # Forward from auto-generated nvvm python + "ProxyKind", + "SharedSpace", + "RoundingModeKind", + # + # smem.py + # + "alloc_smem", + "get_dyn_smem", + # + # tmem.py + # + "retrieve_tmem_ptr", + "alloc_tmem", + "relinquish_tmem_alloc_permit", + "dealloc_tmem", +] diff --git a/python/CuTeDSL/cutlass/cute/arch/elect.py b/python/CuTeDSL/cutlass/cute/arch/elect.py new file mode 100644 index 00000000..fce82b13 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/elect.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cutlass.cutlass_dsl import CuTeDSL, T, dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import nvvm, scf +from cutlass._mlir import ir + +from ..typing import Int, Int32 +from ...impl_utils import check_value_in + + +@dsl_user_op +def make_warp_uniform(value: Int, *, loc=None, ip=None) -> Int32: + """ + Creates a warp-uniform value from the given integer input. + + :param value: The integer to make warp uniform. + :type value: Int + :return: The warp-uniform value equal to the input. + :rtype: Int32 + """ + return Int32( + _cute_nvgpu_ir.arch_make_warp_uniform( + Int32(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + ) + + +class IfOpRegion: + """ + A context manager for if Op. + Automatically inserts `scf.yield([])` when exiting the context. + """ + + def __init__(self, block, *, loc=None, ip=None): + self.block = block + self.insert_point = ir.InsertionPoint(self.block) + self.loc = loc + self.ip = ip + + def __enter__(self): + self.insert_point.__enter__() + return self.block.arguments + + def __exit__(self, exc_type, exc_value, traceback): + scf.yield_([], loc=self.loc, ip=self.ip) + self.insert_point.__exit__(exc_type, exc_value, traceback) + + +@dsl_user_op +def elect_one(*, loc=None, ip=None) -> IfOpRegion: + """ + Elects one thread within a warp. + + .. 
code-block:: python + + with elect_one(): + # Only one thread in the warp executes the code in this context + pass + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + is_thread_leader = nvvm.elect_sync(T.bool()) + if_op = scf.IfOp(is_thread_leader, loc=loc, ip=ip) + return IfOpRegion(if_op.then_block, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/arch/mbar.py b/python/CuTeDSL/cutlass/cute/arch/mbar.py new file mode 100644 index 00000000..b4dc3725 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/mbar.py @@ -0,0 +1,208 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cutlass.cutlass_dsl import CuTeDSL, T, if_generate, dsl_user_op + +from cutlass._mlir.dialects import nvvm +from cutlass._mlir import ir + +from ..typing import Pointer, Int, Boolean, Int32 +from ...impl_utils import check_value_in + + +#################################################################################################### +# +# Mbarrier management utilities +# +#################################################################################################### + + +@dsl_user_op +def mbarrier_init_arrive_cnt(mbar_ptr: Pointer, cnt: Int, *, loc=None, ip=None) -> None: + """ + Initializes a mbarrier with the specified thread arrival count. 
+ + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param cnt: The arrival count of the mbarrier + :type cnt: Int + """ + nvvm.mbarrier_init_shared( + mbar_ptr.llvm_ptr, Int32(cnt).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + + +@dsl_user_op +def mbarrier_init_fence(*, loc=None, ip=None) -> None: + """ + A fence operation that applies to the mbarrier initializations. + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + nvvm.fence_mbarrier_init(loc=loc, ip=ip) + + +@dsl_user_op +def mbarrier_init_tx_bytes( + mbar_ptr: Pointer, bytes: Int, peer_cta_rank_in_cluster=None, *, loc=None, ip=None +) -> None: + """ + Initializes a mbarrier with the specified number of transaction bytes. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param bytes: The number of transaction bytes + :type bytes: Int + :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to + the mbarrier is converted to a remote address in the peer CTA's + SMEM. + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + mbar_llvm_ptr = mbar_ptr.llvm_ptr + if peer_cta_rank_in_cluster is not None: + mbar_llvm_ptr = nvvm.mapa_shared_cluster( + mbar_llvm_ptr.type, + mbar_llvm_ptr, + Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + space = nvvm.MBarrierSpaceKind.CLUSTER + else: + space = nvvm.MBarrierSpaceKind.CTA + + nvvm.mbarrier_txn( + mbar_llvm_ptr, + Int32(bytes).ir_value(loc=loc, ip=ip), + kind=nvvm.MBarrierTxnKind.ARRIVE_EXPECT_TX, + space=space, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def mbarrier_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> None: + """ + Waits on a mbarrier with a specified phase. 
+ + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + timeout_ns = 10000000 + # This NVVM Op is a spin-loop wrapping the mbarrier.try_wait.parity.shared.b64 PTX + # The timeout in ns only applies to the latter and this call is truly blocking + nvvm.mbarrier_try_wait_parity_shared( + mbar_ptr.llvm_ptr, + Int32(phase).ir_value(loc=loc, ip=ip), + Int32(timeout_ns).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def mbarrier_try_wait(mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None) -> Boolean: + """ + Attempts to wait on a mbarrier with a specified phase in a non-blocking fashion. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + :return: A boolean value indicating whether the wait operation was successful + :rtype: Boolean + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + return Boolean( + nvvm.mbarrier_wait_parity( + T.bool(), + mbar_ptr.llvm_ptr, + Int32(phase).ir_value(loc=loc, ip=ip), + nvvm.MBarrierWaitKind.TRY, + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def conditional_mbarrier_try_wait( + cond, mbar_ptr: Pointer, phase: Int, *, loc=None, ip=None +) -> Boolean: + """ + Conditionally attempts to wait on a mbarrier with a specified phase in a non-blocking fashion. 
+ + :param cond: A boolean predicate + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param phase: The phase to wait for (either 0 or 1) + :type phase: Int + :return: A boolean value indicating whether the wait operation was successful + :rtype: Boolean + """ + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + return if_generate( + cond, + lambda: mbarrier_try_wait(mbar_ptr, phase, loc=loc, ip=ip), + lambda: Boolean(True).ir_value(loc=loc, ip=ip), + None, + [Boolean], + ) + + +@dsl_user_op +def mbarrier_arrive( + mbar_ptr: Pointer, peer_cta_rank_in_cluster: Int = None, *, loc=None, ip=None +) -> None: + """ + Arrives on an mbarrier. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param peer_cta_rank_in_cluster: An optional CTA rank in cluster. If provided, the pointer to + the mbarrier is converted to a remote address in the peer CTA's + SMEM. + """ + mbar_llvm_ptr = mbar_ptr.llvm_ptr + if peer_cta_rank_in_cluster is not None: + arch = CuTeDSL._get_dsl().envar.arch + check_value_in(arch, ["sm_90", "sm_90a", "sm_100a"], "arch") + + mbar_llvm_ptr = nvvm.mapa_shared_cluster( + mbar_llvm_ptr.type, + mbar_llvm_ptr, + Int32(peer_cta_rank_in_cluster).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + space = nvvm.MBarrierSpaceKind.CLUSTER + else: + space = nvvm.MBarrierSpaceKind.CTA + + nvvm.mbarrier_txn( + mbar_llvm_ptr, + Int32(1).ir_value(loc=loc, ip=ip), + kind=nvvm.MBarrierTxnKind.ARRIVE, + space=space, + loc=loc, + ip=ip, + ) diff --git a/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py b/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py new file mode 100644 index 00000000..03d83c26 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/nvvm_wrappers.py @@ -0,0 +1,547 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from functools import partial +from typing import Optional, Tuple, Union, Callable + +from cutlass.cutlass_dsl import T, dsl_user_op + +from cutlass._mlir import ir +from cutlass._mlir.dialects import llvm, nvvm, vector + +# Forward nvvm enums +from cutlass._mlir.dialects.nvvm import ( + ProxyKind, + SharedSpace, + Tcgen05WaitKind, + SetMaxRegisterAction, + RoundingModeKind, +) + +from ..typing import Int, Boolean, Int32, Float32, Numeric, as_numeric + +WARP_SIZE = 32 +FULL_MASK = 0xFFFFFFFF + + +@dsl_user_op +def lane_idx(*, loc=None, ip=None) -> Int32: + """ + Returns the lane index of the current thread within the warp. + """ + return Int32(nvvm.read_ptx_sreg_laneid(T.i32(), loc=loc, ip=ip)) + + +@dsl_user_op +def warp_idx(*, loc=None, ip=None) -> Int32: + """ + Returns the warp index within a CTA. + """ + warp_size = 32 + tid_x = Int32(nvvm.read_ptx_sreg_tid_x(T.i32(), loc=loc, ip=ip)) + tid_y = Int32(nvvm.read_ptx_sreg_tid_y(T.i32(), loc=loc, ip=ip)) + tid_z = Int32(nvvm.read_ptx_sreg_tid_z(T.i32(), loc=loc, ip=ip)) + ntid_x = Int32(nvvm.read_ptx_sreg_ntid_x(T.i32(), loc=loc, ip=ip)) + ntid_y = Int32(nvvm.read_ptx_sreg_ntid_y(T.i32(), loc=loc, ip=ip)) + tid = tid_x + tid_y * ntid_x + tid_z * ntid_x * ntid_y + return tid // warp_size + + +@dsl_user_op +def thread_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the thread index within a CTA. 
+ """ + return ( + Int32(nvvm.read_ptx_sreg_tid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_tid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_tid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of threads in each dimension of the CTA. + """ + return ( + Int32(nvvm.read_ptx_sreg_ntid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ntid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ntid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the CTA identifier within a grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_ctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_ctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def grid_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of CTAs in each dimension of the grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_nctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the cluster identifier within a grid. + """ + return ( + Int32(nvvm.read_ptx_sreg_clusterid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_clusterid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_clusterid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the number of clusters in each dimension of the grid. 
+ """ + return ( + Int32(nvvm.read_ptx_sreg_nclusterid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nclusterid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_nclusterid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_in_cluster_idx(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the CTA index within a cluster across all dimensions. + """ + return ( + Int32(nvvm.read_ptx_sreg_cluster_ctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_ctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_ctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_in_cluster_dim(*, loc=None, ip=None) -> Tuple[Int32, Int32, Int32]: + """ + Returns the dimensions of the cluster. + """ + return ( + Int32(nvvm.read_ptx_sreg_cluster_nctaid_x(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_nctaid_y(T.i32(), loc=loc, ip=ip)), + Int32(nvvm.read_ptx_sreg_cluster_nctaid_z(T.i32(), loc=loc, ip=ip)), + ) + + +@dsl_user_op +def block_idx_in_cluster(*, loc=None, ip=None) -> Int32: + """ + Returns the linearized identifier of the CTA within the cluster. + """ + return Int32(nvvm.read_ptx_sreg_cluster_ctarank(T.i32(), loc=loc, ip=ip)) + + +@dsl_user_op +def shuffle_sync_op( + value: Numeric, + offset: Int, + mask: Int = FULL_MASK, + mask_and_clamp: Int = WARP_SIZE - 1, + kind: nvvm.ShflKind = nvvm.ShflKind.idx, + *, + loc=None, + ip=None, +) -> Numeric: + """ + Shuffles a value within the threads of a warp. + + :param value: The value to shuffle + :type value: Numeric + :param mask: A mask describing the threads participating in this operation + :type mask: Int + :param offset: A source lane or a source lane offset depending on kind + :type offset: Int + :param mask_and_clamp: An integer containing two packed values specifying a mask for logically + splitting warps into sub-segments and an upper bound for clamping the + source lane index. 
+ :type mask_and_clamp: Int + :param kind: The kind of shuffle, can be idx, up, down, or bfly + :type kind: ShflKind + :return: The shuffled value + :rtype: Numeric + """ + if not isinstance(value, Numeric): + value = as_numeric(value) + return type(value)( + nvvm.shfl_sync( + type(value).mlir_type, + Int32(mask).ir_value(loc=loc, ip=ip), + value.ir_value(loc=loc, ip=ip), + Int32(offset).ir_value(loc=loc, ip=ip), + Int32(mask_and_clamp).ir_value(loc=loc, ip=ip), + kind, + loc=loc, + ip=ip, + ) + ) + + +shuffle_sync = partial(shuffle_sync_op, kind=nvvm.ShflKind.idx) +shuffle_sync_up = partial(shuffle_sync_op, kind=nvvm.ShflKind.up) +shuffle_sync_down = partial(shuffle_sync_op, kind=nvvm.ShflKind.down) +shuffle_sync_bfly = partial(shuffle_sync_op, kind=nvvm.ShflKind.bfly) + + +@dsl_user_op +def barrier(*, barrier_id=None, number_of_threads=None, loc=None, ip=None) -> None: + """ + Creates a barrier, optionally named. + """ + if barrier_id is not None: + barrier_id = Int32(barrier_id).ir_value(loc=loc, ip=ip) + + if number_of_threads is not None: + number_of_threads = Int32(number_of_threads).ir_value(loc=loc, ip=ip) + + nvvm.barrier( + barrier_id=barrier_id, number_of_threads=number_of_threads, loc=loc, ip=ip + ) + +@dsl_user_op +def sync_threads(*, loc=None, ip=None) -> None: + """ + Synchronizes all threads within a CTA. + """ + nvvm.barrier(loc=loc, ip=ip) + + +@dsl_user_op +def sync_warp(mask: Int = FULL_MASK, *, loc=None, ip=None) -> None: + """ + Performs a warp-wide sync with an optional mask. + """ + nvvm.bar_warp_sync(Int32(mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_cta(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_cta(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_cluster(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. 
+ """ + nvvm.fence_acq_rel_cluster(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_gpu(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_gpu(loc=loc, ip=ip) + + +@dsl_user_op +def fence_acq_rel_sys(*, loc=None, ip=None) -> None: + """ + Fence operation with acquire-release semantics. + + See the `PTX documentation `__. + """ + nvvm.fence_acq_rel_sys(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_commit_group(*, loc=None, ip=None) -> None: + """ + Commits all prior initiated but uncommitted cp.async instructions. + + See the `PTX documentation `__. + """ + nvvm.cp_async_commit_group(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_wait_group(n, *, loc=None, ip=None) -> None: + """ + Waits till only a specified numbers of cp.async groups are pending. + + See the `PTX documentation `__. + """ + nvvm.cp_async_wait_group(n, loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_bulk_commit_group(*, loc=None, ip=None) -> None: + """ + Commits all prior initiated but uncommitted cp.async.bulk instructions. + + See the `PTX documentation `__. + """ + nvvm.cp_async_bulk_commit_group(loc=loc, ip=ip) + + +@dsl_user_op +def cp_async_bulk_wait_group(group, *, read=None, loc=None, ip=None) -> None: + """ + Waits till only a specified numbers of cp.async.bulk groups are pending. + + See the `PTX documentation `__. + """ + nvvm.cp_async_bulk_wait_group(group, read=read, loc=loc, ip=ip) + + +@dsl_user_op +def cluster_wait(*, loc=None, ip=None) -> None: + """ + A cluster-wide wait operation. + """ + nvvm.cluster_wait(loc=loc, ip=ip) + + +@dsl_user_op +def cluster_arrive(*, aligned=None, loc=None, ip=None) -> None: + """ + A cluster-wide arrive operation. + """ + nvvm.cluster_arrive(aligned=aligned, loc=loc, ip=ip) + + +@dsl_user_op +def cluster_arrive_relaxed(*, aligned=None, loc=None, ip=None) -> None: + """ + A cluster-wide arrive operation with relaxed semantics. 
+ """ + nvvm.cluster_arrive_relaxed(aligned=aligned, loc=loc, ip=ip) + + +@dsl_user_op +def fence_proxy( + kind: ProxyKind, + *, + space: Optional[SharedSpace] = None, + use_intrinsic=None, + loc=None, + ip=None, +) -> None: + nvvm.fence_proxy( + kind=kind, space=space, use_intrinsic=use_intrinsic, loc=loc, ip=ip + ) + + +@dsl_user_op +def vote_ballot_sync( + pred: Boolean, mask: Int = FULL_MASK, *, loc=None, ip=None +) -> Int32: + """ + Performs a ballot operation across the warp. + """ + return Int32( + nvvm.vote_ballot_sync( + T.i32(), + Int32(mask).ir_value(loc=loc, ip=ip), + Boolean(pred).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def popc(value: Numeric, *, loc=None, ip=None) -> Numeric: + """ + Performs a population count operation. + """ + if not isinstance(value, Numeric): + value = as_numeric(value) + return type(value)(llvm.intr_ctpop(value.ir_value(), loc=loc, ip=ip)) + + +@dsl_user_op +def fence_view_async_tmem_op( + kind: Tcgen05WaitKind, + *, + loc=None, + ip=None, +) -> None: + """ + Perform a fence operation on the async TMEM load or store. + + .. note:: + This function is only available on sm_100a and above. + The fence is required to synchronize the TMEM load/store + and let the pipeline release or commit the buffer. + + Take a mma2acc pipeline as an example of LOAD fence, the ACC tensor is from TMEM. + ``` + # Start to copy ACC from TMEM to register + cute.copy(tmem_load, tACC, rACC) + fence_view_async_tmem_load() + # After fence, we can ensure the TMEM buffer is consumed totally. + # Release the buffer to let the MMA know it can overwrite the buffer. + mma2accum_pipeline.consumer_release(curr_consumer_state) + ``` + Take a TS GEMM kernel as an example of STORE fence, the A tensor is from TMEM. + ``` + # Start to copy A from register to TMEM + cute.copy(tmem_store, rA, tA) + fence_view_async_tmem_store() + # After fence, we can ensure the TMEM buffer is ready. 
+ # Commit the buffer to let the MMA know it can start to load A. + tmem_mma_pipeline.producer_commit(curr_producer_state) + ``` + + + :param kind: The kind of fence operation to perform including LOAD and STORE. + :type kind: Tcgen05WaitKind + """ + nvvm.tcgen05_wait(kind, loc=loc, ip=ip) + + +fence_view_async_tmem_load = partial( + fence_view_async_tmem_op, kind=Tcgen05WaitKind.LOAD +) +fence_view_async_tmem_store = partial( + fence_view_async_tmem_op, kind=Tcgen05WaitKind.STORE +) + + +@dsl_user_op +def warpgroup_reg_realloc_op( + reg_count: int, + kind: SetMaxRegisterAction, + *, + loc=None, + ip=None, +) -> None: + nvvm.setmaxregister(reg_count, kind, loc=loc, ip=ip) + + +warpgroup_reg_alloc = partial( + warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.increase +) +warpgroup_reg_dealloc = partial( + warpgroup_reg_realloc_op, kind=SetMaxRegisterAction.decrease +) + + +@dsl_user_op +def calc_packed_f32x2_op( + src_a: Tuple[Float32, Float32], + src_b: Tuple[Float32, Float32], + src_c: Tuple[Float32, Float32] | None, + calc_func: Callable, + *, + rnd=RoundingModeKind.RZ, + ftz=True, + loc=None, + ip=None, +) -> Tuple[Float32, Float32]: + vec_type = ir.VectorType.get([2], Float32.mlir_type, loc=loc) + vec_src_a = vector.from_elements( + vec_type, tuple(as_numeric(a).ir_value() for a in src_a), loc=loc, ip=ip + ) + vec_src_b = vector.from_elements( + vec_type, tuple(as_numeric(b).ir_value() for b in src_b), loc=loc, ip=ip + ) + if src_c is not None: + vec_src_c = vector.from_elements( + vec_type, tuple(as_numeric(c).ir_value() for c in src_c), loc=loc, ip=ip + ) + vec_res = calc_func( + vec_type, vec_src_a, vec_src_b, vec_src_c, rnd=rnd, ftz=ftz, loc=loc, ip=ip + ) + else: + vec_res = calc_func( + vec_type, vec_src_a, vec_src_b, rnd=rnd, ftz=ftz, loc=loc, ip=ip + ) + + res0 = Float32( + vector.extract( + vec_res, dynamic_position=[], static_position=[0], loc=loc, ip=ip + ) + ) + res1 = Float32( + vector.extract( + vec_res, dynamic_position=[], 
static_position=[1], loc=loc, ip=ip + ) + ) + return res0, res1 + + +fma_packed_f32x2 = partial(calc_packed_f32x2_op, calc_func=nvvm.fma_packed_f32x2) +mul_packed_f32x2 = partial( + calc_packed_f32x2_op, src_c=None, calc_func=nvvm.mul_packed_f32x2 +) +add_packed_f32x2 = partial( + calc_packed_f32x2_op, src_c=None, calc_func=nvvm.add_packed_f32x2 +) + + +@dsl_user_op +def fmax( + a: Union[float, Float32], b: Union[float, Float32], *, loc=None, ip=None +) -> Float32: + return Float32( + nvvm.fmax( + T.f32(), + Float32(a).ir_value(loc=loc, ip=ip), + Float32(b).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +@dsl_user_op +def rcp_approx(a: Union[float, Float32], *, loc=None, ip=None): + return Float32( + nvvm.rcp_approx_ftz_f( + T.f32(), Float32(a).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + ) + + +@dsl_user_op +def exp2(a: Union[float, Float32], *, loc=None, ip=None) -> Float32: + return Float32( + llvm.inline_asm( + T.f32(), + [Float32(a).ir_value(loc=loc, ip=ip)], + "ex2.approx.ftz.f32 $0, $1;", + "=f,f", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + ) diff --git a/python/CuTeDSL/cutlass/cute/arch/smem.py b/python/CuTeDSL/cutlass/cute/arch/smem.py new file mode 100644 index 00000000..4e5dee7b --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/smem.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import Optional, Type + +from cutlass.cutlass_dsl import T, dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..typing import Pointer, Numeric, NumericMeta + + +@dsl_user_op +def alloc_smem( + element_type: Type[Numeric], + size_in_elems: int, + alignment: Optional[int] = None, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Statically allocates SMEM. + + :param element_type: The pointee type of the pointer. + :type element_type: Type[Numeric] + :param size_in_elems: The size of the allocation in terms of number of elements of the + pointee type + :type size_in_elems: int + :param alignment: An optional pointer alignment for the allocation + :type alignment: int + :return: A pointer to the start of the allocation + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + if alignment is None: + # Default alignment based on the element type's width + alignment = element_type.width // 8 + ptr_ty = _cute_ir.PtrType.get( + element_type.mlir_type, _cute_ir.AddressSpace.smem, alignment + ) + return _cute_nvgpu_ir.arch_alloc_smem( + ptr=ptr_ty, + input=ir.IntegerAttr.get(T.i32(), size_in_elems), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def get_dyn_smem( + element_type: Type[Numeric], + alignment: Optional[int] = None, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Retrieves a pointer to a dynamic SMEM allocation. + + :param element_type: The pointee type of the pointer. 
+ :type element_type: Type[Numeric] + :param alignment: An optional pointer alignment, the result pointer is offset appropriately + :type alignment: int + :return: A pointer to the start of the dynamic SMEM allocation with a correct + alignement + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + if alignment is None: + # Default alignment based on the element type's width + alignment = element_type.width // 8 + ptr_ty = _cute_ir.PtrType.get( + element_type.mlir_type, + _cute_ir.AddressSpace.smem, + alignment, + ) + return _cute_nvgpu_ir.arch_get_dyn_smem(ptr=ptr_ty, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/arch/tmem.py b/python/CuTeDSL/cutlass/cute/arch/tmem.py new file mode 100644 index 00000000..302616d2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/arch/tmem.py @@ -0,0 +1,142 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import Type + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from ..typing import Pointer, Int, Int32, Numeric, NumericMeta + + +SM100_TMEM_CAPACITY_COLUMNS = 512 +SM100_TMEM_MIN_ALLOC_COLUMNS = 32 + + +@dsl_user_op +def retrieve_tmem_ptr( + element_type: Type[Numeric], + alignment: int, + ptr_to_buffer_holding_addr: Pointer, + *, + loc=None, + ip=None, +) -> Pointer: + """ + Retrieves a pointer to TMEM with the provided element type and alignment. + + :param element_type: The pointee type of the pointer. + :type element_type: Type[Numeric] + :param alignment: The alignment of the result pointer + :type alignment: int + :param ptr_to_buffer_holding_addr: A pointer to a SMEM buffer holding the TMEM address of the + start of the allocation allocation + :type ptr_to_buffer_holding_addr: Pointer + :return: A pointer to TMEM + :rtype: Pointer + """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"element_type must be a type of Numeric, but got {element_type}" + ) + + res_ty = _cute_ir.PtrType.get( + element_type.mlir_type, _cute_ir.AddressSpace.tmem, alignment + ) + return _cute_nvgpu_ir.arch_sm100_retrieve_tmem_ptr( + res_ty, ptr_to_buffer_holding_addr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def alloc_tmem( + num_columns: Int, + smem_ptr_to_write_address: Pointer, + is_two_cta=None, + *, + loc=None, + ip=None, +) -> None: + """ + Allocates TMEM. 
+ + :param num_columns: The number of TMEM columns to allocate + :type num_columns: Int + :param smem_ptr_to_write_address: A pointer to a SMEM buffer where the TMEM address is written + to + :type smem_ptr_to_write_address: Pointer + :param is_two_cta: Optional boolean parameter for 2-CTA MMAs + """ + if isinstance(num_columns, int): + if ( + num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS + or num_columns > SM100_TMEM_CAPACITY_COLUMNS + or not (num_columns & (num_columns - 1) == 0) + ): + raise ValueError( + f"num_columns must be between 32 and 512, and must be pow of 2, but got {num_columns}" + ) + _cute_nvgpu_ir.arch_sm100_alloc_tmem( + Int32(num_columns).ir_value(loc=loc, ip=ip), + smem_ptr_to_write_address.value, + is_two_cta=is_two_cta, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def relinquish_tmem_alloc_permit(is_two_cta=None, *, loc=None, ip=None) -> None: + """ + Relinquishes the right to allocate TMEM so that other CTAs potentially in a different grid can + allocate. + """ + _cute_nvgpu_ir.arch_sm100_relinquish_tmem_alloc_permit( + is_two_cta=is_two_cta, loc=loc, ip=ip + ) + + +@dsl_user_op +def dealloc_tmem( + tmem_ptr: Pointer, + num_columns: Int, + is_two_cta=None, + *, + loc=None, + ip=None, +) -> None: + """ + Deallocates TMEM using the provided pointer and number of columns. 
+ + :param tmem_ptr: A pointer to the TMEM allocation to de-allocate + :type tmem_ptr: Pointer + :param num_columns: The number of columns in the TMEM allocation + :type num_columns: Int + :param is_two_cta: Optional boolean parameter for 2-CTA MMAs + """ + if isinstance(num_columns, int): + if ( + num_columns < SM100_TMEM_MIN_ALLOC_COLUMNS + or num_columns > SM100_TMEM_CAPACITY_COLUMNS + or not (num_columns & (num_columns - 1) == 0) + ): + raise ValueError( + f"num_columns must be between 32 and 512, and must be pow of 2, but got {num_columns}" + ) + _cute_nvgpu_ir.arch_sm100_dealloc_tmem( + tmem_ptr.value, + Int32(num_columns).ir_value(loc=loc, ip=ip), + is_two_cta=is_two_cta, + loc=loc, + ip=ip, + ) diff --git a/python/CuTeDSL/cutlass/cute/core.py b/python/CuTeDSL/cutlass/cute/core.py new file mode 100644 index 00000000..6af262cd --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/core.py @@ -0,0 +1,6417 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import copy as py_copy +from dataclasses import dataclass +import math +import operator +from abc import ABC, abstractmethod +from functools import lru_cache, partial, reduce +from inspect import isclass +from itertools import chain +from typing import Iterable, overload, List, Tuple, Union, Type, Any, Dict, Optional +from enum import Enum, auto + +from cutlass.cutlass_dsl import ( + const, + T, + lru_cache_ir, + is_dynamic_expression, + for_generate, + yield_out, + if_generate, + extract_mlir_values, + new_from_mlir_values, + _binary_op_type_promote, + not_, + cutlass_arith, + dsl_user_op, +) + +from cutlass._mlir import ir +from cutlass._mlir.dialects import cute as _cute_ir +from cutlass._mlir.dialects.cute import ( + ScaledBasis as _ScaledBasis, + Ratio as _Ratio, +) + +from cutlass._mlir.dialects import cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import llvm, builtin, vector, arith + +from .typing import ( + Numeric, + Integer, + NumericMeta, + Boolean, + Int32, + Int8, + Int16, + Int32, + Int64, + Float32, + TFloat32, + Int, + IntTuple, + Shape, + Stride, + Coord, + Layout, + Tile, + Tiler, + XTuple, + Tensor, + Pointer, + AddressSpace, + as_numeric, +) + + +#################################################################################################### +# +# Internal IntTuple helpers +# +#################################################################################################### + + +def _get_typed_value(x): + if isinstance(x, Integer): + return ( + x.value.get_typed_value() if isinstance(x.value, IntValue) else x.ir_value() + ) + else: + return x + + +def _pack_x(x, packer, op, *, loc=None, ip=None) -> ir.Value: + x = transform_leaf(_get_typed_value, x) + res_ty, dyn_elems = packer(x) + # <"0"> is deduced from type inference which should be removed for make_... 
operations + dyn_elems = [t for t in dyn_elems if not is_static(t)] + return op(res_ty, dyn_elems, loc=loc, ip=ip).result + + +def _pack_shape(shape: Shape, *, loc=None, ip=None) -> ir.Value: + return _pack_x(shape, _cute_ir.pack_shape, _cute_ir.MakeShapeOp, loc=loc, ip=ip) + + +def _pack_stride(stride: Stride, *, loc=None, ip=None) -> ir.Value: + # Convert basis elements to the base class before _pack_x + stride = transform_leaf( + lambda x: x.to(_cute_ir.ScaledBasis) if isinstance(x, ScaledBasis) else x, + stride, + ) + return _pack_x(stride, _cute_ir.pack_stride, _cute_ir.MakeStrideOp, loc=loc, ip=ip) + + +def _pack_coord(coord: Coord, *, loc=None, ip=None) -> ir.Value: + return _pack_x(coord, _cute_ir.pack_coord, _cute_ir.MakeCoordOp, loc=loc, ip=ip) + + +def _pack_int_tuple(int_tuple: IntTuple, *, loc=None, ip=None) -> ir.Value: + return _pack_x( + int_tuple, _cute_ir.pack_int_tuple, _cute_ir.MakeIntTupleOp, loc=loc, ip=ip + ) + + +def _pack_tile(tile: Tile, *, loc=None, ip=None) -> ir.Value: + def expand_leaves(tile) -> list: + leaves = [] + for e in tile: + if isinstance(e, _Layout): + leaves.extend(list(flatten_to_tuple(e.shape))) + leaves.extend(list(flatten_to_tuple(e.stride))) + else: + leaves.append(e) + return leaves + + layout_leaves = flatten_to_tuple(tile) + dyn_elems = expand_leaves(layout_leaves) + dyn_elems = [ + _get_typed_value(x) for x in dyn_elems if isinstance(x, (Integer, ir.Value)) + ] + + res_ty = _cute_ir.pack_tile(tile) + return _cute_ir.make_tile(res_ty, dyn_elems, loc=loc, ip=ip) + + +def _unpack_x_tuple(t: Union[ir.Type, ir.Value], *, loc=None, ip=None) -> XTuple: + # If t is an MLIR type, make sure it's static and make a Value + if isinstance(t, ir.Type): + if not _cute_ir.is_static(t): + raise ValueError() + t = _cute_ir.static(t) + + if isinstance(t, ir.Value): + input_ty = t.type + if t.type.rank == 0: + # Handle this case separately, _cute_ir.get_leaves will return an Op in this case + vals = [] + else: + vals = 
_cute_ir.get_leaves(t, loc=loc, ip=ip) + if not isinstance(vals, list): + vals = [vals] + else: + raise TypeError(f"expects static type or value, but got {t}") + + # CuTe IR only supports Int32 for now. Need to support detection of other types + res = _cute_ir.unpack_x_tuple(input_ty, vals) + + def post_process(x): + if isinstance(x, _cute_ir.ScaledBasis): + return ScaledBasis(post_process(x.get_value()), x.get_mode()) + elif isinstance(x, _cute_ir.Ratio): + return Ratio(x.numerator, x.denominator) + else: + return x + + return transform_leaf(post_process, res) + + +#################################################################################################### +# +# Core types +# +#################################################################################################### + + +class IntValue(cutlass_arith.ArithValue): + """Internal representation of constrained integer types with divisibility information. + + IntValue serves as a proxy for constrained integer types in the CuTe IR. Rather than + directly storing values of IntTupleType with depth=0, it stores the result of the + `cute.get_scalars` operation applied to such values. + + This class represents the following sequence of operations in the IR: + %0 = ... : (...) -> !cute.int_tuple<"?"> + %1 = cute.get_scalars(%0) : (!cute.int_tuple<"?">) -> i32 + + where the first operation produces a `cute.int_tuple<"?">` with depth=0 and rank=1. It + automatically emit `cute.get_scalars` and track it. 
+ + IntValue inherits behavior from ArithValue with the following extensions: + * Overloaded operations that accept IntTupleType values to propagate divisibility information + * Support for CuTe operations that utilize divisibility constraints + + API for interacting with IntValue: + * get_typed_value() - Returns the value as an IntTupleType + * get_divisibility() - Returns the divisibility constraint of the value + """ + + def __init__(self, v, signed=True): + # Cute Constrained Int Type is always signed + if isinstance(v, int): + v = _pack_int_tuple(v) + + if isinstance(v.type, _cute_ir.IntTupleType): + scalar_val = _cute_ir.get_scalars(v) + super().__init__(scalar_val, True) + else: + super().__init__(v, True) + + def get_typed_value(self): + if isinstance(self.type, ir.IntegerType): + def_op = self.owner.operation + if def_op.name == "cute.get_scalars": + return def_op.operands[0] + + assert not isinstance(self.type, _cute_ir.IntTupleType) + + return _pack_int_tuple(self) + + @property + def divisibility(self): + if isinstance(self.get_typed_value().type, _cute_ir.IntTupleType): + return self.get_typed_value().type.get_divisibility([0]) + else: + return 1 + + def __str__(self): + if self.divisibility == 1: + return f"?" 
+ else: + return f"?{{div={self.divisibility}}}" + + def __repr__(self): + parent_name = cutlass_arith.ArithValue.__name__ + return super().__str__().replace(parent_name, IntValue.__name__) + + def pretty_str(self): + return self.__str__() + + @staticmethod + def _binary_op(op): + def wrapper(self, other, **kwargs): + if isinstance(other, IntValue): + other_val = other.get_typed_value() + elif isinstance(other, ir.Value) and isinstance( + other.type, _cute_ir.IntTupleType + ): + other_val = other + elif isinstance(other, ir.Value) and isinstance(other.type, ir.IntegerType): + other = cutlass_arith.int_to_int(other, Int32, **kwargs) + other_val = _pack_int_tuple(other) + elif isinstance(other, (int, bool)): + other_val = _pack_int_tuple(int(other)) + else: + # Dispatch to `__rmul__` of `other` + return NotImplemented + + return IntValue(op(self, other_val, **kwargs)) + + return wrapper + + @dsl_user_op + @_binary_op + def __add__(self, other, *, loc=None, ip=None): + return _cute_ir.add_offset(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __sub__(self, other, *, loc=None, ip=None): + return _cute_ir.tuple_sub(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __mul__(self, other, *, loc=None, ip=None): + return _cute_ir.tuple_mul(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __floordiv__(self, other, *, loc=None, ip=None) -> "IntValue": + return _cute_ir.tuple_div(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __mod__(self, other, *, loc=None, ip=None) -> cutlass_arith.ArithValue: + return _cute_ir.tuple_mod(self.get_typed_value(), other, loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __radd__(self, other, *, loc=None, ip=None) -> "IntValue": + return _cute_ir.add_offset(other, self.get_typed_value(), loc=loc, ip=ip) + + @dsl_user_op + @_binary_op + def __rsub__(self, other, *, loc=None, ip=None) -> "IntValue": + 
        return _cute_ir.tuple_sub(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rmul__(self, other, *, loc=None, ip=None):
        return _cute_ir.tuple_mul(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rfloordiv__(self, other, *, loc=None, ip=None) -> "IntValue":
        return _cute_ir.tuple_div(other, self.get_typed_value(), loc=loc, ip=ip)

    @dsl_user_op
    @_binary_op
    def __rmod__(self, other, *, loc=None, ip=None) -> "IntValue":
        return _cute_ir.tuple_mod(other, self.get_typed_value(), loc=loc, ip=ip)


class Ratio(_Ratio):
    """A class representing a rational number as a ratio of two integers.

    Ratio is used in CuTe to represent exact fractional values that arise in
    tensor layout operations, particularly in composition operations where
    divisibility conditions may not be satisfied.

    :param numerator: The numerator of the ratio
    :type numerator: int
    :param denominator: The denominator of the ratio
    :type denominator: int
    :raises TypeError: If numerator or denominator are not integers
    """

    def __init__(self, numerator: int, denominator: int):
        if not isinstance(numerator, int) or not isinstance(denominator, int):
            raise TypeError(
                f"numerator and denominator must be integers, but got {numerator} and {denominator}"
            )
        super().__init__(numerator, denominator)

    def is_integral(self) -> bool:
        """Check if the ratio represents an integer value.

        :return: True if the numerator is divisible by the denominator
        :rtype: bool
        """
        return super().is_integral()

    def reduced(self) -> "Ratio":
        """Return a new Ratio with the numerator and denominator reduced to lowest terms.

        :return: A new Ratio in reduced form
        :rtype: Ratio
        """
        # Re-wrap the base-class result so the reduced value is this subclass.
        res = super().reduced()
        return Ratio(res.numerator, res.denominator)

    def __mul__(self, other):
        """Multiply this ratio by another ratio or an integer.

        :param other: The value to multiply by
        :type other: Union[Ratio, int]
        :return: A new ratio representing the product
        :rtype: Ratio
        :raises TypeError: If other is not a Ratio or int
        """
        if isinstance(other, Ratio):
            return Ratio(
                self.numerator * other.numerator,
                self.denominator * other.denominator,
            )
        elif isinstance(other, int):
            return Ratio(self.numerator * other, self.denominator)
        else:
            raise TypeError(f"Cannot multiply Ratio with {type(other)}")

    def __rmul__(self, other):
        """Right multiplication operation.

        :param other: The value to multiply by
        :type other: Union[Ratio, int]
        :return: A new ratio representing the product
        :rtype: Ratio
        """
        # Multiplication is commutative here, so delegate to __mul__.
        return self.__mul__(other)

    def __str__(self):
        """String representation of the ratio.

        :return: String in the format "numerator/denominator"
        :rtype: str
        """
        return super().__str__()

    def to(self, dtype):
        """Convert the ratio to another type.

        :param dtype: The target type for conversion
        :type dtype: type
        :return: The ratio converted to the specified type
        :raises TypeError: If conversion to the specified type is not supported
        """
        if dtype is Ratio:
            return self
        elif dtype is float:
            # True division: may be inexact for non-integral ratios.
            return self.numerator / self.denominator
        elif dtype is int:
            # Floor division: truncates toward negative infinity.
            return self.numerator // self.denominator
        elif issubclass(dtype, _Ratio):
            return self
        else:
            raise TypeError(f"Cannot convert Ratio to {dtype}")


class ScaledBasis:
    """A class representing a scaled basis element in CuTe's layout algebra.

    ScaledBasis is used to represent elements in the layout algebra, particularly
    in the context of composition operations. It consists of a value (scale) and
    a mode that identifies the mode of the basis element.
    :param value: The scale value
    :type value: Union[int, Integer, Ratio, ir.Value]
    :param mode: The mode identifying the basis element
    :type mode: Union[int, List[int]]
    :raises TypeError: If mode is not an integer or list of integers

    **Examples**::

        # Create a scaled basis with integer scale and mode
        sb1 = ScaledBasis(2, 0)  # 2 * E(0)

        # Create a scaled basis with a Ratio scale
        sb2 = ScaledBasis(Ratio(1, 2), 1)  # (1/2) * E(1)

        # Create a scaled basis with a list of modes
        sb3 = ScaledBasis(4, [0, 1])  # 4 * E([0, 1])

        # Scaled basis elements are commonly used in layout strides
        layout = make_layout((4, 8), stride=(ScaledBasis(1, 0), ScaledBasis(1, 1)))

        # This creates a layout with strides (1@0, 1@1) representing
        # a coordinate system where each dimension has its own basis
    """

    def __init__(self, value, mode) -> None:
        # Normalize a scalar mode to the canonical list-of-ints form.
        if isinstance(mode, int):
            self._mode = [mode]
        else:
            if any(not isinstance(x, int) for x in mode):
                raise TypeError("Mode must be a list of integers")
            self._mode = mode

        self._value = value

    def is_static(self) -> bool:
        """Check if the value is statically known.

        :return: True if the value is not a dynamic expression
        :rtype: bool
        """
        return not is_dynamic_expression(self._value)

    def to(self, dtype):
        """Convert to another type.

        :param dtype: The target type for conversion
        :type dtype: type
        :return: The ScaledBasis converted to the specified type
        :raises TypeError: If conversion to the specified type is not supported
        """
        if dtype is ScaledBasis:
            return self
        elif dtype is _ScaledBasis:
            # Lower the scale for the underlying _ScaledBasis: Integer wrappers
            # become their IR value; Ratio and raw values pass through.
            if isinstance(self._value, Ratio):
                scale = self._value
            elif isinstance(self._value, Integer):
                scale = self._value.ir_value()
            else:
                scale = self._value

            # IntValue carries divisibility info; unwrap it to its typed value.
            if isinstance(scale, IntValue):
                return _ScaledBasis(scale.get_typed_value(), self._mode)
            else:
                return _ScaledBasis(scale, self._mode)
        else:
            raise TypeError(f"Cannot convert ScaledBasis to {dtype}")

    def __str__(self):
        return f"{self.to(_ScaledBasis).__str__()}"

    def __hash__(self):
        # Lists are unhashable, so a list-valued mode is hashed as a tuple.
        if isinstance(self.mode, list):
            return hash((self.value, tuple(self.mode)))
        else:
            return hash((self.value, self.mode))

    @property
    def value(self):
        """Get the scale value.

        :return: The scale value
        """
        return self._value

    @property
    def mode(self) -> List[int]:
        """Get the mode identifying the basis element.

        :return: The mode as a list of integers
        :rtype: List[int]
        """
        return self._mode

    def __eq__(self, other):
        # Equal iff both the scale value and the mode match; any non-ScaledBasis
        # compares unequal.
        if isinstance(other, ScaledBasis):
            return self.value == other.value and self.mode == other.mode
        else:
            return False

    def __rmul__(self, scale: Union[Int, ir.Value, Ratio]) -> "ScaledBasis":
        """Right multiplication by a scale factor.

        This operation is used in layout algebra to scale basis elements,
        which is essential for operations like composition and partitioning.
        :param scale: The scale factor
        :type scale: Union[Int, ir.Value, Ratio]
        :return: A new scaled basis element
        :rtype: ScaledBasis
        :raises TypeError: If scale is not of a supported type
        :raises NotImplementedError: If scaling a basis element with a ratio value
        """
        if not isinstance(scale, (int, Integer, Ratio, ir.Value)):
            raise TypeError(
                f"scale must be an integer or a ratio, but got {type(scale)}"
            )
        if isinstance(self.value, Ratio):
            raise NotImplementedError(
                "scaling a basis element having a ratio is not supported"
            )

        value = self.value

        if not isinstance(value, (Integer, Ratio, int, cutlass_arith.ArithValue)):
            raise TypeError(f"Don't support {type(value)} for ScaledBasis")

        # Lift to IntValue type to preserve type info as much as possible
        if isinstance(scale, cutlass_arith.ArithValue):
            scale = IntValue(_pack_int_tuple(cutlass_arith.int_to_int(scale, Int32)))

        if isinstance(value, cutlass_arith.ArithValue):
            value = IntValue(_pack_int_tuple(cutlass_arith.int_to_int(value, Int32)))
        elif isinstance(value, Integer):
            value = value.ir_value()

        return ScaledBasis(scale * value, self.mode)  # type: ignore


def E(mode: Union[int, List[int]]) -> ScaledBasis:
    """Create a unit ScaledBasis element with the specified mode.

    This function creates a ScaledBasis with value 1 and the given mode.
    The mode represents the coordinate axis or dimension in the layout.

    :param mode: The mode (dimension) for the basis element, either a single integer or a list of integers
    :type mode: Union[int, List[int]]
    :return: A ScaledBasis with value 1 and the specified mode
    :rtype: ScaledBasis
    :raises TypeError: If mode is not an integer or a list

    **Examples**::

        # Create a basis element for the first dimension (mode 0)
        e0 = E(0)

        # Create a basis element for the second dimension (mode 1)
        e1 = E(1)

        # Create a basis element for a hierarchical dimension
        e_hier = E([0, 1])
    """
    if isinstance(mode, int):
        mode = [mode]

    if not isinstance(mode, list):
        raise TypeError(f"expects a list, got {type(mode)}")

    # NOTE: an empty mode denotes the identity basis element and is returned as
    # the plain integer 1 rather than a ScaledBasis.
    if not mode:
        return 1

    return ScaledBasis(1, mode)


def get_divisibility(x: Union[int, Integer]) -> int:
    """Return the known divisibility constraint of ``x``.

    :param x: A Python int, an Integer wrapper, or a value carrying one
    :return: ``x`` itself for a static int, the tracked divisibility for an
        IntValue, and 1 (no constraint known) otherwise
    :rtype: int
    """
    if isinstance(x, int):
        return x

    if isinstance(x, Integer):
        x = x.value

    if isinstance(x, IntValue):
        return x.divisibility
    else:
        return 1


@ir.register_value_caster(_cute_ir.SwizzleType.get_static_typeid(), replace=True)
class Swizzle(ir.Value):
    """
    Swizzle is a transformation that permutes the elements of a layout.

    Swizzles are used to rearrange data elements to improve memory access patterns
    and computational efficiency.

    Swizzle is defined by three parameters:
    - MBase: The number of least-significant bits to keep constant
    - BBits: The number of bits in the mask
    - SShift: The distance to shift the mask

    The mask is applied to the least-significant bits of the layout.

    .. code-block::

        0bxxxxxxxxxxxxxxxYYYxxxxxxxZZZxxxx
                                      ^--^  MBase is the number of least-sig bits to keep constant
                       ^-^       ^-^        BBits is the number of bits in the mask
                       ^---------^          SShift is the distance to shift the YYY mask
                                            (pos shifts YYY to the right, neg shifts YYY to the left)

    e.g.
    Given
        0bxxxxxxxxxxxxxxxxYYxxxxxxxxxZZxxx
    the result is
        0bxxxxxxxxxxxxxxxxYYxxxxxxxxxAAxxx where AA = ZZ xor YY

    """

    def __str__(self):
        # Cut off the MLIR type's string for making pretty_str more concise
        return self.type.__str__()[15 : 15 + 8]


@ir.register_value_caster(_cute_ir.LayoutType.get_static_typeid(), replace=True)
class _Layout(Layout):
    """Layout is CuTe's core abstraction for representing tensor layouts.

    A Layout maps from a logical coordinate space to an index space, defined by a
    pair of (Shape, Stride). The Shape defines the abstract dimensions of the Layout,
    while the Stride defines how coordinates within the Shape map to linear indices.

    Layouts present a common interface to multidimensional array access that abstracts
    away the details of how array elements are organized in memory. This allows algorithms
    to be written generically, so that layouts can change without requiring code changes.

    CuTe layouts are inherently hierarchical, constructed from smaller, nested layouts
    that can represent complex mappings required by GPU tensor instructions. They support
    a rich algebra of operations including concatenation, coalescence, composition,
    complement, and inversion.

    :ivar shape: An IntTuple representing the dimensions of the layout.
    :ivar stride: An IntTuple representing the strides of the layout.
    :ivar max_alignment: The maximum alignment of the layout.

    **Examples**::

    .. code-block:: python

        # Creating a layout with shape (4,8) and default stride (layout left / "column major")
        layout = cute.make_layout((4, 8))

        # Creating a layout with explicit shape and stride
        layout = cute.make_layout((4, 8), stride=(8, 1))

        # Accessing a specific coordinate: (2, 3) -> 2 * 8 + 3 * 1 = 19
        idx = cute.crd2idx((2, 3), layout)
    """

    def __init__(self, op_result) -> None:
        """Initialize a Layout object.

        :param op_result: The operation result value to wrap.
        """
        super().__init__(op_result)

    def __str__(self) -> str:
        """Return a string representation of the layout.

        :return: A string in the format "shape:stride".
        """
        return f"{pretty_str(self.shape)}:{pretty_str(self.stride)}"

    @property
    def shape(self, *, loc=None, ip=None) -> Shape:
        """Get the shape of the layout.

        The shape defines the dimensions and structure of the layout's
        coordinate space.

        :param loc: Optional location information for debugging.
        :param ip: Optional insertion point for IR generation.
        :return: The hierarchical shape of the layout.
        """
        return _unpack_x_tuple(_cute_ir.get_shape(self, loc=loc, ip=ip), loc=loc, ip=ip)

    @property
    def stride(self, *, loc=None, ip=None) -> Stride:
        """Get the stride of the layout.

        The stride defines how coordinates map to linear indices in memory.

        :param loc: Optional location information for debugging.
        :param ip: Optional insertion point for IR generation.
        :return: The hierarchical stride of the layout.
        """
        return _unpack_x_tuple(
            _cute_ir.get_stride(self, loc=loc, ip=ip), loc=loc, ip=ip
        )

    @property
    def max_alignment(self) -> int:
        """Get the maximum alignment of the layout.

        :return: The maximum alignment in bytes.
        """
        return self.type.max_alignment

    def __eq__(self, other) -> Union[bool, Boolean]:
        """Check if this layout is equal to another layout.

        Two layouts are equal if they have the same shape and stride.

        :param other: The layout to compare with.
        :return: True if layouts are equal, False otherwise.
            May return an IR value for dynamic layouts.
        """
        if isinstance(other, Layout):
            # Fully static layouts compare at the type level; otherwise emit a
            # runtime equality check.
            if is_static(self.type) and is_static(other.type):
                return self.type == other.type
            return Boolean(_cute_ir.equal(self, other))
        else:
            return False

    def __req__(self, other) -> Union[bool, Boolean]:
        """Reflected equality check.

        :param other: The layout to compare with.
        :return: Result of other.__eq__(self).
        """
        if isinstance(other, Layout):
            return other.__eq__(self)
        return False

    def __ne__(self, other) -> Union[bool, Boolean]:
        """Check if this layout is not equal to another layout.

        :param other: The layout to compare with.
        :return: True if layouts are not equal, False otherwise.
        """
        if isinstance(other, Layout):
            if is_static(self.type) and is_static(other.type):
                return self.type != other.type
            return Boolean(not_(_cute_ir.equal(self, other)))
        else:
            return True

    def __rne__(self, other) -> Union[bool, Boolean]:
        """Reflected inequality check.

        :param other: The layout to compare with.
        :return: Result of other.__ne__(self).
        """
        if isinstance(other, Layout):
            return other.__ne__(self)
        return False

    def __getitem__(self, idx: int) -> Layout:
        """
        Top-level `get` to provide a syntax similar to `tuple`.
        """
        return get(self, mode=[idx])

    @dsl_user_op
    def __call__(self, coord: Coord, loc=None, ip=None) -> IntTuple:
        # Evaluating a layout at a coordinate maps the coordinate to its index.
        return crd2idx(coord, self, loc=loc, ip=ip)

    @dsl_user_op
    def get_hier_coord(self, idx, *, loc=None, ip=None) -> Coord:
        """Get the hierarchical coordinate corresponding to a linear index.

        This method maps from a linear index back to the logical coordinate
        in the layout's coordinate space.

        :param idx: The linear index to convert.
        :return: The hierarchical coordinate corresponding to the index.

        **Examples**::

            layout = make_layout((4, 8), stride=(8, 1))

            # map linear index back to coordinate: 5 -> (1, 1)
            coord = get_hier_coord(5, layout)
        """
        idx_val = Int32(idx).ir_value()
        crd = _cute_ir.get_hier_coord(idx_val, self, loc=loc, ip=ip)
        return _unpack_x_tuple(crd)

    @dsl_user_op
    def get_flat_coord(self, idx, *, loc=None, ip=None) -> Coord:
        """Get the flat (non-hierarchical) coordinate for a linear index.

        :param idx: The linear index to convert.
        :return: The flat coordinate corresponding to the index.
        """
        idx_val = Int32(idx).ir_value()
        res = _cute_ir.get_flat_coord(idx_val, self, loc=loc, ip=ip)
        return _unpack_x_tuple(res, loc=loc, ip=ip)


@ir.register_value_caster(_cute_ir.ComposedLayoutType.get_static_typeid(), replace=True)
class ComposedLayout(ir.Value):
    """ComposedLayout represents the functional composition of layouts in CuTe.

    A ComposedLayout is formed by the composition of three components:
    inner o offset o outer, where:

    - inner: The inner layout or swizzle that is applied last
    - offset: An integer tuple representing a coordinate offset
    - outer: The outer layout that is applied first

    ComposedLayout implements the functional composition operation where:
    R(c) := (inner o offset o outer)(c) := inner(offset + outer(c))

    This composition allows for complex transformations of coordinates and indices,
    enabling operations like tiling, partitioning, and reshaping of data.
+ + :ivar inner: The inner layout or swizzle component + :ivar offset: The coordinate offset applied between inner and outer layouts + :ivar outer: The outer layout component + :ivar max_alignment: The maximum alignment of the composed layout + + **Examples**:: + + # Create a composed layout with inner layout, offset, and outer layout + + # inner layout: (4, 8):(1, 4) + inner_layout = make_layout((4, 8)) + + offset = (0, 0) + + # outer layout: (2, 2):(1@0, 1@1) + outer_layout = make_layout((2, 2), stride=(1 * E(0), 1 * E(1))) + + # composed layout: (inner o offset o outer) + composed = make_composed_layout(inner_layout, offset, outer_layout) + + # Accessing components of the composed layout + inner = composed.inner + offset = composed.offset + outer = composed.outer + + # map coordinate (1, 2) to linear index + # - outer(1, 2) = (1, 2) + # - offset + outer(1, 2) = (1, 2) + # - inner(1, 2) = 1 * 1 + 2 * 4 = 9 + idx = crd2idx((1, 2), composed) + + # Composition is used in many tiling operations + # For example, in logical_product, raked_product, and blocked_product + """ + + def __init__(self, value) -> None: + """Initialize a ComposedLayout object. + + :param value: The operation result value to wrap. 
+ """ + super().__init__(value) + + def __str__(self) -> str: + return f"{pretty_str(self.inner)} o {pretty_str(self.offset)} o {pretty_str(self.outer)}" + + @property + def inner(self, *, loc=None, ip=None) -> Union[Swizzle, Layout]: + return _cute_ir.composed_get_inner(self, loc=loc, ip=ip) + + @property + def offset(self, *, loc=None, ip=None) -> IntTuple: + return _unpack_x_tuple(_cute_ir.composed_get_offset(self, loc=loc, ip=ip)) + + @property + def outer(self, *, loc=None, ip=None) -> Layout: + return _cute_ir.composed_get_outer(self, loc=loc, ip=ip) + + @property + def shape(self, *, loc=None, ip=None) -> Shape: + return _unpack_x_tuple(_cute_ir.get_shape(self, loc=loc, ip=ip), loc=loc, ip=ip) + + @property + def max_alignment(self) -> int: + return self.type.max_alignment + + def __eq__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + if is_static(self.type) and is_static(other.type): + return self.type == other.type + else: + raise NotImplementedError( + f"runtime comparison of composed layouts is not supported, got `{self}` and `{other}`" + ) + else: + return False + + def __req__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + return Boolean(other.__eq__(self)) + return False + + def __ne__(self, other) -> Union[bool, Boolean]: + return not self.__eq__(other) + + def __rne__(self, other) -> Union[bool, Boolean]: + if isinstance(other, ComposedLayout): + return other.__ne__(self) + return False + + def __getitem__(self, idx: int) -> "ComposedLayout": + """ + Top-level `get` to provide a syntax similar to `tuple`. + """ + return get(self, mode=[idx]) + + @dsl_user_op + def __call__(self, coord: Coord, loc=None, ip=None) -> IntTuple: + return crd2idx(coord, self, loc=loc, ip=ip) + + +@ir.register_value_caster(_cute_ir.PtrType.get_static_typeid(), replace=True) +class _Pointer(Pointer): + """ + A pointer class representing a memory address with specific properties. 
+ + Pointers are a fundamental type of iterator/engine that support random-access operations. + They can be offset by elements of a layout's codomain and dereferenced to produce values. + + :param value: The MLIR operation result value to initialize the pointer with + :type value: ir.Value + + :ivar type: The MLIR type of the pointer + :vartype type: Type + :ivar value_type: The type of value this pointer points to + :vartype value_type: Type + :ivar memspace: The memory space where the pointer data resides (e.g., gmem, smem, rmem) + :vartype memspace: AddressSpace + + :note: When composed with a layout, a pointer forms a tensor: T = E ∘ L, where E is the pointer + and L is the layout. The tensor evaluates the layout by mapping a coordinate c to the + codomain, offsets the pointer accordingly, and dereferences the result: + T(c) = (E ∘ L)(c) = *(E + L(c)) + """ + + def __init__(self, value) -> None: + assert isinstance(value, ir.Value) + self.value = ir.Value(value) + + def __str__(self) -> str: + # Cut off the MLIR type's string for making pretty_str more concise + return self.type.__str__()[6:] + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + # Only expecting single value of _Pointer instance or ir.Value + # In this context, a _Pointer instance is an encapsulated ir.Value which is automatically created + # by value caster for cute.ptr typed values + assert len(values) == 1, f"Expected 1 value, but got {len(values)}" + assert isinstance( + values[0], (_Pointer, ir.Value) + ), f"Expected _Pointer or ir.Value, but got {type(values[0])}" + return _Pointer( + values[0] if isinstance(values[0], ir.Value) else values[0].value + ) + + @property + @lru_cache_ir() + def value_type(self) -> Type[Numeric]: + return Numeric.from_mlir_type(self.value.type.value_type) + + @property + def alignment(self) -> int: + return self.type.alignment + + @property + def max_alignment(self) -> int: + return 
self.type.max_alignment + + @property + @lru_cache_ir() + def memspace(self) -> AddressSpace: + return self.type.address_space + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + # Only use if you absolutely need to get the LLVM pointer Value + @property + @lru_cache_ir() + def llvm_ptr(self, *, loc=None, ip=None) -> ir.Value: + """ + Get the LLVM pointer representation of this pointer. + + :param loc: The source location for the operation, defaults to None + :type loc: Location, optional + :param ip: The insertion point for the operation, defaults to None + :type ip: InsertionPoint, optional + :return: The LLVM pointer representation + :rtype: ir.Value + """ + llvm_ptr_ty = llvm.PointerType.get(self.type.address_space) + return builtin.unrealized_conversion_cast( + [llvm_ptr_ty], [self.value], loc=loc, ip=ip + ) + + def __add__(self, offset: IntTuple) -> Pointer: + """ + Offset the pointer by elements of a layout's codomain. + + :param offset: The offset to add to the pointer + :type offset: IntTuple + :return: A new pointer offset by the specified amount + :rtype: ir.Value + """ + offset = _pack_int_tuple(offset) + return _cute_ir.add_offset(self.value, offset=offset) + + @dsl_user_op + def toint(self, *, loc=None, ip=None): + if self.type.address_space in ( + _cute_ir.AddressSpace.gmem, + _cute_ir.AddressSpace.generic, + ): + res_type = Int64 + else: + res_type = Int32 + + return res_type( + _cute_ir.ptrtoint(res_type.mlir_type, self.value, loc=loc, ip=ip) + ) + + @dsl_user_op + def align(self, min_align: int, *, loc=None, ip=None) -> Pointer: + """ + Align a pointer to a specified byte alignment. + + :param min_align: The minimum byte alignment requirement. Must be a power of 2. 
+ :type min_align: int + :param loc: The source location for the operation, defaults to None + :type loc: Location, optional + :param ip: The insertion point for the operation, defaults to None + :type ip: InsertionPoint, optional + :return: The aligned new pointer that satisfies alignment request. + :rtype: Pointer + :raises ValueError: If the alignment is not a power of 2. + :raises TypeError: If pointer is in tmem address space. + """ + + if (min_align & (min_align - 1)) != 0: + raise ValueError("Alignment must be a power of 2") + + assert isinstance(self.type, _cute_ir.PtrType) + if self.type.address_space is AddressSpace.tmem: + raise ValueError("aligning a TMEM pointer is not supported") + + if min_align <= self.alignment: + return self + else: + # Convert pointer to integer + address_int = self.toint(loc=loc, ip=ip) + # Align the address + aligned_address = (address_int + min_align - 1) & ~(min_align - 1) + # Create and return the aligned pointer + return make_ptr( + Numeric.from_mlir_type(self.type.value_type), + aligned_address, + self.type.address_space, + assumed_align=min_align, + loc=loc, + ip=ip, + ) + + +@ir.register_value_caster(_cute_ir.MemRefType.get_static_typeid(), replace=True) +@ir.register_value_caster(_cute_ir.CountingTensorType.get_static_typeid(), replace=True) +@ir.register_value_caster( + _cute_nvgpu_ir.SmemDescViewType.get_static_typeid(), replace=True +) +class _Tensor(Tensor): + """A tensor class representing the composition of an iterator (engine) with a layout. + + A tensor evaluates the layout by mapping a coordinate to the codomain, offsets the + iterator accordingly, and dereferences the result to obtain the tensor's value. + Formally: T(c) = (E ∘ L)(c) = *(E + L(c)), where E is the iterator/engine and L is the layout. + + :param value: The MLIR operation result value to initialize the tensor with + :type value: ir.Value + :param dtype: The user specified data type of the tensor elements. 
It could be \ + different from the underlying dtype in the iterator. The default is None. + :type dtype: Type[Numeric], optional + + Attributes: + iterator: The pointer or iterator (engine) component of the tensor + layout: The layout component defining the mapping from coordinates to offsets + shape: The shape of the tensor, inherited from the layout + stride: The stride of the tensor, inherited from the layout + element_type: The data type of the tensor elements + memspace: The memory space where the tensor data resides + + Notes: + - The tensor supports both direct element access via coordinates and slicing operations + - Load/store operations are only supported for specific memory spaces (rmem, smem, gmem, generic) + - For composed layouts, stride information is not directly accessible + - Dynamic layouts do not support vector load/store operations + + Examples: + + .. code-block:: python + + # Create a tensor with shape (4,8) in row-major layout + tensor = make_tensor(ptr, make_layout(shape=(4,8), stride=(8,1))) + + # Access individual element + val = tensor[0, 0] # or val = tensor[(0, 0)] + + # Slice operation - get first column + subtensor = tensor[None, 0] # or subtensor = tensor[(None, 0)] + """ + + def __init__(self, value, dtype: Optional[Type[Numeric]] = None): + self._dtype = dtype + if isinstance(value, ir.Value): + self.value = value + else: + raise TypeError(f"Expected ir.Value, got {type(value)}") + + def __str__(self): + return f"tensor<{pretty_str(self.iterator)} o {pretty_str(self.layout)}>" + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + # Only expecting single value of _Tensor or ir.Value + # In this context, a _Tensor instance is an encapsulated ir.Value which is automatically created + # by value caster for MemRef/CountingTensor/SmemDescView typed values + assert len(values) == 1, f"Expected 1 value, but got {len(values)}" + assert isinstance( + values[0], (_Tensor, ir.Value) + 
), f"Expected _Tensor or ir.Value, but got {type(values[0])}" + return _Tensor( + values[0] if isinstance(values[0], ir.Value) else values[0].value, + self._dtype, + ) + + # Cheat to let `Type(_Tensor())` to return cute.Tensor + @property + def __class__(self) -> Type[Tensor]: + return Tensor + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + @dsl_user_op + def __getitem__( + self, crd: Coord, *, loc=None, ip=None + ) -> Union[Tensor, Numeric, IntTuple]: + """Access or slice tensor elements using coordinates. + + This method implements + * tensor evaluation T(c) = *(E + L(c)) when `c` is a coordinate without slicing, or + * tensor slicing operations T(c) = make_tensor(E + L(c), slice(L, c)) + where E is the iterator/engine and L is the layout + + :param crd: Coordinate or slice specification for accessing tensor elements + :type crd: Coord + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Tensor element value or sliced subtensor + :rtype: Union[Tensor, ir.Value, IntTuple] + + :raises ValueError: If coordinate access is invalid for the tensor layout + + Examples: + + .. 
code-block:: python + + # Create a tensor with pointer iterator + ptr = make_ptr(cutlass.Float32, 0, cutlass.AddressSpace.gmem) + layout = make_layout((64, 128)) # leftmost mode is major + tensor = make_tensor(ptr, layout) # Tensor using pointer iterator + + # Direct element access loads from memory + val = tensor[0] # Loads element at offset 0 + val = tensor[1] # Loads element at offset 4 (4bytes per Float32) + val = tensor[(0, 1)] # Loads element at offset 64 + + # Create a counting tensor + layout = make_layout((64, 128), stride=(1 * E(0), 1 * E(1))) + tensor = make_tensor((128, 128), layout) + + # Direct element access + val = tensor[0] # Returns (128, 128) + val = tensor[(0, 1)] # Returns (128, 129) + + # Slice access + sliced = view[(3, None)] # Returns tensor slice + + .. note:: + Sub-byte types like Float4E2M1FN and Float6E3M2FN are not supported for scalar + dereference operations. Attempting to set individual elements of tensors with + these element types will result in errors. + + Examples: + + .. 
code-block:: python + + # Unsupported operations with sub-byte types: + ptr = make_ptr(cutlass.Float4E2M1FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + # The following will raise an error: + val = tensor[0] # Error: sub-byte scalar dereference not supported + + # Similarly for other sub-byte types: + ptr = make_ptr(cutlass.Float6E3M2FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + val = tensor[0] # Error: sub-byte scalar dereference not supported + """ + if has_underscore(crd): + return slice_(self.value, crd) + elif isinstance(self.type, _cute_ir.CountingTensorType): + res = _cute_ir.get_iter(slice_(self, crd).value, loc=loc, ip=ip) + return _unpack_x_tuple(res) + else: + self._check_can_load_store() + self._check_can_dereference() + + crd_val = _pack_coord(crd, loc=loc, ip=ip) + data_val = _cute_ir.memref_load(self.value, crd_val, loc=loc, ip=ip) + return self.element_type(data_val) + + def _cvt_to_dest(self, data: Union["TensorSSA", Numeric], *, loc=None, ip=None): + if data.dtype is self.element_type: + return data.ir_value(loc=loc, ip=ip) + + orig_dtype = data.dtype + # Implicit upcast to wider type + if ( + data.dtype.is_same_kind(self.element_type) + and self.element_type.width >= data.dtype.width + ): + data = data.to(self.element_type, loc=loc, ip=ip) # type: ignore + + if data.dtype.width != self.element_type.width: + raise ValueError( + f"Type mismatch, store {orig_dtype} (-> {data.dtype}) " + f"to Tensor with element type {self.element_type}" + ) + + val = data.ir_value(loc=loc, ip=ip) + if isinstance(data.dtype, (Int8, Boolean)) and (self.element_type is Boolean): + zero = Int8(0).ir_value(loc=loc, ip=ip) + val = arith.cmpi(arith.CmpIPredicate.ne, val, zero, loc=loc, ip=ip) + + return val + + @dsl_user_op + def __setitem__( + self, + crd: Coord, + data: Union[int, float, ir.Value, Numeric, "TensorSSA"], + *, + loc=None, + ip=None, + ) -> None: + """Set tensor elements at specified coordinates. 
+ + Assigns values to tensor elements through direct coordinate access or slice assignment. + For slice assignment, the value must be a TensorSSA with matching shape. + + :param crd: Coordinate or slice specification for tensor element assignment + :type crd: Coord + :param value: Value to assign - can be scalar or TensorSSA for slice assignment + :type value: Union[int, float, ir.Value, TensorSSA] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises ValueError: If tensor type doesn't support load/store operations + :raises ValueError: If slice assignment value is not a TensorSSA + :raises ValueError: If value type doesn't match tensor element type + :raises NotImplementedError: If value type is not supported + + .. note:: + Sub-byte types like Float4E2M1FN and Float6E3M2FN are not supported for scalar + dereference operations. Attempting to set individual elements of tensors with + these element types will result in errors. + + Examples: + + .. 
code-block:: python + + # Unsupported operations with sub-byte types: + ptr = make_ptr(cutlass.Float4E2M1FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + # The following will raise an error: + tensor[0] = 1.0 # Error: sub-byte scalar dereference not supported + + # Similarly for other sub-byte types: + ptr = make_ptr(cutlass.Float6E3M2FN, 0, cutlass.AddressSpace.gmem) + tensor = make_tensor(ptr, layout) + tensor[0] = 0.5 # Error: sub-byte scalar dereference not supported + """ + self._check_can_load_store() + + # convert scalar type + if not has_underscore(crd): + self._check_can_dereference() + # First, convert ir.Value to Numeric + if isinstance(data, ir.Value): + data = as_numeric(data) + elif isinstance(data, (int, float, bool)): + data = as_numeric(data) + + if not isinstance(data, Numeric): + raise ValueError(f"unsupported data type: {type(data)}") + + # Implicit upcast to wider type + val = self._cvt_to_dest(data, loc=loc, ip=ip) + if val.type != self.element_type.mlir_type: + raise ValueError( + f"type mismatch, store {val.type} to {self.element_type}" + ) + + crd_val = _pack_coord(crd, loc=loc, ip=ip) + _cute_ir.memref_store(self.value, crd_val, val, loc=loc, ip=ip) + else: + if not isinstance(data, TensorSSA): + raise ValueError(f"expects TensorSSA, but got {data}") + + self.__getitem__(crd).store(data, loc=loc, ip=ip) # type: ignore + + @property + def __class__(self) -> Type[Tensor]: + return Tensor + + # Make it behave as if it inherited from ir.Value + @property + @lru_cache_ir() + def type(self) -> ir.Type: + return self.value.type + + @property + def iterator(self) -> Union[Pointer, IntTuple]: + res = _cute_ir.get_iter(self.value) + if isinstance(res, Pointer): + return res + elif isinstance(res.type, _cute_ir.IntTupleType): + return _unpack_x_tuple(res) + elif isinstance(res, ir.Value): + # Example: SMEM descriptor iterator, not well supported today + return res + else: + raise TypeError(f"unsupported iterator type, got 
{type(res)}") + + @property + def layout(self) -> Layout: + return _cute_ir.get_layout(self.value) + + @property + def shape(self) -> Shape: + return self.layout.shape + + @property + def stride(self) -> Stride: + if isinstance(self.type, _cute_ir.ComposedLayoutType): + raise ValueError(f"can't get stride from composed layout") + return self.layout.stride + + @property + def leading_dim(self) -> Union[int, Tuple[int], None]: + """ + Get the leading dimension of this Tensor. + + Returns: + int: Single leading dimension index if found + Tuple[int, ...]: Tuple of indices for nested leading dimensions + None: If no leading dimension is found + """ + return find(1, self.stride, exclude_when=(1, self.shape)) + + @property + @lru_cache_ir() + def element_type(self) -> Union[Type[Numeric], Type[IntTuple]]: + if is_integer(self.iterator) or isinstance(self.iterator, tuple): + return IntTuple + elif isinstance(self.iterator, Pointer): + return self.iterator.value_type + else: + raise TypeError(f"unsupported iterator type, got {type(self.iterator)}") + + @property + @lru_cache_ir() + def memspace(self) -> AddressSpace: + if isinstance(self.iterator, Pointer): + return self.iterator.memspace + + raise ValueError(f"{self} doesn't have memspace") + + @dsl_user_op + def load(self, *, loc=None, ip=None) -> "TensorSSA": + """Load tensor elements as a vector. + + Loads all elements of the tensor into a vector representation, assuming the tensor + has a static shape and is in a memory space that supports load operations. 
+ + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Vector representation of tensor elements + :rtype: TensorSSA + + :raises ValueError: If tensor has dynamic layout + :raises ValueError: If tensor memory space doesn't support load operations + """ + if not is_static(self.shape): + raise ValueError("dynamic layout doesn't support load") + + self._check_can_load_store() + + res_vect = _cute_ir.memref_load_vec(self.value, row_major=True, loc=loc, ip=ip) + + return TensorSSA(res_vect, self.shape, self.element_type) + + @dsl_user_op + def store(self, data: "TensorSSA", *, loc=None, ip=None): + """Store vector data into tensor. + + Stores vector data into the tensor, assuming matching shapes and a memory space + that supports store operations. + + :param data: Vector data to store into tensor + :type data: TensorSSA + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises ValueError: If tensor has dynamic layout + :raises ValueError: If tensor memory space doesn't support store operations + :raises ValueError: If data shape doesn't match tensor shape + """ + if not isinstance(data, TensorSSA): + raise ValueError(f"Expects TensorSSA, but got {type(data)}") + + if not is_static(self.shape): + raise ValueError("Dynamic layout doesn't support vectorized store") + + self._check_can_load_store() + + n_elems = size(self.shape, loc=loc, ip=ip) + if n_elems != size(data.shape, loc=loc, ip=ip): + raise ValueError( + f"lhs and rhs must have the same shape, but got {self.shape} and {data.shape}" + ) + + elem_mlir_type = cutlass_arith.element_type(data.dtype.mlir_type) + if cutlass_arith.is_narrow_precision(elem_mlir_type): + if elem_mlir_type.width * 
n_elems % 32 != 0: + raise ValueError( + f"narrow precision type must be 32-bit aligned vector, but got {elem_mlir_type} with {n_elems} elements" + ) + + # Implicit upcast to wider type + new_data = self._cvt_to_dest(data, loc=loc, ip=ip) + + return _cute_ir.memref_store_vec( + new_data, self.value, row_major=True, loc=loc, ip=ip + ) + + @dsl_user_op + def fill(self, value: Numeric, *, loc=None, ip=None) -> None: + """Fill tensor with a constant value. + + Fills all elements of the tensor with the specified value, assuming static size + and supported memory space. + + :param value: Value to fill tensor with + :type value: Union[int, float] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + + :raises NotImplementedError: If tensor has dynamic size + + Examples: + + .. code-block:: python + + # Create tensor from numpy array + b = np.random.randn(4, 8).astype(np.float32) + tensor = from_dlpack(b) + + # Fill tensor with constant value + tensor.fill(0.5) # All elements become 0.5 + """ + self._check_can_load_store() + + sz = size(self, loc=loc, ip=ip) + if type(sz) is not int: + raise NotImplementedError(f"dynamic size is not supported: {self.type}") + + # Should we cast to destination type even with narrow cast? 
+ dst_type = self.element_type + value = dst_type(value) + + self[None] = full(self.shape, fill_value=value, dtype=dst_type, loc=loc, ip=ip) + + def _check_can_load_store(self): + if not isinstance( + self.type, _cute_ir.MemRefType + ) or not self.type.address_space in ( + AddressSpace.rmem, + AddressSpace.smem, + AddressSpace.gmem, + AddressSpace.generic, + ): + raise ValueError(f"{self} doesn't support load and store") + + def _check_can_dereference(self): + # Check for sub-byte types and raise error if needed + if self.element_type.width % 8 != 0 and self.element_type is not Boolean: + raise ValueError( + f"Sub-byte scalar dereference not supported for type {self.element_type}" + ) + + +@dsl_user_op +def print_tensor(tensor: Tensor, *, verbose: bool = False, loc=None, ip=None): + """Print content of the tensor in human readable format. + + tensor(raw_ptr<@..., Float32, generic, align(4)> o (8,5):(5,1), data= + [[-0.4326, -0.5434, 0.1238, 0.7132, 0.8042], + [-0.8462, 0.9871, 0.4389, 0.7298, 0.6948], + [ 0.3426, 0.5856, 0.1541, 0.2923, 0.6976], + [-0.1649, 0.8811, 0.1788, 0.1404, 0.2568], + [-0.2944, 0.8593, 0.4171, 0.8998, 0.1766], + [ 0.8814, 0.7919, 0.7390, 0.4566, 0.1576], + [ 0.9159, 0.7577, 0.6918, 0.0754, 0.0591], + [ 0.6551, 0.1626, 0.1189, 0.0292, 0.8655]]) + """ + if not isinstance(tensor.type, _cute_ir.MemRefType): + raise NotImplementedError( + f"printing {tensor} is not supported because it doesn't support trivial dereferencing. " + f"Coordinate Tensor will be supported in the future." 
+ ) + + tensor._check_can_load_store() # type: ignore + + if tensor.element_type.is_integer: + signed = tensor.element_type.signed + else: + signed = False + + _cute_ir.print_view(tensor.value, verbose=verbose, is_signed=signed, loc=loc, ip=ip) + + +#################################################################################################### +# +# Core API +# +#################################################################################################### + + +# +# Utilties +# + + +@lru_cache_ir() +def is_integer(a) -> bool: + """Check if an object is static integer or dynamic integer""" + return ( + isinstance(a, int) + or isinstance(a, Integer) + or (isinstance(a, ir.Value) and isinstance(a.type, ir.IntegerType)) + ) + + +def is_valid_leaf(a) -> bool: + """ + Returns whether `a` has a type that is valid for a CuTe tuple's leaf. + """ + return ( + is_integer(a) + or (a is None) + or isinstance(a, (ScaledBasis, Layout, ComposedLayout)) + ) + + +def is_int_tuple(a) -> bool: + if isinstance(a, tuple): + return all([is_int_tuple(x) for x in a]) + else: + return is_integer(a) + + +def is_static(x: Union[ir.Type, ir.Value, XTuple]) -> bool: + """Check if a value is statically known at compile time. + + In CuTe, static values are those whose values are known at compile time, + as opposed to dynamic values which are only known at runtime. + + :param x: The value to check + :type x: Union[ir.Type, ir.Value, XTuple] + :return: True if the value is static, False otherwise + :rtype: bool + :raises TypeError: If an unsupported type is provided + """ + if isinstance(x, ir.Type): + return _cute_ir.is_static(x) + elif isinstance(x, tuple): + return all(is_static(a) for a in x) + # Can it be a static int? 
+ elif isinstance(x, Numeric): + return False + elif is_dynamic_expression(x): + return _cute_ir.is_static(x.type) + elif isinstance(x, int) or x is None: + return True + elif isinstance(x, ScaledBasis): + return x.is_static() + else: + raise TypeError(f"unsupported type {x}") + + +def has_underscore(a: XTuple) -> bool: + if type(a) is tuple: + return any([has_underscore(x) for x in a]) + else: + return a is None + + +def has_scaled_basis(a: XTuple) -> bool: + """Check if a tuple or its nested elements contain ScaledBasis objects. + + ScaledBasis objects are fundamental components in CuTe layouts, + representing the basis vectors of coordinate systems. + + :param a: The tuple to check + :type a: XTuple + :return: True if the tuple contains ScaledBasis objects, False otherwise + :rtype: bool + """ + if type(a) is tuple: + return any([has_scaled_basis(x) for x in a]) + else: + return isinstance(a, ScaledBasis) + + +def _tuple_str(t: tuple) -> str: + """ + Constructs a string representation of a python tuple without calling __repr__ on its elements. + """ + + def construct_inner_str(t) -> str: + if not isinstance(t, tuple): + return pretty_str(t) + res = "" + l = len(t) + for i in range(l): + res += pretty_str(t[i]) + if i < l - 1: + res += "," + return res + + res = "(" + construct_inner_str(t) + ")" + return res + + +def pretty_str(arg) -> str: + """ + Constructs a concise readable pretty string. + """ + if isinstance(arg, tuple): + # _tuple_str for tuples + return _tuple_str(arg) + elif arg is None: + # We interpret None as underscores for slicers + return "_" + else: + # Fallback to __str__ + return arg.__str__() + + +@dsl_user_op +def printf(*args, loc=None, ip=None) -> None: + """Print a value or a list of values. 
+ + :param args: List of values to print + :type args: list + :param loc: Source location where it's called, defaults to None + :type loc: source location, optional + :param ip: Insertion pointer, defaults to None + :type ip: insertion pointer, optional + :raises ValueError: If no arguments are provided or if an unsupported argument type is passed + """ + + if len(args) == 0: + raise ValueError("expects at least one argument to print") + + if isinstance(args[0], str): + fmt = args[0] + "\n" + args = args[1:] + else: + fmt = "{}" + ", {}" * (len(args) - 1) + "\n" + + def process_arg(arg): + arg0 = arg.value if isinstance(arg, Numeric) else arg + + if isinstance(arg0, ir.Value): + if isinstance(arg0.type, ir.FloatType) and (arg0.type != T.f32()): + raise TypeError( + f"cute.printf only supports 32-bit floating-point type, but got {arg0.type}" + ) + return arg0 + elif isinstance(arg0, bool): + return const(arg0, Boolean) + elif isinstance(arg0, int): + return const(arg0, Int32) + elif isinstance(arg0, float): + return const(arg0, Float32) + elif has_underscore(arg0): + # Assume it's a coordinate + return _pack_coord(arg0) + elif has_scaled_basis(arg0): + # Assume it's a stride + return _pack_stride(arg0) + elif isinstance(arg0, tuple): + # Assume it's an int_tuple + return _pack_int_tuple(arg0) + elif isinstance(arg0, (_Tensor, _Pointer)): + return arg0.value + else: + raise TypeError(f"unsupported argument type in printf, got {type(arg)}") + + args = [process_arg(a) for a in args] + _cute_ir.print_(args, fmt=fmt, loc=loc, ip=ip) + + +@dsl_user_op +def front(input, *, loc=None, ip=None): + """Recursively get the first element of input. + + This function traverses a hierarchical structure (like a layout or tensor) + and returns the first element at the deepest level. It's particularly useful + for accessing the first stride value in a layout to determine properties like + majorness. 
+ + :param input: The hierarchical structure to traverse + :type input: Union[Tensor, Layout, Stride] + :param loc: Source location where it's called, defaults to None + :type loc: source location, optional + :param ip: Insertion pointer for IR generation, defaults to None + :type ip: insertion pointer, optional + :return: The first element at the deepest level of the input structure + :rtype: Union[int, float, bool, ir.Value] + """ + if rank(input) == 1 and depth(input) == 0: + return input + else: + return front(get(input, mode=[0], loc=loc, ip=ip), loc=loc, ip=ip) + + +@dsl_user_op +def is_major(mode, stride: Stride, *, loc=None, ip=None) -> bool: + """ + Check whether a mode in stride is the major mode. + """ + first_stride = front(get(stride, mode=[mode], loc=loc, ip=ip), loc=loc, ip=ip) + if is_dynamic_expression(first_stride): + return False + return True if first_stride == 1 else False + + +@dsl_user_op +def find( + x: int, + t: Union[tuple, ir.Value, int], + *, + exclude_when: Optional[IntTuple] = None, + loc=None, + ip=None, +) -> Union[int, Tuple[int, ...], None]: + """Find the first position of a x in t. + If exclude_when is provided, the positions where comparison equals comparison_value will be excluded from the search results. 
+ + :param x: The static integer x to search for + :type x: int + :param t: The search space + :type t: Union[tuple, ir.Value, int] + :param exclude_when: A tuple of (comparison_value, comparison) - positions where comparison equals comparison_value will be excluded from the search results + :type exclude_when: Optional[Tuple[int, Union[tuple, ir.Value, int]]] + :return: Index if found at top level, tuple of indices showing nested position, or None if not found + :rtype: Union[int, Tuple[int, ...], None] + """ + if not isinstance(x, int): + raise TypeError(f"find() requires a static x to search for, but got {x}") + + # Extract comparison value and tuple from exclude_when if provided + comparison_value, comparison = None, None + if exclude_when is not None: + comparison_value, comparison = exclude_when + + # Iterate through t, checking both nested tuples and leaf values + for i in range(rank(t)): + # Get current elements from t and comparison + curr1 = get(t, mode=[i], loc=loc, ip=ip) + curr2 = ( + get(comparison, mode=[i], loc=loc, ip=ip) + if comparison is not None + else None + ) + + if isinstance(curr1, tuple): + # Recursively search nested tuple + sub_pos = find( + x, + curr1, + exclude_when=( + (comparison_value, curr2) if comparison is not None else None + ), + loc=loc, + ip=ip, + ) + if sub_pos is not None: + # Combine current index with recursive result + if isinstance(sub_pos, int): + return (i, sub_pos) + return (i,) + sub_pos + else: + # For leaf values, check if it matches x + # Skip dynamic expressions and Numeric types which can't be compared + if not (is_dynamic_expression(curr1) or isinstance(curr1, Numeric)): + if curr1 == x: + if ( + comparison is None + or is_dynamic_expression(curr2) + or isinstance(curr2, Numeric) + or curr2 != comparison_value + ): + return i + + return None + + +def transform_leaf(f, *args): + """ + Apply a function to the leaf nodes of nested tuple structures. 
+ + This function traverses nested tuple structures in parallel and applies the function f + to corresponding leaf nodes. All input tuples must have the same nested structure. + + :param f: Function to apply to leaf nodes + :type f: Callable + :param args: One or more nested tuple structures with matching profiles + :return: A new nested tuple with the same structure as the inputs, but with leaf values transformed by f + :raises TypeError: If the input tuples have different nested structures + + Example: + + .. code-block:: python + + >>> transform_leaf(lambda x: x + 1, (1, 2)) + (2, 3) + >>> transform_leaf(lambda x, y: x + y, (1, 2), (3, 4)) + (4, 6) + >>> transform_leaf(lambda x: x * 2, ((1, 2), (3, 4))) + ((2, 4), (6, 8)) + """ + if all(isinstance(t, tuple) for t in args): + return tuple(transform_leaf(f, *_args) for _args in zip(*args)) + elif all(not isinstance(t, tuple) for t in args): + return f(*args) + else: + raise TypeError(f"profile of input tuples doesn't match: {args}") + + +@dsl_user_op +def assume(src, divby=None, *, loc=None, ip=None): + if divby is None: + return src + + if isinstance(src, Integer): + width = type(src).width + src_val = src.ir_value() + else: + width = src.type.width + src_val = src + + res_ty = _cute_ir.ConstrainedIntType.get(divby, width) + assumed_val = _cute_ir.assume(res_ty, src_val, loc=loc, ip=ip) + return type(src)(IntValue(_pack_int_tuple(assumed_val, loc=loc, ip=ip))) + + +@dsl_user_op +def make_swizzle(b, m, s, *, loc=None, ip=None): + # canonicalize to <0, 4, 3> for identity swizzle (as compiler assumes <0, 4, 3>) + if b == 0: + m, s = 4, 3 + ty = ir.Type.parse(f'!cute.swizzle<"S<{b},{m},{s}>">') + return Swizzle(_cute_ir.static(ty, loc=loc, ip=ip)) + + +# +# Tuple API (also used by layouts and tensors) +# + + +def depth(a: Union[XTuple, Layout, "ComposedLayout"]) -> int: + """Returns the depth (nesting level) of a tuple, layout, or tensor. + + The depth of a tuple is the maximum depth of its elements plus 1. 
+ For an empty tuple, the depth is 1. For layouts and tensors, the depth + is determined by the depth of their shape. For non-tuple values (e.g., integers), + the depth is considered 0. + + :param a: The object whose depth is to be determined + :type a: Union[XTuple, Layout, ComposedLayout, Tensor, Any] + :return: The depth of the input object + :rtype: int + + Example: + + .. code-block:: python + + >>> depth(1) + 0 + >>> depth((1, 2)) + 1 + >>> depth(((1, 2), (3, 4))) + 2 + """ + if type(a) is tuple: + if not a: + return 1 + return max(depth(x) for x in a) + 1 + elif isinstance(a, (Layout, ComposedLayout, Tensor)): + return depth(a.shape) + else: + return 0 + + +@lru_cache_ir() +def rank(a: Union[XTuple, Layout, "ComposedLayout"]) -> int: + """Returns the rank (dimensionality) of a tuple, layout, or tensor. + + The rank of a tuple is its length. For layouts and tensors, the rank is + determined by the rank of their shape. For non-tuple values (e.g., integers), + the rank is considered 1 for convenience. + + :param a: The object whose rank is to be determined + :type a: Union[XTuple, Layout, ComposedLayout, Tensor, Any] + :return: The rank of the input object + :rtype: int + + This function is used in layout algebra to determine the dimensionality + of tensors and layouts for operations like slicing and evaluation. + """ + if isinstance(a, tuple): + return len(a) + elif isinstance(a, (Layout, ComposedLayout, Tensor)): + return rank(a.shape) + elif depth(a) == 0: + return 1 + else: + raise TypeError(f"unsupported type in rank, got {type(a)}") + + +def is_congruent( + a: Union[XTuple, Layout, ComposedLayout, Tensor], + b: Union[XTuple, Layout, ComposedLayout, Tensor], +) -> bool: + """ + Returns whether a is congruent to b. 
+ """ + if isinstance(a, (Layout, ComposedLayout, Tensor)): + a = a.shape + if isinstance(b, (Layout, ComposedLayout, Tensor)): + b = b.shape + if isinstance(a, tuple) and isinstance(b, tuple): + return (len(a) == len(b)) and all(is_congruent(x, y) for x, y in zip(a, b)) + if isinstance(a, tuple) or isinstance(b, tuple): + return False + return True + + +def is_weakly_congruent( + a: Union[XTuple, Layout, ComposedLayout, Tensor], + b: Union[XTuple, Layout, ComposedLayout, Tensor], +) -> bool: + """ + Returns whether a is weakly congruent to b. + """ + if isinstance(a, (Layout, ComposedLayout, Tensor)): + a = a.shape + if isinstance(b, (Layout, ComposedLayout, Tensor)): + b = b.shape + if not isinstance(a, tuple): + return True + if isinstance(a, tuple) and isinstance(b, tuple): + return (len(a) == len(b)) and all( + is_weakly_congruent(x, y) for x, y in zip(a, b) + ) + if isinstance(a, tuple) or isinstance(b, tuple): + return False + return True + + +@overload +def get(input: Shape, mode, *, loc=None, ip=None) -> Shape: ... +@overload +def get(input: Stride, mode, *, loc=None, ip=None) -> Stride: ... +@overload +def get(input: Coord, mode, *, loc=None, ip=None) -> Coord: ... +@overload +def get(input: IntTuple, mode, *, loc=None, ip=None) -> IntTuple: ... +@overload +def get(input: Tile, mode, *, loc=None, ip=None) -> Tile: ... +@overload +def get(input: Layout, mode, *, loc=None, ip=None) -> Layout: ... +@overload +def get(input: ComposedLayout, mode, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def get(input, mode: List[int], *, loc=None, ip=None): + """Extract a specific element or sub-layout from a layout or tuple. + + This function recursively traverses the input according to the mode indices, + extracting the element at the specified path. For layouts, this operation + corresponds to extracting a specific sub-layout. 
+ + :param input: The input layout or tuple to extract from + :type input: Layout, ComposedLayout, tuple + :param mode: Indices specifying the path to traverse for extraction + :type mode: List[int] + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: The extracted element or sub-layout + :rtype: Layout, ComposedLayout, or element type + :raises ValueError: If any index in mode is out of range + :raises TypeError: If mode contains non-integer elements or if input has unsupported type + + **Examples**: + + For a layout like ((4,8),2):((16,1),8), get with mode=[0,1] would extract + the element 8 from the shape component. + """ + # Empty mode returns input and terminates the recursive call + if not mode: + return input + + if rank(input) <= mode[0]: + raise ValueError( + f"elements in mode must be less than rank({input}), got {mode}" + ) + + if depth(input) == 0: + return input + elif isinstance(input, tuple): + if not isinstance(mode[0], int): + raise TypeError( + f"invalid element in mode, expects int, got {type(mode[0])}" + ) + return get(input[mode[0]], mode=mode[1:]) + else: + if not isinstance(input, (Layout, ComposedLayout)): + raise TypeError(f"unsupported type of input, got {type(input)}") + return _cute_ir.get( + input.type.get_op_res_type(mode=mode), input, mode=mode, loc=loc, ip=ip + ) + + +@overload +def select(input: Shape, mode, *, loc=None, ip=None) -> Shape: ... +@overload +def select(input: Stride, mode, *, loc=None, ip=None) -> Stride: ... +@overload +def select(input: Coord, mode, *, loc=None, ip=None) -> Coord: ... +@overload +def select(input: IntTuple, mode, *, loc=None, ip=None) -> IntTuple: ... +@overload +def select(input: Tile, mode, *, loc=None, ip=None) -> Tile: ... +@overload +def select(input: Layout, mode, *, loc=None, ip=None) -> Layout: ... 
+@overload +def select(input: ComposedLayout, mode, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def select(input, mode: List[int], *, loc=None, ip=None): + """Select modes from input. + + :param input: Input to select from + :type input: Layout, ComposedLayout, tuple + :param mode: Indices specifying which dimensions or elements to select + :type mode: List[int] + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: A new instance with selected dimensions/elements + :rtype: Layout, ComposedLayout, tuple + :raises ValueError: If any index in mode is out of range + :raises TypeError: If the input type is invalid + """ + if any((not isinstance(i, int)) or (i >= rank(input)) for i in mode): + raise ValueError( + f"invalid mode element for input of rank {rank(input)}, got {mode=}" + ) + + if isinstance(input, tuple): + return tuple(input[i] for i in mode) + + if not isinstance(input, (Layout, ComposedLayout)): + raise TypeError(f"unsupported type of input, got {type(input)}") + + return _cute_ir.select(input, mode=mode, loc=loc, ip=ip) + + +@overload +def group_modes(input: Shape, begin: int, end: int, *, loc=None, ip=None) -> Shape: ... +@overload +def group_modes( + input: Stride, begin: int, end: int, *, loc=None, ip=None +) -> Stride: ... +@overload +def group_modes(input: Coord, begin: int, end: int, *, loc=None, ip=None) -> Coord: ... +@overload +def group_modes( + input: IntTuple, begin: int, end: int, *, loc=None, ip=None +) -> IntTuple: ... +@overload +def group_modes(input: Tile, begin: int, end: int, *, loc=None, ip=None) -> Tile: ... +@overload +def group_modes( + input: Layout, begin: int, end: int, *, loc=None, ip=None +) -> Layout: ... +@overload +def group_modes( + input: ComposedLayout, begin: int, end: int, *, loc=None, ip=None +) -> ComposedLayout: ... 
+@overload +def group_modes( + input: Tensor, begin: int, end: int, *, loc=None, ip=None +) -> Tensor: ... + + +@dsl_user_op +def group_modes(input, begin: int, end: int = -1, *, loc=None, ip=None): + """Group modes of a hierarchical tuple or layout into a single mode. + + This function groups a range of modes from the input object into a single mode, + creating a hierarchical structure. For tuples, it creates a nested tuple containing + the specified range of elements. For layouts and other CuTe objects, it creates + a hierarchical representation where the specified modes are grouped together. + + :param input: Input object to group modes from (layout, tuple, etc.) + :type input: Layout, ComposedLayout, tuple, Shape, Stride, etc. + :param beg: Beginning index of the range to group (inclusive) + :type beg: int + :param end: Ending index of the range to group (exclusive) + :type end: int + :param loc: Source location for MLIR, defaults to None + :type loc: optional + :param ip: Insertion point, defaults to None + :type ip: optional + :return: A new object with the specified modes grouped + :rtype: Same type as input with modified structure + + Examples: + + .. code-block:: python + + # Group modes in a tuple + t = (2, 3, 4, 5) + grouped = group_modes(t, 1, 3) # (2, (3, 4), 5) + + # Group modes in a layout + layout = make_layout((2, 3, 4, 5)) + grouped_layout = group_modes(layout, 1, 3) # Layout with shape (2, (3, 4), 5) + + # Group modes in a shape + shape = make_shape(2, 3, 4, 5) + grouped_shape = group_modes(shape, 0, 2) # Shape ((2, 3), 4, 5) + """ + if depth(input) == 0: + return (input,) + if isinstance(input, tuple): + return (*input[:begin], (input[begin:end]), *input[end:]) + return _cute_ir.group_modes(input.value, begin, end, loc=loc, ip=ip) + + +@overload +def slice_(src: Shape, coord: Coord, *, loc=None, ip=None) -> Shape: ... +@overload +def slice_(src: Stride, coord: Coord, *, loc=None, ip=None) -> Stride: ... 
+@overload +def slice_(src: Coord, coord: Coord, *, loc=None, ip=None) -> Coord: ... +@overload +def slice_(src: IntTuple, coord: Coord, *, loc=None, ip=None) -> IntTuple: ... +@overload +def slice_(src: Tile, coord: Coord, *, loc=None, ip=None) -> Tile: ... +@overload +def slice_(src: Layout, coord: Coord, *, loc=None, ip=None) -> Layout: ... +@overload +def slice_( + src: ComposedLayout, coord: Coord, *, loc=None, ip=None +) -> ComposedLayout: ... +@overload +def slice_(src: Tensor, coord: Coord, *, loc=None, ip=None) -> Tensor: ... + + +@dsl_user_op +def slice_(src, coord: Coord, *, loc=None, ip=None): + """Perform a slice operation on a source object using the given coordinate. + + This function implements CuTe's slicing operation which extracts a subset of elements + from a source object (tensor, layout, etc.) based on a coordinate pattern. The slice + operation preserves the structure of the source while selecting specific elements. + + :param src: Source object to be sliced (tensor, layout, tuple, etc.) + :type src: Union[Tensor, Layout, IntTuple, Value] + :param coord: Coordinate pattern specifying which elements to select + :type coord: Coord + :param loc: Source location information, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for IR generation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A new object containing the sliced elements + :rtype: Union[Tensor, Layout, IntTuple, tuple] + :raises ValueError: If the coordinate pattern is incompatible with source + + Examples: + + .. code-block:: python + + # Layout slicing + layout = make_layout((4,4)) + + # Select 1st index of first mode and keep all elements in second mode + sub_layout = slice_(layout, (1, None)) + + .. code-block:: python + + # Basic tensor slicing + tensor = make_tensor(...) # Create a 2D tensor + + # Select 1st index of first mode and keep all elements in second mode + sliced = slice_(tensor, (1, None)) + + .. 
code-block:: python + + # Select 2nd index of second mode and keep all elements in first mode + sliced = slice_(tensor, (None, 2)) + + Note: + - `None` represents keeping all elements in that mode + - Slicing preserves the layout/structure of the original object + - Can be used for: + * Extracting sub-tensors/sub-layouts + * Creating views into data + * Selecting specific patterns of elements + """ + + def lift_slice(a, b): + if isinstance(a, tuple): + if (not isinstance(b, tuple)) or (len(a) != len(b)): + raise ValueError("coord must be weakly congruent to src in slice_") + return reduce( + lambda p, q: p + q, (lift_slice(x, y) for x, y in zip(a, b)), () + ) + elif a is None: + return (b,) + else: + return () + + if is_integer(src) or isinstance(src, tuple): + if isinstance(coord, tuple): + if (not isinstance(src, tuple)) or (len(coord) != len(src)): + raise ValueError("coord must be weakly congruent to src in slice_") + return reduce( + lambda p, q: p + q, (lift_slice(x, y) for x, y in zip(coord, src)), () + ) + elif coord is None: + return src + else: + return () + + if isinstance(src, Tensor): + src = src.value + coord_val = _pack_coord(coord, loc=loc, ip=ip) + return _cute_ir.slice(input=src, coord=coord_val, loc=loc, ip=ip) + + +@overload +def dice(src: Shape, coord: Coord, *, loc=None, ip=None) -> Shape: ... +@overload +def dice(src: Stride, coord: Coord, *, loc=None, ip=None) -> Stride: ... +@overload +def dice(src: Coord, coord: Coord, *, loc=None, ip=None) -> Coord: ... +@overload +def dice(src: IntTuple, coord: Coord, *, loc=None, ip=None) -> IntTuple: ... +@overload +def dice(src: Tile, coord: Coord, *, loc=None, ip=None) -> Tile: ... +@overload +def dice(src: Layout, coord: Coord, *, loc=None, ip=None) -> Layout: ... +@overload +def dice(src: ComposedLayout, coord: Coord, *, loc=None, ip=None) -> ComposedLayout: ... 


@dsl_user_op
@lru_cache_ir()
def dice(src, dicer, *, loc=None, ip=None):
    """Keep modes in ``src`` when paired with an integer in ``dicer``.

    This function performs a dicing operation on the input based on the dicer
    coordinate. Dicing is a fundamental operation in CuTe that allows selecting
    specific modes from a tensor or layout based on a coordinate pattern. It is
    the complement of ``slice_``: modes paired with an integer are kept, modes
    paired with ``None`` are removed.

    :param src: The operand to be diced
    :type src: Union[IntTuple, Shape, Stride, Coord, Layout, ComposedLayout]
    :param dicer: A static coordinate indicating how to dice the input
    :type dicer: Coord
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The diced result with selected modes from the input
    :rtype: Union[IntTuple, Shape, Stride, Coord, Layout, ComposedLayout]
    :raises ValueError: If dicer is not static, or is not weakly congruent to src

    Examples:

    .. code-block:: python

        # Basic dicing of a layout
        layout = make_layout((32,16,8))

        # Keep only first and last modes
        diced = dice(layout, (1, None, 1))

    Note:
        - The dicer coordinate must be static
        - Modes paired with ``None`` are removed; integer leaves keep the mode
          (the DSL may also expose ``_`` as a placeholder equivalent to
          ``None`` — confirm against the package's exports)
    """
    if not is_static(dicer):
        raise ValueError(f"expects dicer to be static, but got {dicer}")

    # Recursive helper for the pure-tuple path: keeps src leaf `b` wherever the
    # matching dicer leaf `a` is an integer, drops it where `a` is None.
    def lift_dice(a, b):
        if isinstance(a, tuple):
            if (not isinstance(b, tuple)) or (len(a) != len(b)):
                raise ValueError("dicer must be weakly congruent to input in dice")
            return reduce(
                lambda p, q: p + q, (lift_dice(x, y) for x, y in zip(a, b)), ()
            )
        elif a is None:
            return ()
        else:
            return (b,)

    # Pure-Python path: src is a plain int or (nested) tuple — no IR involved.
    if is_integer(src) or isinstance(src, tuple):
        if isinstance(dicer, tuple):
            if (not isinstance(src, tuple)) or (len(dicer) != len(src)):
                raise ValueError("dicer must be weakly congruent to src in dice")
            return reduce(
                lambda p, q: p + q, (lift_dice(x, y) for x, y in zip(dicer, src)), ()
            )
        elif dicer is None:
            return ()
        else:
            return src

    # IR path for layouts/composed layouts.
    dicer_val = _pack_coord(dicer, loc=loc, ip=ip)
    return _cute_ir.dice(src, dicer_val.type.attribute, loc=loc, ip=ip)


def wrap(x) -> tuple:
    """
    Wraps the input into a tuple if not a tuple.
    """
    if isinstance(x, tuple):
        return x
    return (x,)


def _extend(func, input, elem, up_to_rank, loc, ip):
    # Shared implementation behind prepend/append. Extends `input` with `elem`
    # up to rank `up_to_rank` (or by exactly one mode when None), dispatching
    # to the IR op `func` for Layout/ComposedLayout and doing pure-tuple
    # arithmetic for leaves and tuples. `func` is one of
    # _cute_ir.prepend_to_rank / _cute_ir.append_to_rank, and also selects
    # which side of the tuple the fill elements go on.
    if input is None:
        raise ValueError(f"No input provided for input")

    if isinstance(input, (Layout, ComposedLayout)):
        # Layouts extend with a rank-1 identity layout by default.
        if elem is None:
            elem = make_layout(1)
        elif not isinstance(elem, Layout):
            raise TypeError(f"Input type of elem ({type(elem)}) is not accepted!")
        N = rank(input) + 1 if up_to_rank is None else up_to_rank
        return func(N, input, elem, loc=loc, ip=ip)

    if is_valid_leaf(input) or isinstance(input, tuple):
        # Tuples/leaves extend with the integer 1 by default.
        if elem is None:
            elem = 1
        if (not isinstance(elem, tuple)) and (not is_valid_leaf(elem)):
            raise TypeError(f"Input type of elem ({type(elem)}) is not accepted!")

        input = wrap(input)
        repeat_cnt = 1 if up_to_rank is None else up_to_rank - rank(input)
        if repeat_cnt == 0:
            return input
        elif repeat_cnt < 0:
            raise ValueError(f"up_to_rank must be >= rank(input)")
        else:
            # Identity comparison against the IR op decides prepend vs append.
            if func is _cute_ir.prepend_to_rank:
                return (elem,) * repeat_cnt + input
            else:
                return input + (elem,) * repeat_cnt

    raise TypeError(f"invalid type for input, got {type(input)}")


@overload
def prepend(
    input: Shape, elem: Shape, up_to_rank=None, *, loc=None, ip=None
) -> Shape: ...
@overload
def prepend(
    input: Stride, elem: Stride, up_to_rank=None, *, loc=None, ip=None
) -> Stride: ...
@overload
def prepend(
    input: Coord, elem: Coord, up_to_rank=None, *, loc=None, ip=None
) -> Coord: ...
@overload
def prepend(
    input: IntTuple, elem: IntTuple, up_to_rank=None, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def prepend(input: Tile, elem: Tile, up_to_rank=None, *, loc=None, ip=None) -> Tile: ...
@overload
def prepend(
    input: Layout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> Layout: ...
@overload
def prepend(
    input: ComposedLayout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def prepend(input, elem, up_to_rank: Union[None, int] = None, *, loc=None, ip=None):
    """Extend input to rank up_to_rank by prepending elem in front of input.

    This function extends the input object by prepending elements to reach a desired rank.
    It supports various CuTe types including shapes, layouts, tensors etc.

    :param input: Source to be prepended to
    :type input: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :param elem: Element to prepend to input
    :type elem: Union[Shape, Stride, Coord, IntTuple, Tile, Layout]
    :param up_to_rank: The target rank after extension, defaults to None
    :type up_to_rank: Union[None, int], optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The extended result with prepended elements
    :rtype: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :raises ValueError: If up_to_rank is less than input's current rank
    :raises TypeError: If input or elem has unsupported type

    Examples:

    .. code-block:: python

        # Prepend to a Shape
        shape = (4,4)
        prepend(shape, 2)  # Returns (2,4,4)

        # Prepend to a Layout
        layout = make_layout((8,8))
        prepend(layout, make_layout((2,)))  # Returns (2,8,8):(1,1,8)

        # Prepend with target rank
        coord = (1,1)
        prepend(coord, 0, up_to_rank=4)  # Returns (0,0,1,1)
    """
    return _extend(_cute_ir.prepend_to_rank, input, elem, up_to_rank, loc=loc, ip=ip)


@overload
def append(
    input: Shape, elem: Shape, up_to_rank=None, *, loc=None, ip=None
) -> Shape: ...
@overload
def append(
    input: Stride, elem: Stride, up_to_rank=None, *, loc=None, ip=None
) -> Stride: ...
@overload
def append(
    input: Coord, elem: Coord, up_to_rank=None, *, loc=None, ip=None
) -> Coord: ...
@overload
def append(
    input: IntTuple, elem: IntTuple, up_to_rank=None, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def append(input: Tile, elem: Tile, up_to_rank=None, *, loc=None, ip=None) -> Tile: ...
@overload
def append(
    input: Layout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> Layout: ...
@overload
def append(
    input: ComposedLayout, elem: Layout, up_to_rank=None, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def append(input, elem, up_to_rank: Union[None, int] = None, *, loc=None, ip=None):
    """Extend input to rank up_to_rank by appending elem to the end of input.

    This function extends the input object by appending elements to reach a desired rank.
    It supports various CuTe types including shapes, layouts, tensors etc.

    :param input: Source to be appended to
    :type input: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :param elem: Element to append to input
    :type elem: Union[Shape, Stride, Coord, IntTuple, Tile, Layout]
    :param up_to_rank: The target rank after extension, defaults to None
    :type up_to_rank: Union[None, int], optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: The extended result with appended elements
    :rtype: Union[Shape, Stride, Coord, IntTuple, Tile, Layout, ComposedLayout, Tensor]
    :raises ValueError: If up_to_rank is less than input's current rank
    :raises TypeError: If input or elem has unsupported type

    Examples:

    .. code-block:: python

        # Append to a Shape
        shape = (4,4)
        append(shape, 2)  # Returns (4,4,2)

        # Append to a Layout
        layout = make_layout((8,8))
        append(layout, make_layout((2,)))  # Returns (8,8,2):(1,8,1)

        # Append with target rank
        coord = (1,1)
        append(coord, 0, up_to_rank=4)  # Returns (1,1,0,0)

    Note:
        - The function preserves the structure of the input while extending it
        - Can be used to extend tensors, layouts, shapes and other CuTe types
        - When up_to_rank is specified, fills remaining positions with elem
        - Useful for tensor reshaping and layout transformations
    """
    return _extend(_cute_ir.append_to_rank, input, elem, up_to_rank, loc=loc, ip=ip)


@dsl_user_op
def prepend_ones(
    t: Tensor, up_to_rank: Union[None, int] = None, *, loc=None, ip=None
) -> Tensor:
    """Return a view of ``t`` whose layout is extended to ``up_to_rank`` by
    prepending size-1 modes (rank-1 identity layouts). The iterator is reused,
    so no data is moved."""
    return make_tensor(
        t.iterator, prepend(t.layout, make_layout(1), up_to_rank), loc=loc, ip=ip
    )


@dsl_user_op
def append_ones(
    t: Tensor, up_to_rank: Union[None, int] = None, *, loc=None, ip=None
) -> Tensor:
    """Return a view of ``t`` whose layout is extended to ``up_to_rank`` by
    appending size-1 modes (rank-1 identity layouts). The iterator is reused,
    so no data is moved."""
    return make_tensor(
        t.iterator, append(t.layout, make_layout(1), up_to_rank), loc=loc, ip=ip
    )


def repeat_like(x, target):
    """Creates an object congruent to target and filled with x.

    This function recursively creates a nested tuple structure that matches the structure
    of the target, with each leaf node filled with the value x.

    :param x: The value to fill the resulting structure with
    :type x: Any
    :param target: The structure to mimic
    :type target: Union[tuple, Any]
    :return: A structure matching target but filled with x
    :rtype: Union[tuple, Any]

    Examples:

    .. code-block:: python

        repeat_like(0, (1, 2, 3))    # Returns (0, 0, 0)
        repeat_like(1, ((1, 2), 3))  # Returns ((1, 1), 1)
        repeat_like(2, 5)            # Returns 2
    """
    if not isinstance(target, tuple):
        return x
    if not target:
        return ()
    if len(target) == 1:
        # Preserve 1-tuples explicitly so the result stays a tuple.
        return (repeat_like(x, target[0]),)
    return tuple(repeat_like(x, t) for t in target)


def flatten_to_tuple(a: Union[IntTuple, Coord, Shape, Stride]) -> tuple:
    """Flattens a potentially nested tuple structure into a flat tuple.

    This function recursively traverses the input structure and flattens it into
    a single-level tuple, preserving the order of elements.

    :param a: The structure to flatten
    :type a: Union[IntTuple, Coord, Shape, Stride]
    :return: A flattened tuple containing all elements from the input
    :rtype: tuple

    Examples:

    .. code-block:: python

        flatten_to_tuple((1, 2, 3))       # Returns (1, 2, 3)
        flatten_to_tuple(((1, 2), 3))     # Returns (1, 2, 3)
        flatten_to_tuple((1, (2, (3,))))  # Returns (1, 2, 3)
    """
    if not isinstance(a, tuple):
        # Leaves are wrapped so the recursive results can be chained.
        return wrap(a)
    else:
        return tuple(chain.from_iterable(tuple(flatten_to_tuple(x) for x in a)))


def flatten(a: Union[IntTuple, Coord, Shape, Stride, Layout, Tensor]) -> tuple:
    """Flattens a CuTe data structure into a simpler form.

    For tuples, this function flattens the structure into a single-level tuple.
    For non-tuple types, it returns the input unchanged.

    :param a: The structure to flatten
    :type a: Union[IntTuple, Coord, Shape, Stride, Layout, Tensor]
    :return: The flattened structure
    :rtype: Union[tuple, Any]
    :raises NotImplementedError: If input is a Layout or Tensor

    Examples:

    .. code-block:: python

        flatten((1, 2, 3))          # Returns (1, 2, 3)
        flatten(((1, 2), (3, 4)))   # Returns (1, 2, 3, 4)
        flatten(5)                  # Returns 5
    """
    if isinstance(a, (Layout, Tensor)):
        raise NotImplementedError("flatten layout and tensor is not supported")

    if not isinstance(a, tuple):
        # Unlike flatten_to_tuple, leaves are returned unwrapped.
        return a
    else:
        return flatten_to_tuple(a)


def unflatten(
    sequence: Union[Tuple[Any, ...], List[Any], Iterable[Any]], profile: XTuple
) -> XTuple:
    """Unflatten a flat tuple into a nested tuple structure according to a profile.

    This function transforms a flat sequence of elements into a nested tuple structure
    that matches the structure defined by the profile parameter. It traverses the profile
    structure and populates it with elements from the sequence.

    sequence must be long enough to fill the profile. If it is exhausted early,
    the underlying ``next`` call fails (StopIteration, or RuntimeError where
    PEP 479 generator semantics apply — confirm against ``transform_leaf``).

    :param sequence: A flat sequence of elements to be restructured
    :type sequence: Union[Tuple[Any, ...], List[Any], Iterable[Any]]
    :param profile: A nested tuple structure that defines the shape of the output
    :type profile: XTuple
    :return: A nested tuple with the same structure as profile but containing elements from sequence
    :rtype: XTuple

    Example:
    >>> unflatten([1, 2, 3, 4], ((0, 0), (0, 0)))
    ((1, 2), (3, 4))
    """

    def _make_generator():
        for element in sequence:
            yield element

    # Each leaf of `profile` is replaced by the next element of the sequence,
    # consumed left-to-right.
    xs = _make_generator()
    return transform_leaf(lambda _: next(xs), profile)


@dsl_user_op
def elem_less(
    lhs: Union[Shape, IntTuple, Coord],
    rhs: Union[Shape, IntTuple, Coord],
    *,
    loc=None,
    ip=None,
):
    """Element-wise "less than" comparison of two coordinates/int-tuples,
    lowered to the ``_cute_ir.elem_less`` op; returns a ``Boolean`` result.

    :param lhs: Left-hand side operand
    :type lhs: Union[Shape, IntTuple, Coord]
    :param rhs: Right-hand side operand
    :type rhs: Union[Shape, IntTuple, Coord]
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Result of the element-wise comparison
    :rtype: Boolean
    """
    lhs_val = _pack_coord(lhs, loc=loc, ip=ip)
    rhs_val = _pack_coord(rhs, loc=loc, ip=ip)
    return Boolean(_cute_ir.elem_less(lhs_val, rhs_val, loc=loc, ip=ip))


@overload
def filter_zeros(
    input: Layout, *, target_profile=None, loc=None, ip=None
) -> Layout: ...
@overload
def filter_zeros(
    input: Tensor, *, target_profile=None, loc=None, ip=None
) -> Tensor: ...


@dsl_user_op
def filter_zeros(input, *, target_profile=None, loc=None, ip=None):
    """Filter out zeros from a layout or tensor.

    This function removes zero-stride dimensions from a layout or tensor.
    See Section 3.3 in the CuTe Whitepaper for more details on layout operations.

    :param input: The input layout or tensor to filter
    :type input: Layout or Tensor
    :param target_profile: Target profile for the filtered result, defaults to None
    :type target_profile: optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The filtered layout or tensor with zeros removed
    :rtype: Layout or Tensor
    :raises TypeError: If input is not a Layout or Tensor
    """
    if not isinstance(input, (Layout, Tensor)):
        raise TypeError(f"Expect layout or tensor as input but got {type(input)=}")
    # Tensors are unwrapped to their underlying IR value before the op.
    if isinstance(input, Tensor):
        input = input.value
    return _cute_ir.filter_zeros(input, target_profile=target_profile, loc=loc, ip=ip)


@dsl_user_op
def filter(input: Union[Layout, Tensor], *, loc=None, ip=None):
    """Filter a layout or tensor.

    This function filters a layout or tensor according to CuTe's filtering rules.

    :param input: The input layout or tensor to filter
    :type input: Layout or Tensor
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The filtered layout or tensor
    :rtype: Layout or Tensor
    :raises TypeError: If input is not a Layout or Tensor
    """
    if not isinstance(input, (Layout, Tensor)):
        raise TypeError(f"Expect layout or tensor as input but got {type(input)=}")
    # NOTE(review): this checks _Tensor while filter_zeros checks Tensor —
    # presumably aliases of the same class; confirm and unify.
    if isinstance(input, _Tensor):
        input = input.value
    return _cute_ir.filter(input, loc=loc, ip=ip)


@dsl_user_op
def product(a: Union[IntTuple, Shape], *, loc=None, ip=None):
    """Return product of the given IntTuple or Shape.

    Computes the product of all elements in the input tuple or shape.
    Returns static value if type is static.

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Static product of IntTuple or Shape if static, otherwise a Value
    :rtype: int or Value
    :raises TypeError: If input is not an IntTuple or Shape
    """
    # A bare integer is its own product.
    if is_integer(a):
        return a
    if isinstance(a, tuple):
        a_val = _pack_int_tuple(a, loc=loc, ip=ip)
        res = _cute_ir.tuple_product(a_val, loc=loc, ip=ip)
        return _unpack_x_tuple(res, loc=loc, ip=ip)
    else:
        raise TypeError(f"expects IntTuple or Shape, but got {type(a)}")


@overload
def product_like(
    a: IntTuple, target_profile: XTuple, *, loc=None, ip=None
) -> IntTuple: ...
@overload
def product_like(a: Shape, target_profile: XTuple, *, loc=None, ip=None) -> Shape: ...


@dsl_user_op
def product_like(
    a: Union[IntTuple, Shape], target_profile: XTuple, *, loc=None, ip=None
):
    """Return product of the given IntTuple or Shape at leaves of `target_profile`.

    This function computes products according to the structure defined by target_profile.

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param target_profile: The profile that guides how products are computed
    :type target_profile: XTuple
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The resulting tuple with products computed according to target_profile
    :rtype: IntTuple or Shape
    :raises TypeError: If inputs have incompatible types
    :raises ValueError: If inputs have incompatible shapes
    """
    # Perform product at leaf of `target_profile`
    if not isinstance(target_profile, tuple):
        return product(a, loc=loc, ip=ip)
    else:
        if not isinstance(a, tuple):
            raise TypeError(f"expects `a` tuple but got {a}")

        # NOTE(review): the message says `guide` but the parameter is named
        # target_profile — message text left untouched here.
        if len(a) != len(target_profile):
            raise ValueError(f"expects `a` and `guide` have the same rank")

        return tuple(
            product_like(x, g, loc=loc, ip=ip) for x, g in zip(a, target_profile)
        )


@overload
def product_each(a: IntTuple, *, loc=None, ip=None) -> IntTuple: ...
@overload
def product_each(a: Shape, *, loc=None, ip=None) -> Shape: ...


@dsl_user_op
def product_each(a, *, loc=None, ip=None):
    """Compute products for each component of the input.

    Returns a rank(a) tuple `result` such that get(result, mode=[i]) == product(get(a, mode=[i]))

    :param a: The input tuple or shape
    :type a: IntTuple or Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: A tuple containing products for each component
    :rtype: tuple
    :raises TypeError: If input is not an IntTuple or Shape

    Note:
        - An integer input is returned unchanged and an empty tuple yields 1,
          not a tuple — callers relying on a tuple result should pass a
          non-empty tuple.
    """
    if is_integer(a):
        return a
    if isinstance(a, tuple):
        if not a:
            # Empty product convention.
            return 1
        else:
            a_val = _pack_int_tuple(a, loc=loc, ip=ip)
            res = _cute_ir.tuple_product_each(a_val, loc=loc, ip=ip)
            return _unpack_x_tuple(res, loc=loc, ip=ip)
    else:
        raise TypeError(f"expects IntTuple or Shape, but got {type(a)}")


@dsl_user_op
def size(
    a: Union[IntTuple, Shape, Layout, ComposedLayout, Tensor],
    mode: List[int] = [],
    *,
    loc=None,
    ip=None,
) -> Int:
    """Return size of domain of layout or tensor.

    Computes the size (number of elements) in the domain of a layout or tensor.
    For layouts, this corresponds to the shape of the coordinate space.
    See Section 3.2 in the CuTe Whitepaper for more details on layout domains.

    :param a: The input object whose size to compute
    :type a: IntTuple, Shape, Layout, ComposedLayout or Tensor
    :param mode: List of mode(s) for size calculation. If empty, computes total size, defaults to []
    :type mode: list of int, optional
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: Static size of layout or tensor if static, otherwise a Value
    :rtype: int or Value
    :raises ValueError: If mode contains non-integer elements
    """
    # NOTE: the mutable default [] is never mutated here, so it is safe,
    # but an immutable default (e.g. ()) would be more conventional.
    if any(not isinstance(m, int) for m in mode):
        raise ValueError(f"expects integer elements in mode, but got {mode}")

    # TiledMma/TiledCopy expose their own size property directly.
    if isinstance(a, (TiledMma, TiledCopy)):
        return a.size
    a_val = None
    if not isinstance(a, (Layout, ComposedLayout, Tensor)):
        a_val = _pack_int_tuple(a, loc=loc, ip=ip)
    elif isinstance(a, Tensor):
        a_val = a.value
    else:
        a_val = a

    res = _cute_ir.size(a_val, mode=mode, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)  # type: ignore


@dsl_user_op
def shape_div(lhs: Shape, rhs: Shape, *, loc=None, ip=None) -> Shape:
    """Perform element-wise division of shapes.

    This function performs element-wise division between two shapes.

    :param lhs: Left-hand side shape
    :type lhs: Shape
    :param rhs: Right-hand side shape
    :type rhs: Shape
    :param loc: Source location for MLIR, defaults to None
    :type loc: optional
    :param ip: Insertion point, defaults to None
    :type ip: optional
    :return: The result of element-wise division
    :rtype: Shape
    """
    lhs = _pack_shape(lhs, loc=loc, ip=ip)
    rhs = _pack_shape(rhs, loc=loc, ip=ip)
    res = _cute_ir.shape_div(lhs, rhs, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def ceil_div(input: Shape, tiler: Tiler, *, loc=None, ip=None) -> Shape:
    """
    Compute the ceiling division of a target shape by a tiling specification.

    This function computes the number of tiles required to cover the target domain.
    It is equivalent to the second mode of `zipped_divide(input, tiler)`.

    :param input: A tuple of integers representing the dimensions of the target domain.
    :type input: Shape
    :param tiler: The tiling specification.
    :type tiler: Union[Layout, Shape, Tile]
    :param loc: Optional location information for IR diagnostics.
    :type loc: optional
    :param ip: Optional instruction pointer or context for underlying IR functions.
    :type ip: optional
    :return: A tuple of integers representing the number of tiles required along each dimension,
        i.e. the result of the ceiling division of the input dimensions by the tiler dimensions.
    :rtype: Shape

    Example:

    .. code-block:: python

        import cutlass.cute as cute

        @cute.jit
        def foo():
            input = (10, 6)
            tiler = (3, 4)
            result = cute.ceil_div(input, tiler)
            print(result)  # Outputs: (4, 2)
    """
    input_val = _pack_shape(input, loc=loc, ip=ip)
    tiler_val = _pack_tile(tiler, loc=loc, ip=ip)
    res = _cute_ir.ceil_div(input=input_val, tiler=tiler_val, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


def round_up(a: IntTuple, b: IntTuple) -> IntTuple:
    """
    Rounds up elements of a using elements of b.
    """
    if isinstance(a, tuple):
        if not a:
            raise ValueError(f"inputs cannot be empty")
        if not isinstance(b, tuple):
            raise TypeError(
                f"expects both inputs to be tuple, but got {type(a)} and {type(b)}"
            )
        if rank(a) < rank(b):
            raise ValueError(
                f"expects rank(a) to be greater or equal than rank(b), but got {a}, {b}"
            )
        # Right-pad b with 1s so every mode of a has a rounding factor.
        b = append(b, 1, rank(a))
        return tuple(round_up(x, y) for x, y in zip(a, b))
    # Leaf case: classic integer round-up to a multiple of b.
    return ((a + b - 1) // b) * b


#
# Layout API (also used by tensors)
#


@dsl_user_op
def make_layout(
    shape: Shape, *, stride: Union[Stride, None] = None, loc=None, ip=None
) -> Layout:
    """Create a CuTe Layout object from shape and optional stride information.

    A Layout in CuTe represents the mapping between logical and physical coordinates of a tensor.
    This function creates a Layout object that defines how tensor elements are arranged in memory.

    :param shape: Shape of the layout defining the size of each mode
    :type shape: Shape
    :param stride: Optional stride values for each mode, defaults to None
    :type stride: Union[Stride, None]
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new Layout object with the specified shape and stride
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a 2D compact left-most layout with shape (4,4)
        layout = make_layout((4,4))  # compact left-most layout

        # Create a left-most layout with custom strides
        layout = make_layout((4,4), stride=(1,4))  # left-most layout with strides (1,4)

        # Create a layout for a 3D tensor
        layout = make_layout((32,16,8))  # left-most layout

        # Create a layout with custom strides
        layout = make_layout((2,2,2), stride=(4,1,2))  # layout with strides (4,1,2)

    Note:
        - If stride is not provided, a default compact left-most stride is computed based on the shape
        - The resulting layout maps logical coordinates to physical memory locations
        - The layout object can be used for tensor creation and memory access patterns
        - Strides can be used to implement:
            * Row-major vs column-major layouts
            * Padding and alignment
            * Blocked/tiled memory arrangements
            * Interleaved data formats
        - Stride is keyword only argument to improve readability, e.g.
            * make_layout((3,4), (1,4)) can be confusing with make_layout(((3,4), (1,4)))
            * make_layout((3,4), stride=(1,4)) is more readable
    """
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    if stride is not None:
        stride_val = _pack_stride(stride, loc=loc, ip=ip)
        layout_ty = _cute_ir.LayoutType.get(shape_val, stride_val)
    else:
        # No stride: the IR computes the compact left-most default.
        stride_val = None
        layout_ty = _cute_ir.LayoutType.get(shape_val)

    return _cute_ir.make_layout(
        layout_ty, shape=shape_val, stride=stride_val, loc=loc, ip=ip
    )


@dsl_user_op
def make_identity_layout(shape: Shape, *, loc=None, ip=None) -> Layout:
    """Create an identity layout with the given shape.

    An identity layout maps logical coordinates directly to themselves without any transformation.
    This is equivalent to a layout with stride (1@0,1@1,...,1@(N-1)).

    :param shape: The shape of the layout
    :type shape: Shape
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new identity Layout object with the specified shape
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a 2D identity layout with shape (4,4)
        layout = make_identity_layout((4,4))  # stride=(1@0,1@1)

        # Create a 3D identity layout
        layout = make_identity_layout((32,16,8))  # stride=(1@0,1@1,1@2)

    Note:
        - An identity layout is a special case where each coordinate maps to itself
        - Useful for direct coordinate mapping without any transformation
    """
    if not is_int_tuple(shape):
        raise TypeError(f"expects a shape input, got {type(shape)}")
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    return _cute_ir.make_identity_layout(shape_val, loc=loc, ip=ip)


@dsl_user_op
def make_ordered_layout(shape: Shape, order: Shape, *, loc=None, ip=None) -> Layout:
    """Create a layout with a specific ordering of dimensions.

    This function creates a layout where the dimensions are ordered according to the
    specified order parameter, allowing for custom dimension ordering in the layout.

    :param shape: The shape of the layout
    :type shape: Shape
    :param order: The ordering of dimensions
    :type order: Shape
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new Layout object with the specified shape and dimension ordering
    :rtype: Layout

    Examples:

    .. code-block:: python

        # Create a row-major layout
        layout = make_ordered_layout((4,4), order=(1,0))

        # Create a column-major layout
        layout = make_ordered_layout((4,4), order=(0,1))  # stride=(1,4)

        # Create a layout with custom dimension ordering for a 3D tensor
        layout = make_ordered_layout((32,16,8), order=(2,0,1))  # stride=(128,1,16)

    Note:
        - The order parameter specifies the ordering of dimensions from fastest-varying to slowest-varying
        - For a 2D tensor, (0,1) creates a column-major layout, while (1,0) creates a row-major layout
        - The length of order must match the rank of the shape
    """
    shape_val = _pack_shape(shape, loc=loc, ip=ip)
    order_val = _pack_shape(order, loc=loc, ip=ip)
    return _cute_ir.make_ordered_layout(
        shape=shape_val, order=order_val, loc=loc, ip=ip
    )


@dsl_user_op
def make_composed_layout(
    inner, offset: IntTuple, outer: Layout, *, loc=None, ip=None
) -> ComposedLayout:
    """Create a composed layout by composing an inner transformation with an outer layout.

    As described in the CuTe whitepaper, a composed layout applies a sequence of transformations
    to coordinates. The composition is defined as (inner ∘ offset ∘ outer), where the operations
    are applied from right to left.

    :param inner: The inner transformation (can be a Layout or Swizzle)
    :type inner: Union[Layout, Swizzle]
    :param offset: An integral offset applied between transformations
    :type offset: IntTuple
    :param outer: The outer (right-most) layout that is applied first
    :type outer: Layout
    :param loc: Source location information, defaults to None
    :type loc: Optional[Location]
    :param ip: Insertion point for IR generation, defaults to None
    :type ip: Optional[InsertionPoint]
    :return: A new ComposedLayout representing the composition
    :rtype: ComposedLayout

    Examples:

    .. code-block:: python

        # Create a basic layout
        inner = make_layout(...)
        outer = make_layout((4,4), stride=(E(0), E(1)))

        # Create a composed layout with an offset
        composed = make_composed_layout(inner, (2,0), outer)

    Note:
        - The composition applies transformations in the order: outer → offset → inner
        - The stride divisibility condition must be satisfied for valid composition
        - Certain compositions (like Swizzle with scaled basis) are invalid and will raise errors
        - Composed layouts inherit many properties from the outer layout
    """
    if not isinstance(outer, Layout):
        raise TypeError(
            f"expects the outer (or right-most or effectively visible) layout to be an affine layout, but got {outer}"
        )
    # Swizzle cannot compose with a scaled-basis outer stride.
    if isinstance(inner, Swizzle) and has_scaled_basis(outer.stride):
        raise TypeError(f"invalid composition {inner} o {offset} o {outer}")
    offset_val = _pack_int_tuple(offset, loc=loc, ip=ip)
    return _cute_ir.make_composed_layout(inner, offset_val, outer, loc=loc, ip=ip)


@dsl_user_op
def cosize(
    a: Union[Layout, ComposedLayout, Tensor], mode: List[int] = [], *, loc=None, ip=None
):
    """Return size of codomain of layout or tensor. Return static value if type is static.

    :param a: Layout, ComposedLayout, or Tensor object
    :type a: Union[Layout, ComposedLayout, Tensor]
    :param mode: List of mode(s) for cosize calculation
    :type mode: List[int], optional
    :param loc: Location information for diagnostics, defaults to None
    :type loc: optional
    :param ip: Instruction pointer for diagnostics, defaults to None
    :type ip: optional
    :return: Static size of layout or tensor (fast fold) if static, or a dynamic Value
    :rtype: Union[int, Value]
    """
    if any(not is_static(m) for m in mode):
        raise ValueError(f"expects static mode, but got {mode}")

    if isinstance(a, _Tensor):
        a = a.value
    res = _cute_ir.cosize(a, mode=mode, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def size_in_bytes(
    dtype: Type[Numeric], layout: Union[Layout, ComposedLayout], *, loc=None, ip=None
):
    """Calculate the size in bytes based on its data type and layout.

    :param dtype: The DSL numeric data type
    :type dtype: Type[Numeric]
    :param layout: The layout of the elements. If None, the function returns 0
    :type layout: Layout, optional
    :param loc: Location information for diagnostics, defaults to None
    :type loc: optional
    :param ip: Instruction pointer for diagnostics, defaults to None
    :type ip: optional
    :return: The total size in bytes. Returns 0 if the layout is None
    :rtype: int
    """
    if not isinstance(dtype, NumericMeta):
        raise TypeError(f"dtype must be a Numeric, but got {dtype}")

    if layout is None:
        return 0
    elif isinstance(layout, ComposedLayout):
        if not isinstance(layout.inner, Swizzle):
            raise TypeError(
                f"invalid composed layout {layout}, inner must be a Swizzle"
            )
        else:
            # Swizzle is a bijection on the codomain, so only the outer
            # layout's cosize matters for the byte count.
            return cosize(layout.outer, loc=loc, ip=ip) * dtype.width // 8
    else:
        return cosize(layout, loc=loc, ip=ip) * dtype.width // 8


@dsl_user_op
def coalesce(input, *, target_profile: Coord = None, loc=None, ip=None):
    """Coalesce a layout by flattening/merging modes via the IR op, optionally
    guided by ``target_profile`` (packed as a coordinate when provided).

    :param input: The layout (or IR value) to coalesce
    :param target_profile: Optional profile guiding the coalesced shape
    :type target_profile: Coord, optional
    :param loc: Source location for MLIR, defaults to None
    :param ip: Insertion point, defaults to None
    :return: The coalesced result from the IR op
    """
    if target_profile:
        profile_val = _pack_coord(target_profile, loc=loc, ip=ip)
        return _cute_ir.coalesce(input, target_profile=profile_val, loc=loc, ip=ip)
    else:
        return _cute_ir.coalesce(input, loc=loc, ip=ip)


@dsl_user_op
def crd2idx(coord: Coord, layout, *, loc=None, ip=None):
    """
    Convert a multi-dimensional coordinate into a value using the specified layout.

    This function computes the inner product of the flattened coordinate and stride:

        index = sum(flatten(coord)[i] * flatten(stride)[i] for i in range(len(coord)))

    :param coord: A tuple or list representing the multi-dimensional coordinate
        (e.g., (i, j) for a 2D layout).
    :type coord: Coord
    :param layout: A layout object that defines the memory storage layout, including shape and stride,
        used to compute the inner product.
    :type layout: Layout or ComposedLayout
    :param loc: Optional location information for IR diagnostics.
    :type loc: optional
    :param ip: Optional instruction pointer or context for underlying IR functions.
    :type ip: optional
    :returns: The result of applying the layout transformation to the provided coordinate.
    :rtype: Any type that the layout maps to

    Example:

    .. code-block:: python

        import cutlass.cute as cute

        @cute.jit
        def foo():
            L = cute.make_layout((5, 4), stride=(4, 1))
            idx = cute.crd2idx((2, 3), L)
            # Computed as: 2 * 4 + 3 = 11
            print(idx)

        foo()  # Expected output: 11
    """
    coord_val = _pack_coord(coord, loc=loc, ip=ip)
    # A plain shape (tuple/int) is promoted to a compact left-most layout.
    if isinstance(layout, (tuple, int)):
        layout = make_layout(layout, loc=loc, ip=ip)

    res = _cute_ir.crd2idx(coord_val, layout, loc=loc, ip=ip)
    return _unpack_x_tuple(res, loc=loc, ip=ip)


@dsl_user_op
def recast_layout(new_type_bits, old_type_bits, src_layout, *, loc=None, ip=None):
    """Recast ``src_layout`` from elements of ``old_type_bits`` bits to
    elements of ``new_type_bits`` bits via the IR op (thin wrapper — exact
    semantics defined by ``_cute_ir.recast_layout``)."""
    return _cute_ir.recast_layout(
        new_type_bits, old_type_bits, src_layout, loc=loc, ip=ip
    )


@dsl_user_op
def slice_and_offset(coord, src, *, loc=None, ip=None):
    """Return a pair of the sliced object and the linear offset of ``coord``:
    ``(slice_(src, coord), crd2idx(coord, src))``."""
    layout = slice_(src, coord, loc=loc, ip=ip)
    offset = crd2idx(coord, src, loc=loc, ip=ip)
    return layout, offset


@dsl_user_op
@lru_cache_ir()
def shape(
    input: Union[Shape, Tensor, Layout, Tile], *, mode=None, loc=None, ip=None
) -> Shape:
    """Returns the shape of a tensor, layout or tiler.

    For shapes, this function is identical to get.

    This function extracts the shape information from the input object. For tensors and layouts,
    it returns their internal shape property. For tilers, it unpacks the shape from the tile
    representation.

    :param input: The object to extract shape from
    :type input: Union[Tensor, Layout, Tile]
    :param mode: Optional mode selector to extract specific dimensions from the shape
    :type mode: Optional[int]
    :param loc: Source location for MLIR operation tracking
    :type loc: Optional[Location]
    :param ip: Insertion point for MLIR operation
    :type ip: Optional[InsertionPoint]
    :return: The shape of the input object, optionally filtered by mode
    :rtype: Shape

    Example:

    .. code-block:: python

        # Get shape of a layout
        l0 = cute.make_layout((2, 3, 4))
        s0 = cute.shape(l0)  # => (2, 3, 4)

        # Get shape of a hierarchical tiler
        l1 = cute.make_layout(1)
        s1 = cute.shape((l0, l1))  # => ((2, 3, 4), 1)

        # Get specific mode from a shape
        s2 = cute.shape(l0, mode=0)  # => 2
    """
    if is_int_tuple(input):
        return get(input, mode=mode)

    if isinstance(input, (Tensor, Layout)):
        shp = input.shape
    else:
        # Tiler path: pack and query the IR for the shape.
        val = _cute_ir.get_shape(_pack_tile(input, loc=loc, ip=ip))
        shp = _unpack_x_tuple(val, loc=loc, ip=ip)
    return get(shp, mode=mode)


#
# Pointer API
#


@dsl_user_op
def recast_ptr(
    ptr: Pointer,
    swizzle_=None,
    dtype: Optional[Type[Numeric]] = None,
    loc=None,
    ip=None,
) -> Pointer:
    """Recast a pointer to a new element dtype and/or swizzle, preserving its
    address space and alignment (thin wrapper over ``_cute_ir.recast_iter``)."""
    if dtype is not None:
        if not isclass(dtype) or not issubclass(dtype, Numeric):
            raise TypeError(f"dtype must be a type of Numeric, but got {dtype}")
        dtype = dtype.mlir_type

    value_type = ptr.type.value_type if dtype is None else dtype
    swizzle = swizzle_.type.attribute if swizzle_ is not None else None
    res_ty = _cute_ir.PtrType.get(
        value_type,
        AddressSpace(ptr.type.address_space),
        ptr.alignment,
        swizzle,
    )
    return _cute_ir.recast_iter(res_ty, ptr.value, loc=loc, ip=ip)


# NOTE(review): make_ptr continues past the end of this chunk; the visible
# portion is reproduced verbatim and not completed here.
@dsl_user_op
def make_ptr(
    dtype: Union[Type[Numeric], None],
    value,
    mem_space: AddressSpace = AddressSpace.generic,
    *,
    assumed_align=None,
    loc=None,
    ip=None,
) -> Pointer:
    if dtype is None or not isinstance(dtype, NumericMeta):
        raise TypeError(f"expects dtype to be a type of Numeric, but got {dtype}")

    if not is_integer(value):
        raise TypeError(f"expects integer value, but got {type(value)}")

    bytes_per_elt = max(1, dtype.width // 8)
    if assumed_align is None:
        assumed_align = bytes_per_elt

    if bytes_per_elt % assumed_align != 0 and assumed_align % bytes_per_elt != 0:
        raise ValueError(
            f"{bytes_per_elt=} is not a multiple of {assumed_align=} and vice versa."
+ ) + + value = Int32(value) if mem_space == AddressSpace.tmem else Int64(value) + aligned_ty = _cute_ir.ConstrainedIntType.get(assumed_align, type(value).width) + aligned_intptr = _cute_ir.assume(aligned_ty, value.ir_value(), loc=loc, ip=ip) + + ptr_ty = _cute_ir.PtrType.get( + T.i8() if dtype is None else dtype.mlir_type, mem_space, assumed_align + ) + return _cute_ir.inttoptr(ptr_ty, aligned_intptr, loc=loc, ip=ip) + + +# +# Tensor API +# + + +@dsl_user_op +def make_tensor( + iterator, layout: Union[Shape, Layout, ComposedLayout], *, loc=None, ip=None +) -> Tensor: + """Creates a tensor by composing an engine (iterator/pointer) with a layout. + + A tensor is defined as T = E ∘ L, where E is an engine (array, pointer, or counting iterator) + and L is a layout that maps logical coordinates to physical offsets. The tensor + evaluates coordinates by applying the layout mapping and dereferencing the engine + at the resulting offset. + + :param iterator: Engine component (pointer, iterator, or counting iterator) that provides + data access capabilities + :type iterator: Union[Pointer, IntTuple] + :param layout: Layout component that defines the mapping from logical coordinates to + physical offsets + :type layout: Union[Shape, Layout, ComposedLayout] + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A tensor object representing the composition E ∘ L + :rtype: Tensor + + :raises ValueError: If iterator type is not supported + + Examples: + + .. 
code-block:: python + + # Create a tensor with row-major layout + layout = make_layout((64, 128), stride=(128, 1)) + tensor = make_tensor(ptr, layout) + + # Create a tensor with hierarchical layout + layout = make_layout(((128, 8), (1, 4, 1)), stride=((32, 1), (0, 8, 4096))) + tensor = make_tensor(smem_ptr, layout) + + # Create a counting tensor + layout = make_layout(2, stride=16 * E(0)) + tensor = make_tensor(5, layout) + + Notes: + - The engine (iterator) must support random access operations + - Common engine types include raw pointers, arrays, and random-access iterators + - The layout defines both the shape (logical dimensions) and stride (physical mapping) + - Supports both direct coordinate evaluation T(c) and partial evaluation (slicing) + """ + if not isinstance(layout, (Layout, ComposedLayout)): + layout = make_layout(layout, loc=loc, ip=ip) + elif isinstance(layout, ComposedLayout) and layout.type.is_normal_layout: + layout = layout.outer + + ty = None + if is_integer(iterator) or isinstance(iterator, tuple): + iterator = _pack_int_tuple(iterator, loc=loc, ip=ip) + ty = _cute_ir.CountingTensorType.get(iterator.type, layout.type) + elif isinstance(iterator, Pointer): + iterator = iterator.value + ty = _cute_ir.MemRefType.get(iterator.type, layout.type) + else: + raise TypeError(f"unsupported iterator type, got {type(iterator)}") + + return _cute_ir.make_view(result=ty, iter=iterator, layout=layout, loc=loc, ip=ip) + + +@dsl_user_op +def make_identity_tensor(shape: Shape, *, loc=None, ip=None) -> Tensor: + """Creates an identity tensor with the given shape. + + An identity tensor maps each coordinate to itself, effectively creating a counting + sequence within the shape's bounds. This is useful for generating coordinate indices + or creating reference tensors for layout transformations. + + :param shape: The shape defining the tensor's dimensions. 
Can be a simple integer + sequence or a hierarchical structure ((m,n),(p,q)) + :type shape: Shape + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: A tensor that maps each coordinate to itself + :rtype: Tensor + + Examples: + + .. code-block:: python + + # Create a simple 1D counting tensor + tensor = make_identity_tensor(6) # [0,1,2,3,4,5] + + # Create a 2D counting tensor + tensor = make_identity_tensor((3,2)) # [(0,0),(1,0),(2,0),(0,1),(1,1),(2,1)] + + # Create hierarchical counting tensor + tensor = make_identity_tensor(((2,1),3)) + # [((0,0),0),((1,0),0),((0,0),1),((1,0),1),((0,0),2),((1,0),2)] + + Notes: + - The shape parameter follows CuTe's IntTuple concept + - Coordinates are ordered colexicographically + - Useful for generating reference coordinates in layout transformations + """ + shape_val = _pack_shape(shape, loc=loc, ip=ip) + return _cute_ir.make_identity_tensor(shape_val, loc=loc, ip=ip) + + +@dsl_user_op +def make_fragment( + layout_or_shape: Union[Layout, Shape], + dtype: Type[Numeric], + *, + loc=None, + ip=None, +) -> Tensor: + if not issubclass(dtype, Numeric): + raise TypeError(f"value_type must be a type of Numeric, but got {type(dtype)}") + elem_ty = dtype.mlir_type + + # Alignment for register memory is useless(?), pick-up large enough number + # to allow .128 (> 16B) load store + alignment = 32 + layout = None + if not isinstance(layout_or_shape, Layout): + layout = make_layout(layout_or_shape, loc=loc, ip=ip) + else: + layout = layout_or_shape + + ptr_ty = _cute_ir.PtrType.get(elem_ty, AddressSpace.rmem, alignment) + res_ty = _cute_ir.MemRefType.get(ptr_ty, layout.type) + tensor = _cute_ir.memref_alloca(res_ty, layout=layout, loc=loc, ip=ip) + return _Tensor(tensor.value, dtype) + + +@overload +def make_fragment_like( + src: Tensor, dtype: Optional[Type[Numeric]], *, 
loc=None, ip=None +) -> Tensor: ... + + +@overload +def make_fragment_like(src: Layout, *, loc=None, ip=None) -> Layout: ... + + +@overload +def make_fragment_like(src: ComposedLayout, *, loc=None, ip=None) -> ComposedLayout: ... + + +@dsl_user_op +def make_fragment_like(src, dtype=None, *, loc=None, ip=None): + """Create tensor with a compact layout in the same shape as the source on stack. + + This function either creates a fragment tensor with compact layout in + same shape as the source layout or a new layout with the same shape as the source. + The strides of the new layout follow the order induced by the source's strides, with a + special handling of the 0th mode: it is always stride-1 and generated in column-major order + (LayoutLeft). + + :param src: The source layout or tensor whose shape will be matched + :type src: Union[Layout, ComposedLayout, Tensor] + :param dtype: The element type for the fragment tensor, defaults to None + :type dtype: Type[Numeric], optional + :param loc: Source location for MLIR operations, defaults to None + :type loc: Location, optional + :param ip: Insertion point for MLIR operations, defaults to None + :type ip: InsertionPoint, optional + + :return: A new layout or fragment tensor with matching shape + :rtype: Union[Layout, Tensor] + + **Examples** + + Creating a rmem tensor from a tensor: + + .. code-block:: python + + smem_tensor = cute.make_tensor(smem_ptr, layout) + frag_tensor = cute.make_fragment_like(smem_tensor, cutlass.Float32) + # frag_tensor will be a register-backed tensor with the same shape + + Creating a fragment with a different element type: + + .. code-block:: python + + tensor = cute.make_tensor(gmem_ptr, layout) + bool_frag = cute.make_fragment_like(tensor, cutlass.Boolean) + # bool_frag will be a register-backed tensor with Boolean elements + + **Notes** + + - When used with a Tensor, if a type is provided, it will create a new + fragment tensor with that element type. 
+ - For layouts with ScaledBasis strides, the function creates a fragment + from the shape only. + - This function is commonly used in GEMM and other tensor operations to + create register storage for intermediate results. + + """ + if isinstance(src, (Layout, ComposedLayout)): + new_layout = None + # Create base fragment layout + if isinstance(src, Layout) and has_scaled_basis(src.stride): + # For scaled basis strides, create fragment from shape only + new_layout = _cute_ir.make_fragment_like( + make_layout(src.shape), loc=loc, ip=ip + ) + else: + # Otherwise use full source layout + new_layout = _cute_ir.make_fragment_like(src, loc=loc, ip=ip) + if dtype is not None: + # call make_fragment to convert layout to tensor + return make_fragment(new_layout, dtype, loc=loc, ip=ip) + else: + return new_layout + elif isinstance(src, Tensor): + if isinstance(src.type, _cute_ir.CountingTensorType): + if dtype is None: + raise ValueError( + "dtype must be provided when src is a coordinate tensor" + ) + + new_layout = _cute_ir.make_fragment_like( + make_layout(src.shape), loc=loc, ip=ip + ) + return make_fragment(new_layout, dtype, loc=loc, ip=ip) + else: + if dtype is None: + ty = src.element_type.mlir_type + else: + ty = dtype.mlir_type + new_tensor = _cute_ir.make_fragment_like( + src.value, elem_type=ty, loc=loc, ip=ip + ) + return _Tensor( + new_tensor.value, dtype if dtype is not None else src.element_type + ) + else: + raise TypeError( + f"src must be a Layout or ComposedLayout or tensor, got {type(src)}" + ) + + +@dsl_user_op +def recast_tensor( + src: Tensor, dtype: Type[Numeric], swizzle_=None, *, loc=None, ip=None +): + if not isclass(dtype) or not issubclass(dtype, Numeric): + raise TypeError(f"dtype must be a type of Numeric, but got {dtype}") + + if dtype is Boolean: + dst_width = 8 + else: + dst_width = dtype.width + + if src.element_type is Boolean: + src_width = 8 + else: + src_width = src.element_type.width + + src_iter = recast_ptr(src.iterator, 
dtype=dtype, loc=loc, ip=ip) + src_layout = recast_layout(dst_width, src_width, src.layout, loc=loc, ip=ip) + return make_tensor(src_iter, src_layout, loc=loc, ip=ip) + + +@dsl_user_op +def domain_offset(coord: Coord, tensor: Tensor, *, loc=None, ip=None) -> Tensor: + offset = crd2idx(coord, tensor.layout, loc=loc, ip=ip) + if isinstance(tensor.iterator, Pointer): + return make_tensor(tensor.iterator + offset, tensor.layout) + elif is_integer(tensor.iterator) or isinstance(tensor.iterator, tuple): + new_iter = _cute_ir.add_offset( + _pack_int_tuple(tensor.iterator), _pack_int_tuple(offset) + ) + return make_tensor(_unpack_x_tuple(new_iter), tensor.layout) + else: + raise ValueError(f"unsupported tensor for domain_offset, got {tensor}") + + +# +# Layout algebra +# + + +@overload +def composition( + lhs: Layout, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None +) -> Layout: ... + + +@overload +def composition( + lhs: Tensor, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None +) -> Tensor: ... + + +@dsl_user_op +def composition(lhs, rhs: Union[Layout, Shape, Tile], *, loc=None, ip=None): + """ + Compose two layout representations using the CuTe layout algebra. + + Compose a left-hand layout (or tensor) with a right-hand operand into a new layout R, such that + for every coordinate c in the domain of the right-hand operand, the composed layout satisfies: + + R(c) = A(B(c)) + + where A is the left-hand operand provided as ``lhs`` and B is the right-hand operand provided as + ``rhs``. In this formulation, B defines the coordinate domain while A applies its transformation to + B's output, and the resulting layout R inherits the stride and shape adjustments from A. + + Satisfies: + cute.shape(cute.composition(lhs, rhs)) is compatible with cute.shape(rhs) + + :param lhs: The left-hand operand representing the transformation to be applied. + :type lhs: Layout or Tensor + :param rhs: The right-hand operand defining the coordinate domain. 
If provided as an int or tuple, + it will be converted to a tile layout. + :type rhs: Layout, Shape, or Tile, or int or tuple + :param loc: Optional location information for IR diagnostics. + :type loc: optional + :param ip: Optional instruction pointer or context for underlying IR functions. + :type ip: optional + :returns: A new composed layout R, such that for all coordinates c in the domain of ``rhs``, + R(c) = lhs(rhs(c)). + :rtype: Layout or Tensor + + Example: + + .. code-block:: python + + import cutlass.cute as cute + @cute.jit + def foo(): + # Create a layout that maps (i,j) to i*4 + j + L1 = cute.make_layout((2, 3), stride=(4, 1)) + # Create a layout that maps (i,j) to i*3 + j + L2 = cute.make_layout((3, 4), stride=(3, 1)) + # Compose L1 and L2 + L3 = cute.composition(L1, L2) + # L3 now maps coordinates through L2 then L1 + """ + rhs_val = rhs + if not isinstance(rhs, Layout) and isinstance(rhs, (int, tuple)): + rhs_val = _pack_tile(rhs, loc=loc, ip=ip) + if isinstance(lhs, _Tensor): + lhs = lhs.value + return _cute_ir.composition(lhs, rhs_val, loc=loc, ip=ip) + + +@dsl_user_op +def complement( + input: Layout, cotarget: Union[Layout, Shape], *, loc=None, ip=None +) -> Layout: + """ + Compute the complement layout of the input layout with respect to the cotarget. + + The complement of a layout A with respect to cotarget n is a layout A* such that + for every k in Z_n and c in the domain of A, there exists a unique c* in the domain + of A* where k = A(c) + A*(c*). + + This operation is useful for creating layouts that partition a space in complementary ways, + such as row and column layouts that together cover a matrix. 
+ + :param input: The layout to compute the complement of + :type input: Layout + :param cotarget: The target layout or shape that defines the codomain + :type cotarget: Union[Layout, Shape] + :param loc: Optional location information for IR diagnostics + :type loc: optional + :param ip: Optional instruction pointer or context for underlying IR functions + :type ip: optional + :returns: The complement layout + :rtype: Layout + + Example: + + .. code-block:: python + + import cutlass.cute as cute + @cute.jit + def foo(): + # Create a right-major layout for a 4x4 matrix + row_layout = cute.make_layout((4, 4), stride=(4, 1)) + # Create a left-major layout that complements the row layout + col_layout = cute.complement(row_layout, 16) + # The two layouts are complementary under 16 + """ + if isinstance(cotarget, Layout): + return _cute_ir.complement(input, cotarget=cotarget, loc=loc, ip=ip) + else: + cotarget_val = _pack_shape(cotarget, loc=loc, ip=ip) + return _cute_ir.complement(input, cotarget=cotarget_val, loc=loc, ip=ip) + + +@dsl_user_op +def right_inverse(input: Layout, *, loc=None, ip=None) -> Layout: + if not isinstance(input, Layout): + raise TypeError(f"expects input of type Layout, but got {type(Layout)}") + return _cute_ir.right_inverse(input=input, loc=loc, ip=ip) + + +@dsl_user_op +def left_inverse(input: Layout, *, loc=None, ip=None) -> Layout: + if not isinstance(input, Layout): + raise TypeError(f"expects input of type Layout, but got {type(Layout)}") + return _cute_ir.left_inverse(input=input, loc=loc, ip=ip) + + +@overload +def logical_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ... +@overload +def logical_product( + block: ComposedLayout, tiler: Layout, *, loc=None, ip=None +) -> ComposedLayout: ... 
# --- Layout product family ----------------------------------------------
# Each *_product function below is a thin Python binding over the matching
# CuTe IR operation.  The @overload stub pairs only refine the static
# types seen by tooling (Layout in -> Layout out, ComposedLayout in ->
# ComposedLayout out); the @dsl_user_op implementation forwards directly
# to the IR op.
@dsl_user_op
def logical_product(block, tiler: Layout, *, loc=None, ip=None):
    """Logical product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.logical_product`` op.

    :param block: The layout operand being replicated
    :type block: Union[Layout, ComposedLayout]
    :param tiler: The tiler layout operand
    :type tiler: Layout
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The product layout produced by the IR op
    """
    return _cute_ir.logical_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def zipped_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def zipped_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def zipped_product(block, tiler: Layout, *, loc=None, ip=None):
    """Zipped product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.zipped_product`` op.
    """
    return _cute_ir.zipped_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def tiled_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def tiled_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def tiled_product(block, tiler: Layout, *, loc=None, ip=None):
    """Tiled product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.tiled_product`` op.
    """
    return _cute_ir.tiled_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def flat_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def flat_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def flat_product(block, tiler: Layout, *, loc=None, ip=None):
    """Flat product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.flat_product`` op.
    """
    return _cute_ir.flat_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def raked_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def raked_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...


@dsl_user_op
def raked_product(block, tiler: Layout, *, loc=None, ip=None):
    """Raked product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.raked_product`` op.
    """
    return _cute_ir.raked_product(input=block, tiler=tiler, loc=loc, ip=ip)


@overload
def blocked_product(block: Layout, tiler: Layout, *, loc=None, ip=None) -> Layout: ...
@overload
def blocked_product(
    block: ComposedLayout, tiler: Layout, *, loc=None, ip=None
) -> ComposedLayout: ...
@dsl_user_op
def blocked_product(block, tiler: Layout, *, loc=None, ip=None):
    """Blocked product of ``block`` with ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.blocked_product`` op.

    :param block: The layout operand being replicated
    :type block: Union[Layout, ComposedLayout]
    :param tiler: The tiler layout operand
    :type tiler: Layout
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The product layout produced by the IR op
    """
    return _cute_ir.blocked_product(input=block, tiler=tiler, loc=loc, ip=ip)


# --- Layout divide family -----------------------------------------------
# Each *_divide function below accepts either a Layout or a Tensor target
# and a Tiler which may be given as a plain Python tuple.  Tensors are
# unwrapped to their underlying IR value and tuple tilers are packed into
# an IR tile value before forwarding to the matching CuTe IR op.  The
# @overload stub pairs only refine static types (Layout in -> Layout out,
# Tensor in -> Tensor out).
@overload
def logical_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def logical_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def logical_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Logical divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.logical_divide`` op.

    :param target: The layout or tensor being divided
    :type target: Union[Layout, Tensor]
    :param tiler: The tiler; a tuple is packed into an IR tile first
    :type tiler: Tiler
    :param loc: Location information for diagnostics, defaults to None
    :param ip: Insertion point for IR emission, defaults to None
    :return: The divided layout or tensor produced by the IR op
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.logical_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def zipped_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def zipped_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def zipped_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Zipped divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.zipped_divide`` op.
    Accepts the same target/tiler forms as :func:`logical_divide`.
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.zipped_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def tiled_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def tiled_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...


@dsl_user_op
def tiled_divide(target, tiler: Tiler, *, loc=None, ip=None):
    """Tiled divide of ``target`` by ``tiler``.

    Thin wrapper that forwards to the ``_cute_ir.tiled_divide`` op.
    Accepts the same target/tiler forms as :func:`logical_divide`.
    """
    if isinstance(target, _Tensor):
        # Unwrap the Python-level tensor to its underlying IR value
        target = target.value
    if isinstance(tiler, tuple):
        # Pack a plain tuple tiler into an IR tile value
        tiler = _pack_tile(tiler, loc=loc, ip=ip)
    return _cute_ir.tiled_divide(input=target, tiler=tiler, loc=loc, ip=ip)


@overload
def flat_divide(target: Layout, tiler: Tiler, *, loc=None, ip=None) -> Layout: ...
@overload
def flat_divide(target: Tensor, tiler: Tiler, *, loc=None, ip=None) -> Tensor: ...
+ + +@dsl_user_op +def flat_divide(target, tiler: Tiler, *, loc=None, ip=None): + if isinstance(target, _Tensor): + target = target.value + if isinstance(tiler, tuple): + tiler = _pack_tile(tiler, loc=loc, ip=ip) + return _cute_ir.flat_divide(input=target, tiler=tiler, loc=loc, ip=ip) + + +# +# Higher-level utilties +# + + +@dsl_user_op +def max_common_layout( + a: Union[Layout, Tensor], b: Union[Layout, Tensor], *, loc=None, ip=None +) -> Layout: + a_layout = a.layout if isinstance(a, _Tensor) else a + b_layout = b.layout if isinstance(b, _Tensor) else b + + inv_b = right_inverse(b_layout, loc=loc, ip=ip) + common = coalesce(composition(a_layout, inv_b, loc=loc, ip=ip), loc=loc, ip=ip) + + # some_ir_value == 1 generates a new IR Value which evaluates to True! + s = get(common.shape, mode=[0], loc=loc, ip=ip) + d = get(common.stride, mode=[0], loc=loc, ip=ip) + # Keep only the static identity component of the common layout + if isinstance(s, int) and isinstance(d, int) and d == 1: + # Truncate to the size of the contiguous vector (static stride-1 mode) + return composition(inv_b, get(common, mode=[0], loc=loc, ip=ip), loc=loc, ip=ip) + else: + return make_layout(1, stride=0, loc=loc, ip=ip) + + +@dsl_user_op +def max_common_vector( + a: Union[Layout, Tensor], b: Union[Layout, Tensor], *, loc=None, ip=None +) -> int: + a_layout = a.layout if isinstance(a, _Tensor) else a + b_layout = b.layout if isinstance(b, _Tensor) else b + + inv_b = right_inverse(b_layout, loc=loc, ip=ip) + common = coalesce(composition(a_layout, inv_b, loc=loc, ip=ip), loc=loc, ip=ip) + + # Keep only the static identity component of the common layout + if ( + is_static(get(common.shape, mode=[0], loc=loc, ip=ip)) + and get(common.stride, mode=[0], loc=loc, ip=ip) == 1 + ): + # Truncate to the size of the contiguous vector (static stride-1 mode) + return get(common.shape, mode=[0], loc=loc, ip=ip) + else: + return 1 + + +@dsl_user_op +def tile_to_shape( + atom: Union[Layout, ComposedLayout], + 
trg_shape: Shape, + order: Shape, + *, + loc=None, + ip=None, +) -> Union[Layout, ComposedLayout]: + trg_shape = _pack_shape(shape(trg_shape), loc=loc, ip=ip) + order = _pack_shape(order, loc=loc, ip=ip) + return _cute_ir.tile_to_shape(atom, trg_shape, order, loc=loc, ip=ip) + + +@dsl_user_op +def local_partition( + target: Tensor, + tiler: Union[Layout, Shape], + index, + proj: XTuple = 1, + *, + loc=None, + ip=None, +) -> Tensor: + return _cute_ir.local_partition( + input=target.value, tiler=dice(tiler, proj), index=index, loc=loc, ip=ip + ) + + +@dsl_user_op +def local_tile( + input: Tensor, + tiler: Union[Layout, Shape], + coord: Coord, + proj: XTuple = None, + *, + loc=None, + ip=None, +) -> Tensor: + tiler_val = _pack_shape(tiler, loc=loc, ip=ip) + coord_val = _pack_coord(coord, loc=loc, ip=ip) + if proj is not None: + if not isinstance(proj, tuple): + raise TypeError(f"Expects tuple for proj, but got {type(proj)}") + proj_val = _pack_coord(proj, loc=loc, ip=ip) + proj = proj_val.type.attribute + + return _cute_ir.local_tile( + input=input.value, + tile=tiler_val, + static_tile=None, + coord=coord_val, + static_coord=None, + proj=proj, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def make_layout_image_mask( + lay: Layout, coord: Coord, mode: int, *, loc=None, ip=None +) -> Int16: + """ + Makes a 16-bit integer mask of the image of a layout sliced at a given mode + and accounting for the offset given by the input coordinate for the other modes. 
+ """ + if not is_static(lay): + raise ValueError( + f"make_layout_image_mask requires the layout to be static, but got {pretty_str(lay)}" + ) + r = rank(lay) + if rank(coord) != r: + raise ValueError( + f"the rank of the coordinate must be equal to the one of the layout, but got {pretty_str(coord)}" + ) + if mode > r or mode < 0: + raise ValueError(f"expects `mode` to be in [0,rank(lay)), but got {mode}") + # Given that we require the layout to be static, we can check that the mask fits in 16 bits + # This might be too conservative but safe + if cosize(lay) > 16: + raise ValueError("the mask may not fit into a 16-bit integer") + + # Replace the mode to keep with _ in the coordinate + slicer = tuple(None if idx == mode else x for idx, x in enumerate(coord)) + # Slice the layout with the slicer above and keep track of the offset + sliced_lay, offset = slice_and_offset(slicer, lay, loc=loc, ip=ip) + # Given that we replace only one mode with _, the rank of the slice should be 1 + assert rank(sliced_lay) == 1 + + # Create the mask of the image + mcast_mask = Int16(0) + for i in range(size(sliced_lay)): + mcast_mask = mcast_mask | (1 << sliced_lay(i)) + mcast_mask <<= offset + return Int16(mcast_mask) + + +#################################################################################################### +# +# Atom +# +#################################################################################################### + + +class Op(ABC): + """ + Operation abstract base class. + """ + + pass + + +class MmaOp(Op): + """ + MMA Operation abstract base class. + """ + + @abstractmethod + def _make_trait(self, *, loc=None, ip=None, **kwargs): + pass + + +class CopyOp(Op): + """ + Copy Operation abstract base class. + """ + + @abstractmethod + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ): + pass + + +class Trait(ABC): + """ + Trait abstract base class. 
+ + Traits are internal-only classes used by Atoms that wrap the underlying IR Value. The Python + user should only interact with Ops and Atoms. + """ + + def __init__(self, value: ir.Value) -> None: + self.value = value + + def __extract_mlir_values__(self): + return [self.value] + + def __new_from_mlir_values__(self, values): + return self.__class__(values[0]) + + def set(self, field, value, *, loc=None, ip=None) -> None: + raise NotImplementedError( + "set not implemented, the requesting Atom has likely no runtime state" + ) + + def unpack(self, *, loc=None, ip=None, **kwargs) -> ir.Value: + return self.value + + +class Atom(ABC): + """ + Atom base class. + + An Atom is the composition of + + - a MMA or Copy Operation; + - an internal MMA or Copy Trait. + + An Operation is a pure Python class that is used to model a specific MMA or Copy instruction. + The Trait wraps the underlying IR Value and provides access to the metadata of the instruction + encoded using CuTe Layouts. When the Trait can be constructed straighforwardly from an + Operation, the ``make_mma_atom`` or ``make_copy_atom`` API should be used. There are cases where + constructing the metadata is not trivial and requires more information, for example to determine + the number of bytes copied per TMA instruction ("the TMA vector length"). In such cases, + dedicated helper functions are provided with an appropriate API such that the Atom is + constructed internally in an optimal fashion for the user. 
+ """ + + def __init__(self, op: Op, trait: Trait) -> None: + self._op = op + self._trait = trait + + def __extract_mlir_values__(self): + return extract_mlir_values(self._trait) + + def __new_from_mlir_values__(self, values): + return self.__class__(self.op, new_from_mlir_values(self._trait, values)) + + @property + def op(self) -> Op: + return self._op + + @property + def type(self): + return self._trait.value.type + + @dsl_user_op + def set(self, modifier, value, *, loc=None, ip=None) -> None: + """ + Sets runtime fields of the Atom. + + Some Atoms have runtime state, for example a tcgen05 MMA Atom + + + .. code-block:: python + + tiled_mma = cute.make_tiled_mma(some_tcgen05_mma_op) + tiled_mma.set(cute.nvgpu.tcgen05.Field.ACCUMULATE, True) + + The ``set`` method provides a way to the user to modify such runtime state. Modifiable + fields are provided by arch-specific enumerations, for example ``tcgen05.Field``. The Atom + instance internally validates the field as well as the value provided by the user to set + the field to. + """ + self._trait.set(modifier, value, loc=loc, ip=ip) + + def _unpack(self, *, loc=None, ip=None, **kwargs) -> ir.Value: + return self._trait.unpack(loc=loc, ip=ip, **kwargs) + + +#################################################################################################### +# +# MMA Atoms, TiledMma, and ThrMma +# +#################################################################################################### + + +class MmaAtom(Atom): + """ + The MMA Atom class. 
+ """ + + def __str__(self) -> str: + res = "MMA Atom\n" + res += " ThrID: " + pretty_str(self.thr_id) + "\n" + res += " Shape MNK: " + pretty_str(self.shape_mnk) + "\n" + res += " TV Layout A: " + pretty_str(self.tv_layout_A) + "\n" + res += " TV Layout B: " + pretty_str(self.tv_layout_B) + "\n" + res += " TV Layout C: " + pretty_str(self.tv_layout_C) + return res + + # + # Properties + # + + @property + def thr_id(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_id) + + @property + def shape_mnk(self) -> Shape: + return _unpack_x_tuple(self._trait.value.type.shape_mnk) + + @property + def tv_layout_A(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_a_tv) + + @property + def tv_layout_B(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_b_tv) + + @property + def tv_layout_C(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_c_tv) + + # + # make_fragment + # + + @dsl_user_op + def make_fragment_A(self, input, *, loc=None, ip=None): + # input could be memref/shape/layout for tmem based fragment + if isinstance(input, _Tensor): + input = input.value + if isinstance(input, tuple): + input = _pack_shape(input, loc=loc, ip=ip) + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.A, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def make_fragment_B(self, input, *, loc=None, ip=None): + if isinstance(input, _Tensor): + input = input.value + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.B, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def make_fragment_C(self, input, *, loc=None, ip=None): + # input could be memref/shape/layout for tmem based fragment + if isinstance(input, _Tensor): + input = input.value + if isinstance(input, tuple): + input = _pack_shape(input, loc=loc, ip=ip) + return _cute_ir.mma_make_fragment( + _cute_ir.MmaOperand.C, + self._trait.value, + input, + loc=loc, + ip=ip, + ) + + +class 
TiledMma(MmaAtom): + """ + The tiled MMA class. + """ + + def __str__(self) -> str: + res = "Tiled MMA\n" + res += " Thr Layout VMNK: " + pretty_str(self.thr_layout_vmnk) + "\n" + res += " Permutation MNK: " + pretty_str(self.permutation_mnk) + "\n" + res += "MMA Atom\n" + res += " ThrID: " + pretty_str(self.thr_id) + "\n" + res += " Shape MNK: " + pretty_str(self.shape_mnk) + "\n" + res += " TV Layout A: " + pretty_str(self.tv_layout_A) + "\n" + res += " TV Layout B: " + pretty_str(self.tv_layout_B) + "\n" + res += " TV Layout C: " + pretty_str(self.tv_layout_C) + return res + + # + # Properties + # + + @property + def tv_layout_A_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_a_tv_tiled) + + @property + def tv_layout_B_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_b_tv_tiled) + + @property + def tv_layout_C_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_c_tv_tiled) + + @property + def permutation_mnk(self) -> Tile: + return _unpack_x_tuple(self._trait.value.type.permutation_mnk) + + @property + def thr_layout_vmnk(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_layout_vmnk) + + @property + def size(self) -> int: + return self._trait.value.type.size + + # + # Tiler + # + + def get_tile_size(self, mode_idx: int) -> Shape: + assert (mode_idx >= 0) and (mode_idx < 3) + perm_tile = self.permutation_mnk[mode_idx] + if perm_tile is None: + thr_layout_vmnk = self.thr_layout_vmnk + atom_shape_mnk = self.shape_mnk + return size(atom_shape_mnk, mode=[mode_idx]) * size( + thr_layout_vmnk, mode=[mode_idx + 1] + ) + else: + return size(perm_tile) + + # + # get_slice + # + + def get_slice(self, thr_idx: Union[int, Int32]) -> "ThrMma": + return ThrMma(self.op, self._trait, thr_idx) + + # + # partition_shape + # + + def _partition_shape(self, operand_id, shape, *, loc=None, ip=None): + shape = _pack_shape(shape, loc=loc, ip=ip) + return _unpack_x_tuple( + 
_cute_ir.tiled_mma_partition_shape( + operand_id, self._trait.value, shape, loc=loc, ip=ip + ), + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_shape_A(self, shape_mk, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.A, shape_mk, loc=loc, ip=ip) + + @dsl_user_op + def partition_shape_B(self, shape_nk, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.B, shape_nk, loc=loc, ip=ip) + + @dsl_user_op + def partition_shape_C(self, shape_mn, *, loc=None, ip=None): + return self._partition_shape(_cute_ir.MmaOperand.C, shape_mn, loc=loc, ip=ip) + + # + # _thrfrg + # + + @overload + def _thrfrg(self, operand_id, input: Layout, *, loc=None, ip=None) -> Layout: ... + + @overload + def _thrfrg(self, operand_id, input: Tensor, *, loc=None, ip=None) -> Tensor: ... + + def _thrfrg(self, operand_id, input, *, loc=None, ip=None) -> Union[Tensor, Layout]: + if isinstance(input, Tensor): + return make_tensor( + input.iterator, + self._thrfrg(operand_id, input.layout, loc=loc, ip=ip), + ) + elif isinstance(input, Layout): + if not is_static(input.type): + raise ValueError(f"Expects a static layout but got {input.type}") + return _cute_ir.static( + self._trait.value.type.thrfrg(operand_id, input), loc=loc, ip=ip + ) + + raise ValueError( + f"Expects a layout or a tensor as input but got {type(input)=}" + ) + + def _thrfrg_A( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.A, input, loc=loc, ip=ip) + + def _thrfrg_B( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.B, input, loc=loc, ip=ip) + + def _thrfrg_C( + self, input: Union[Layout, Tensor], *, loc=None, ip=None + ) -> Union[Layout, Tensor]: + return self._thrfrg(_cute_ir.MmaOperand.C, input, loc=loc, ip=ip) + + +class ThrMma(TiledMma): + """ + The thread MMA class for modeling a thread-slice of a tiled MMA. 
+ """ + + def __init__(self, op: Op, trait: Trait, thr_idx: Union[int, Int32]) -> None: + super().__init__(op, trait) + self._thr_idx = thr_idx + + def __new_from_mlir_values__(self, values): + return self.__class__( + self.op, new_from_mlir_values(self._trait, values), self.thr_idx + ) + + @property + def thr_idx(self): + return self._thr_idx + + @dsl_user_op + def partition_A(self, input_mk: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.A, + self._trait.value, + input_mk.value, + thr_idx, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_B(self, input_nk: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.B, + self._trait.value, + input_nk.value, + thr_idx, + loc=loc, + ip=ip, + ) + + @dsl_user_op + def partition_C(self, input_mn: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_mma_partition( + _cute_ir.MmaOperand.C, + self._trait.value, + input_mn.value, + thr_idx, + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def make_mma_atom(op: MmaOp, *, loc=None, ip=None, **kwargs) -> MmaAtom: + """ + Makes an MMA Atom from an MMA Operation. + + This function creates an MMA Atom from a given MMA Operation. Arbitrary kw arguments can be + provided for Op-specific additional parameters. They are not used as of today. + + :param op: The MMA Operation to construct an Atom for + :type op: MmaOp + :return: The MMA Atom + :rtype: MmaAtom + """ + trait = op._make_trait(loc=loc, ip=ip, **kwargs) + return MmaAtom(op, trait) + + +@dsl_user_op +def make_tiled_mma( + op_or_atom: Union[Op, MmaAtom], + atom_layout_mnk=(1, 1, 1), + permutation_mnk=None, + *, + loc=None, + ip=None, + **kwargs, +) -> TiledMma: + """ + Makes a tiled MMA from an MMA Operation or an MMA Atom. 
+ + :param op_or_atom: The MMA Operation or Atom + :type op_or_atom: Union[Op, MmaAtom] + :param atom_layout_mnk: A Layout describing the tiling of Atom across threads + :type atom_layout_mnk: Layout + :param permutation_mnk: A permutation Tiler describing the tiling of Atom across values including any permutation of such tiling + :type permutation_mnk: Tiler + :return: The resulting tiled MMA + :rtype: TiledMma + """ + if isinstance(op_or_atom, Op): + op = op_or_atom + atom = make_mma_atom(op_or_atom, loc=loc, ip=ip, **kwargs) + elif isinstance(op_or_atom, MmaAtom): + op = op_or_atom.op + atom = op_or_atom + else: + raise TypeError( + f"expected an MMA Op or Atom, but got an instance of {type(op_or_atom)}" + ) + if isinstance(atom_layout_mnk, tuple): + atom_layout_mnk = make_layout(atom_layout_mnk, loc=loc, ip=ip) + if rank(atom_layout_mnk) != 3: + raise ValueError(f"expects rank-3 MNK atom layout, but got {atom_layout_mnk}") + permutation_mnk_ty = None + if permutation_mnk is not None: + permutation_mnk_ty = _pack_tile(permutation_mnk, loc=loc, ip=ip).type + ty = _cute_nvgpu_ir.TiledMmaType.get( + atom._trait.value.type, + atom_layout_mnk.type, + permutation_mnk_ty, + ) + val = _cute_ir.make_tiled_mma(ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledMma(op, trait) + + +#################################################################################################### +# +# Copy Atoms, TiledCopy, and ThrCopy +# +#################################################################################################### + + +class CopyAtom(Atom): + """ + The Copy Atom class. 
+ """ + + def __str__(self) -> str: + res = "Copy Atom\n" + res += " ThrID: " + str(self.thr_id) + "\n" + res += " TV Layout Src: " + str(self.layout_src_tv) + "\n" + res += " TV Layout Dst: " + str(self.layout_dst_tv) + "\n" + res += " Value type: " + str(self._trait.value.type.value_type) + return res + + # + # Properties + # + + @property + def value_type(self) -> Type[Numeric]: + return Numeric.from_mlir_type(self._trait.value.type.value_type) + + @property + def thr_id(self) -> Layout: + return _cute_ir.static(self._trait.value.type.thr_id) + + @property + def layout_src_tv(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_src_tv) + + @property + def layout_dst_tv(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_dst_tv) + + +class TiledCopy(CopyAtom): + """ + The tiled Copy class. + """ + + def __str__(self) -> str: + res = "Tiled Copy\n" + res += " Tiler MN: " + pretty_str(self.tiler_mn) + "\n" + res += " TV Layout tiled: " + str(self.layout_tv_tiled) + "\n" + res += "Copy Atom\n" + res += " ThrID: " + str(self.thr_id) + "\n" + res += " TV Layout Src: " + str(self.layout_src_tv) + "\n" + res += " TV Layout Dst: " + str(self.layout_dst_tv) + "\n" + res += " Value type: " + str(self._trait.value.type.value_type) + return res + + # + # Properties + # + + @property + def layout_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_tv_tiled) + + @property + def tiler_mn(self) -> Tile: + return _unpack_x_tuple(self._trait.value.type.tiler_mn) + + @property + def layout_src_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_src_tv_tiled) + + @property + def layout_dst_tv_tiled(self) -> Layout: + return _cute_ir.static(self._trait.value.type.layout_dst_tv_tiled) + + @property + def size(self) -> int: + return self._trait.value.type.size + + # + # get_slice and retile + # + + def get_slice(self, thr_idx: Union[int, Int32]) -> "ThrCopy": + return ThrCopy(self.op, 
self._trait, thr_idx) + + @dsl_user_op + def retile(self, src, *, loc=None, ip=None): + return _cute_ir.tiled_copy_retile( + tiled_copy=self._trait.value, input=src.value, loc=loc, ip=ip + ) + + +class ThrCopy(TiledCopy): + """ + The thread Copy class for modeling a thread-slice of a tiled Copy. + """ + + def __init__(self, op: Op, trait: Trait, thr_idx: Union[int, Int32]) -> None: + super().__init__(op, trait) + self._thr_idx = thr_idx + + def __new_from_mlir_values__(self, values): + return self.__class__( + self.op, new_from_mlir_values(self._trait, values), self.thr_idx + ) + + @property + def thr_idx(self): + return self._thr_idx + + @dsl_user_op + def partition_S(self, src: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_copy_partition_S( + self._trait.value, src.value, thr_idx, loc=loc, ip=ip + ) + + @dsl_user_op + def partition_D(self, dst: Tensor, *, loc=None, ip=None) -> Tensor: + thr_idx = _pack_coord(self.thr_idx, loc=loc, ip=ip) + return _cute_ir.tiled_copy_partition_D( + self._trait.value, dst.value, thr_idx, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_copy_atom( + op: CopyOp, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs +) -> CopyAtom: + """ + Makes a Copy Atom from a Copy Operation. + + This function creates a Copy Atom from a given Copy Operation. Arbitrary kw arguments can be + provided for Op-specific additional parameters. + + Example: + + .. 
code-block:: python + + op = cute.nvgpu.CopyUniversalOp() + atom = cute.make_copy_atom(op, tensor_dtype, num_bits_per_copy=64) + + :param op: The Copy Operation to construct an Atom for + :type op: CopyOp + :param copy_internal_type: An internal data type used to construct the source/destination layouts in unit of tensor elements + :type copy_internal_type: Type[Numeric] + :return: The Copy Atom + :rtype: CopyAtom + """ + trait = op._make_trait(copy_internal_type, loc=loc, ip=ip, **kwargs) + return CopyAtom(op, trait) + + +@dsl_user_op +def make_layout_tv( + thr_layout: Layout, val_layout: Layout, *, loc=None, ip=None +) -> Tuple[Shape, Layout]: + """ + Create a tiled copy given separate thr and val layouts. A TV partitioner is inferred based on inputs. + Requires input thr layout be compact. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + thr_layout : mn -> tid (need to be compact?) + val_layout : mn -> vid + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + layout_mn + logical tile size + layout_tv + thread-value layout (tid, vid) -> mn + """ + + # Take the raked_products to compute the Layout_MN + # (M,N) -> (thr_idx, val_idx) + layout_mn = raked_product(thr_layout, val_layout, loc=loc, ip=ip) + thr_size = size(thr_layout, loc=loc, ip=ip) + val_size = size(val_layout, loc=loc, ip=ip) + tmp = make_layout((thr_size, val_size), loc=loc, ip=ip) + # (thr_idx, val_idx) -> (M,N) + layout_tv = composition( + right_inverse(layout_mn, loc=loc, ip=ip), tmp, loc=loc, ip=ip + ) + + tiler_mn = product_each(layout_mn.shape, loc=loc, ip=ip) + + return (tiler_mn, layout_tv) + + +@dsl_user_op +def make_tiled_copy_tv(atom, thr_layout, val_layout, *, loc=None, ip=None) -> TiledCopy: + """ + Create a tiled copy given separate thr and val layouts. A TV partitioner is inferred based on inputs. + Requires input thr layout be compact. 
+ + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + thr_layout : mn -> tid (need to be compact?) + val_layout : mn -> vid + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tiled copy for partitioner + """ + + tiler_mn, layout_tv = make_layout_tv(thr_layout, val_layout, loc=loc, ip=ip) + tiler_mn = _pack_tile(product_each(tiler_mn, loc=loc, ip=ip), loc=loc, ip=ip) + if not is_static(layout_tv.type) or not is_static(tiler_mn.type): + raise ValueError( + f"expects layout tv and tiler mn, but got {layout_tv.type} and {tiler_mn.type}" + ) + tiled_copy_ty = _cute_nvgpu_ir.TiledCopyType.get( + atom.type, layout_tv.type, tiler_mn.type + ) + val = _cute_ir.make_tiled_copy(tiled_copy_ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledCopy(atom.op, trait) + + +@dsl_user_op +def make_tiled_copy(atom, layout_tv, tiler_mn, *, loc=None, ip=None): + """ + Create a tiled type given a TV partitioner and tiler + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + layout_tv : thread-value layout. + tiler_mn : tile size (??) 
+ loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + # tiler_mn = pack_tuple(tiler_mn, make_tile) + if type(tiler_mn) is tuple: + tiler_mn = _pack_tile(tiler_mn, loc=loc, ip=ip) + + assert is_static(layout_tv.type) and is_static( + tiler_mn.type + ), "layout tv and tiler mn must be static" + tiled_copy_ty = _cute_nvgpu_ir.TiledCopyType.get( + atom.type, layout_tv.type, tiler_mn.type + ) + val = _cute_ir.make_tiled_copy(tiled_copy_ty, atom._trait.value, loc=loc, ip=ip) + # Instead of modifying atom which might have been provided by the user, create a brand new + # trait instance and replace the Atom ir.Value with the tiled one + trait = new_from_mlir_values(atom._trait, [val]) + return TiledCopy(atom.op, trait) + + +@dsl_user_op +def make_tiled_copy_S(atom, tiled_copy, *, loc=None, ip=None): + """ + Create a tiled type out of the copy_atom that matches the Src-Layout of tiled_copy. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. + tiled_copy : tiled copy + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + return make_tiled_copy( + atom, tiled_copy.layout_src_tv_tiled, tiled_copy.tiler_mn, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_tiled_copy_D(atom, tiled_copy, *, loc=None, ip=None): + """ + Create a tiled type out of the copy_atom that matches the Dst-Layout of tiled_copy. + + Parameters + ---------- + atom : copy atom, e.g. smit_copy and simt_async_copy, tma_load, etc. 
+ tiled_copy : tiled copy + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tuple of A tiled copy and atom + """ + + return make_tiled_copy( + atom, tiled_copy.layout_dst_tv_tiled, tiled_copy.tiler_mn, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_tiled_copy_C_atom(atom: CopyAtom, mma: TiledMma, *, loc=None, ip=None): + """ + Create the smallest tiled copy that can retile LayoutC_TV + for use with pipelined epilogues with subtiled stores + + Parameters + ---------- + atom: CopyAtom + mma : TiledMma + loc : source location for mlir (optional) + ip : insertion point (optional) + + Returns + ------- + tiled_copy + A tiled copy for partitioner + """ + # Truncate the V-layout to just the Copy_Atom, keep the V-order + layoutC_tv = mma.tv_layout_C_tiled + val_layout_src = atom.layout_src_tv + num_val_src = size(val_layout_src, mode=[1], loc=loc, ip=ip) + num_val_layoutC_tv = size(layoutC_tv, mode=[1], loc=loc, ip=ip) + if num_val_src > num_val_layoutC_tv: + raise ValueError( + f"The number value of CopyAtom's source layout {num_val_src} " + f"is greater than the size of TiledMma's LayoutC_TV {num_val_layoutC_tv}" + ) + layout_TV = composition( + layoutC_tv, + make_layout( + (size(layoutC_tv, mode=[0], loc=loc, ip=ip), num_val_src), loc=loc, ip=ip + ), + loc=loc, + ip=ip, + ) + + # Recompute tiler and restride the TV layout for the new tiler + + # Tiler -- Find the active elements in the MMA tensor and generate a tiler to extract them + # Convert to the awkward by-mode tiler to preserve the modes of the tiled MMA + mma_tiler = (mma.get_tile_size(0), mma.get_tile_size(1)) + + tiler_0 = filter( + composition( + make_layout(mma_tiler, stride=(1, 0), loc=loc, ip=ip), + layout_TV, + loc=loc, + ip=ip, + ), + loc=loc, + ip=ip, + ) + tiler_1 = filter( + composition( + make_layout(mma_tiler, stride=(0, 1), loc=loc, ip=ip), + layout_TV, + loc=loc, + ip=ip, + ), + loc=loc, + ip=ip, + ) + tiler = (tiler_0, tiler_1) + 
+ tile2mma = composition( + make_layout(mma_tiler, loc=loc, ip=ip), tiler, loc=loc, ip=ip + ) + layout_tv = composition( + left_inverse(tile2mma, loc=loc, ip=ip), layout_TV, loc=loc, ip=ip + ) + + tiler_mn = _pack_tile(tiler, loc=loc, ip=ip) + + return make_tiled_copy(atom, layout_tv, tiler_mn, loc=loc, ip=ip) + + +#################################################################################################### +# +# cute.gemm and cute.copy +# +#################################################################################################### + + +@dsl_user_op +def gemm( + atom: MmaAtom, + d: Tensor, + a: Tensor, + b: Tensor, + c: Tensor, + *, + loc=None, + ip=None, + **kwargs, +) -> None: + """ + The GEMM algorithm. + + Computes ``D <- AB + C`` where ``C`` and ``D`` can alias. Note that some MMA Atoms (e.g. + warpgroup-wide or tcgen05 MMAs) require manually setting an "accumulate" boolean field. + + All tensors must be partitioned according to the provided MMA Atom. + """ + value = atom._unpack(loc=loc, ip=ip, **kwargs) + return _cute_ir.gemm(value, d.value, a.value, b.value, c.value, loc=loc, ip=ip) + + +@dsl_user_op +def basic_copy(src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Performs a basic element-wise copy. + + This functions **assumes** the following pre-conditions: + 1. `size(src) == size(dst)` + + When the `src` and `dst` shapes are static, the pre-conditions are actually verified and the + element-wise loop is fully unrolled. 
+ """ + if is_static(src.shape) and is_static(dst.shape): + simt_copy_ty = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + src.element_type.mlir_type, src.element_type.width + ) + simt_copy = _cute_ir.atom(simt_copy_ty, loc=loc, ip=ip) + return _cute_ir.copy(simt_copy, src.value, dst.value, loc=loc, ip=ip) + + s = size(dst, loc=loc, ip=ip) + # Always generate an scf.for Op when one of the tensors is dynamic + for i in for_generate(0, s): + dst[i] = src[i] + yield_out() + + +@dsl_user_op +def basic_copy_if(pred: Tensor, src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Performs a basic predicated element-wise copy. + + This functions **assumes** the following pre-conditions: + 1. `size(src) == size(dst)` + 2. `size(src) == size(pred)` + + When all shapes are static, the pre-conditions are actually verified and the element-wise loop + is fully unrolled. + """ + if src.element_type.width != dst.element_type.width: + raise NotImplementedError( + "basic_copy_if currently only supports equal source and destination " + "element type bit width" + ) + + if is_static(src.shape) and is_static(dst.shape) and is_static(pred.shape): + return _basic_copy_if_static(pred, src, dst, loc=loc, ip=ip) + + s = size(dst, loc=loc, ip=ip) + # Always generate an scf.for Op when one of the tensors is dynamic + for i in for_generate(0, s): + if_generate(pred[i], lambda: dst.__setitem__(i, src[i])) + yield_out() + + +# Version of basic_copy_if when src and dst have static shapes +# - verify size(src) == size(dst) == size(prd) +# - fully unroll the loop for now +def _basic_copy_if_static( + pred: Tensor, src: Tensor, dst: Tensor, *, loc=None, ip=None +) -> None: + assert is_static(src.shape) and is_static(dst.shape) and is_static(pred.shape) + if size(src, loc=loc, ip=ip) != size(dst, loc=loc, ip=ip): + raise ValueError( + "basic_copy expects the size of source, destination, and predicate tensors to match" + ) + # Fully unrolled loop in the static case for now + for i in 
range(size(dst, loc=loc, ip=ip)): + if_generate(pred[i], lambda: dst.__setitem__(i, src[i])) + + +@dsl_user_op +def autovec_copy(src: Tensor, dst: Tensor, *, loc=None, ip=None) -> None: + """ + Auto-vectorizing SIMT copy policy. + + Given a source and destination tensors that are statically shaped, this policy figures out the + largest safe vector width that the copy instruction can take and performs the copy. + """ + if src.element_type.width != dst.element_type.width: + raise NotImplementedError( + "autovec_copy currently only supports equal source and destination " + "element type bit width" + ) + + # We are going to dispatch to copy-with-atom which requires shapes to be static + if not is_static(src.shape) or not is_static(dst.shape): + raise ValueError( + "autovec_copy expects source and destination tensors to be statically shaped" + ) + + vec_layout = max_common_layout(src, dst, loc=loc, ip=ip) + num_common_elements = size(vec_layout, loc=loc, ip=ip) + + # Next we construct an upper-bound on the number bits that can be vectorized by considering + # - the maximum alignment of the layouts + # - the maximum alignment of the pointers + + upper_bound = math.gcd(src.layout.max_alignment, dst.layout.max_alignment) + upper_bound = math.gcd(upper_bound, num_common_elements) + upper_bound *= src.element_type.width + + # For our instructions, the alignment of the pointer is an upper bound to the vector width + # max_alignment, as opposed to alignment, takes into account possible address swizzling + upper_bound = math.gcd(upper_bound, src.iterator.max_alignment * 8) + upper_bound = math.gcd(upper_bound, dst.iterator.max_alignment * 8) + + # Finally, we put a cap at 128b + num_bits_per_copy = math.gcd(upper_bound, 128) + + if (num_common_elements > 1) and (num_bits_per_copy % 8 == 0): + num_common_elements = num_bits_per_copy // src.element_type.width + + # 2 step logical divides ensuring that the divides are valid at every step + vec_src = logical_divide(src, vec_layout, 
loc=loc, ip=ip) + vec_dst = logical_divide(dst, vec_layout, loc=loc, ip=ip) + tiled_src = logical_divide( + vec_src, make_layout(num_common_elements, loc=loc, ip=ip), loc=loc, ip=ip + ) + tiled_dst = logical_divide( + vec_dst, make_layout(num_common_elements, loc=loc, ip=ip), loc=loc, ip=ip + ) + + # Dispatch to copy with atom + simt_type = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + src.element_type.mlir_type, num_bits_per_copy + ) + simt_copy = _cute_ir.atom(simt_type, loc=loc, ip=ip) + return _cute_ir.copy( + simt_copy, tiled_src.value, tiled_dst.value, loc=loc, ip=ip + ) + + # Failed to vectorize, use a basic copy + basic_copy(src, dst, loc=loc, ip=ip) + + +@dsl_user_op +def copy( + atom: CopyAtom, + src: Tensor, + dst: Tensor, + *, + pred: Tensor = None, + loc=None, + ip=None, + **kwargs, +) -> None: + """ + The Copy algorithm. + + The "copy with Atom" expects source and destination tensors to be partitioned according to the + provided Copy Atom. Some Atoms require additional Op-specific kw arguments, for example TMA + copies: + + .. code-block:: python + + cute.copy(tma_atom, src, dst, tma_bar_ptr=mbar_ptr, mcast_mask=mask) + + An additional predication tensor can be provided. If the partitioned tensors have the following + logical profile ``((ATOM_V,ATOM_REST),REST_M,...)``, the predication tensor must have a profile + consistent with ``(ATOM_REST,REST_M,...)``. 
+ """ + if isinstance(src.type, _cute_ir.MemRefType) and isinstance( + dst.type, _cute_ir.MemRefType + ): + if src.element_type.width != dst.element_type.width: + raise TypeError( + "`copy` currently only supports equal source and destination " + "element type bit width" + ) + + value = atom._unpack(loc=loc, ip=ip, **kwargs) + if isinstance(pred, Tensor): + pred = pred.value + return _cute_ir.copy(value, src.value, dst.value, pred=pred, loc=loc, ip=ip) + + +#################################################################################################### +# +# TensorSSA class (experimental) +# +#################################################################################################### + + +class ReductionOp(Enum): + ADD = auto() + MUL = auto() + MAX = auto() + MIN = auto() + + def __str__(self): + return self.name.lower() + + +class TensorSSA(cutlass_arith.ArithValue): + """A class representing thread local data from CuTe Tensor in value semantic and immutable. + + :param value: Flatten vector as ir.Value holding logic data of SSA Tensor + :type value: ir.Value + :param shape: The nested shape in CuTe of the vector + :type shape: Shape + :param dtype: Data type of the tensor elements + :type dtype: Type[Numeric] + + :ivar _shape: The nested shape in CuTe of the vector + :ivar _dtype: Data type of the tensor elements + + :raises ValueError: If shape is not static + """ + + def __init__(self, value, shape: Shape, dtype: Type[Numeric]): + """Initialize a new TensorSSA object. 
+ + :param value: Flatten vector as ir.Value holding logic data of SSA Tensor + :type value: ir.Value + :param shape: The nested shape in CuTe of the vector + :type shape: Shape + :param dtype: Data type of the tensor elements + :type dtype: Type[Numeric] + :raises ValueError: If shape is not static + """ + if not is_static(shape): + raise ValueError("dynamic shape is not supported") + + signed = dtype.signed if issubclass(dtype, Integer) else False + super().__init__(value, signed) + + self._shape = shape + self._dtype = dtype + + @property + def dtype(self) -> Type[Numeric]: + return self._dtype + + @property + def element_type(self) -> Type[Numeric]: + return self._dtype + + @abstractmethod + def __extract_mlir_values__(self): + return [self] + + @abstractmethod + def __new_from_mlir_values__(self, values): + return TensorSSA(values[0], self.shape, self.dtype) + + def __str__(self): + return f"tensor_value<{self.type} o {self.shape}>" + + @property + def shape(self): + return self._shape + + @overload + def _apply_op(self, op, other: "TensorSSA", flip, *, loc, ip) -> "TensorSSA": ... + + @overload + def _apply_op( + self, op, other: cutlass_arith.ArithValue, flip, *, loc, ip + ) -> "TensorSSA": ... + + @overload + def _apply_op( + self, op, other: Union[int, float, bool], flip, *, loc, ip + ) -> "TensorSSA": ... 
+ + def _apply_op(self, op, other, flip=False, *, loc=None, ip=None): + def get_attr_for_type(ty, value): + if isinstance(ty, ir.IntegerType): + return ir.IntegerAttr.get(ty, value) + elif isinstance(ty, ir.FloatType): + return ir.FloatAttr.get(ty, value) + else: + raise TypeError(f"unsupported type: {ty}") + + # Canonicalize into Numeric + if isinstance(other, (int, float, bool)) or ( + not isinstance(other, TensorSSA) + and isinstance(other, cutlass_arith.ArithValue) + ): + other = as_numeric(other) + + # Promote types + lhs, rhs, res_type = _binary_op_type_promote(self, other, True) + + # Promote scalar to vector + if not isinstance(rhs, TensorSSA): + if isinstance(rhs, Numeric): + vect_val = vector.broadcast(lhs.type, rhs.ir_value(loc=loc, ip=ip)) + else: + elem_attr = get_attr_for_type(lhs.type.element_type, rhs) + vect_attr = ir.DenseElementsAttr.get_splat(lhs.type, elem_attr) + vect_val = arith.constant(lhs.type, vect_attr, loc=loc, ip=ip) + rhs = TensorSSA(vect_val, lhs.shape, lhs.dtype) + + if flip: + lhs, rhs = rhs, lhs + + if op in ( + operator.lt, + operator.le, + operator.gt, + operator.ge, + operator.eq, + operator.ne, + ): + res_type = Boolean + + if lhs.shape != rhs.shape: + raise ValueError( + f"lhs and rhs must have the same shape type, but got {lhs.shape} and {rhs.shape}" + ) + + if not isinstance(rhs, TensorSSA): + raise TypeError(f"rhs must be TensorSSA but got {rhs}") + + if ( + op in (operator.add, operator.sub) + and lhs.dtype == Boolean + and rhs.dtype == Boolean + ): + res = op(lhs.to(Int32), rhs.to(Int32)) + zero = zeros_like(res) + res = res.__ne__(zero).to(res_type) + else: + lhs_val = lhs.maybe_downcast() + rhs_val = rhs.maybe_downcast() + + if issubclass(lhs.dtype, Integer): + lhs_val = lhs_val.with_signedness(lhs.dtype.signed) + + if issubclass(rhs.dtype, Integer): + rhs_val = rhs_val.with_signedness(rhs.dtype.signed) + + res_vect = op(lhs_val, rhs_val) + res = TensorSSA(res_vect, lhs._shape, res_type) + + return res + + def 
__pow__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the results of tensor^other. + + :param other: The other tensor for exponent. + :type other: TensorSSA + :return: The power of the tensor. + :rtype: TensorSSA + """ + return self._apply_op(operator.pow, other, loc=loc, ip=ip) + + def __rpow__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the results of other^tensor. + + :param other: The other tensor to compute power with. + :type other: TensorSSA + :return: The element-wise power of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.pow, other, flip=True, loc=loc, ip=ip) + + def __add__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the sum of the tensor and another tensor. + + :param other: The other tensor to add. + :type other: TensorSSA + :return: The sum of the two tensors with the same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.add, other, loc=loc, ip=ip) + + def __radd__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the sum of the tensor and another tensor (reverse add) + + :param other: The other tensor to add. + :type other: TensorSSA + :return: The sum of the two tensors with the same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.add, other, flip=True, loc=loc, ip=ip) + + def __sub__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the difference of the tensor and another tensor. + + :param other: The other tensor to subtract. + :type other: TensorSSA + :return: The subtraction of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.sub, other, loc=loc, ip=ip) + + def __rsub__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the difference of the tensor and another tensor (reverse subtract) + + :param other: The other tensor to subtract. 
+ :type other: TensorSSA + :return: The subtraction of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.sub, other, flip=True, loc=loc, ip=ip) + + def __mul__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the multiplication of the tensor and another tensor. + + :param other: The other tensor to multiply. + :type other: TensorSSA + :return: The multiplication of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mul, other, loc=loc, ip=ip) + + def __rmul__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the multiplication of the tensor and another tensor (reverse multiply) + + :param other: The other tensor to multiply. + :type other: TensorSSA + :return: The multiplication of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mul, other, flip=True, loc=loc, ip=ip) + + def __mod__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the modulo of the tensor and another tensor. + + :param other: The other tensor to compute modulo with. + :type other: TensorSSA + :return: The element-wise modulo of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mod, other, loc=loc, ip=ip) + + def __rmod__(self, other) -> "TensorSSA": + """ + Returns the modulo of the tensor and another tensor (reverse modulo) + + :param other: The other tensor to compute modulo with. + :type other: TensorSSA + :return: The element-wise modulo of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.mod, other, flip=True) + + def __floordiv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the floordiv(//) of the tensor and another tensor. + + :param other: The other tensor to compute floordiv with. + :type other: TensorSSA + :return: The floordiv of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.floordiv, other, loc=loc, ip=ip) + + def __rfloordiv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the floordiv(//) of the tensor and another tensor (reverse floordiv) + + :param other: The other tensor to compute floordiv with. + :type other: TensorSSA + :return: The floordiv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.floordiv, other, flip=True, loc=loc, ip=ip) + + def __truediv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the truediv(/) of the tensor and another tensor. + + :param other: The other tensor to compute truediv with. + :type other: TensorSSA + :return: The truediv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.truediv, other, loc=loc, ip=ip) + + def __rtruediv__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the truediv(/) of the tensor and another tensor (reverse truediv) + + :param other: The other tensor to compute truediv with. + :type other: TensorSSA + :return: The truediv of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.truediv, other, flip=True, loc=loc, ip=ip) + + def __eq__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the comparison of the tensor and another tensor as mask + + :param other: The other tensor to compare. + :type other: TensorSSA + :return: The comparison of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.eq, other, loc=loc, ip=ip) + + def __ne__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise not equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self != other. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.ne, other, loc=loc, ip=ip) + + def __lt__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise less than comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self < other. + :rtype: TensorSSA + """ + return self._apply_op(operator.lt, other, loc=loc, ip=ip) + + def __le__(self, other) -> "TensorSSA": + """ + Returns the element-wise less than or equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self <= other. + :rtype: TensorSSA + """ + return self._apply_op(operator.le, other) + + def __gt__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise greater than comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self > other. + :rtype: TensorSSA + """ + return self._apply_op(operator.gt, other) + + def __ge__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise greater than or equal comparison of the tensor and another tensor. + + :param other: The other tensor to compare with. + :type other: TensorSSA + :return: A boolean tensor with same shape as inputs, True where self >= other. + :rtype: TensorSSA + """ + return self._apply_op(operator.ge, other, loc=loc, ip=ip) + + def __xor__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise XOR of the tensor and another tensor. + + :param other: The other tensor to perform XOR with. + :type other: TensorSSA + :return: The element-wise XOR of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.xor, other) + + def __rxor__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the bitwise XOR of the tensor and another tensor. + + :param other: The other tensor to compute XOR with. + :type other: TensorSSA + :return: The element-wise bitwise XOR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.xor, other, flip=True, loc=loc, ip=ip) + + def __or__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise OR of the tensor and another tensor. + + :param other: The other tensor to perform OR with. + :type other: TensorSSA + :return: The element-wise OR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.or_, other) + + def __ror__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise OR of the tensor and another tensor. + + :param other: The other tensor to perform OR with. + :type other: TensorSSA + :return: The element-wise OR of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.or_, other, flip=True) + + def __and__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise AND of the tensor and another tensor. + + :param other: The other tensor to perform AND with. + :type other: TensorSSA + :return: The element-wise AND of two tensors with same shape as inputs. + :rtype: TensorSSA + """ + return self._apply_op(operator.and_, other) + + def __rand__(self, other, *, loc=None, ip=None) -> "TensorSSA": + """ + Returns the element-wise AND of the tensor and another tensor. + + :param other: The other tensor to perform AND with. + :type other: TensorSSA + :return: The element-wise AND of two tensors with same shape as inputs. 
+ :rtype: TensorSSA + """ + return self._apply_op(operator.and_, other, flip=True, loc=loc, ip=ip) + + def _flatten_shape_and_coord(self, crd, *, loc=None, ip=None): + # Coalesce and flatten source layout at terminal of coordinate + # (N_0,(N_1,...), ...) -> (N_0,N_1,N_2,...) + crd_shp = product_like(self._shape, target_profile=crd, loc=loc, ip=ip) + + # Flatten coordinate + flat_shp = flatten(crd_shp) + assert isinstance(flat_shp, tuple) and is_static(flat_shp) + # (C_0,(C_1,...), ...) -> (C_0,C_1,C_2,...) + flat_crd = flatten(crd) + + assert isinstance(flat_crd, tuple) and is_static(flat_crd) + return flat_shp, flat_crd + + def _build_result(self, res_vect, res_shp, *, loc=None, ip=None): + if isinstance(res_shp, ir.Value): + raise ValueError( + f"expects static shape and coordinates, but got {self._shape} and {crd}" + ) + + # cast back to 1D vector + res_1d_ty = ir.VectorType.get([size(res_shp)], self.type.element_type) + res_1d_vect = vector.shape_cast(res_1d_ty, res_vect, loc=loc, ip=ip) + return TensorSSA(res_1d_vect, res_shp, self.dtype) + + @dsl_user_op + def __getitem__( + self, crd: Coord, *, loc=None, ip=None + ) -> Union["TensorSSA", Numeric]: + """Access or slice tensor elements using coordinates. + + This method implements tensor evaluation T(c) = *(E + L(c)) where E is the iterator/engine + and L is the layout. It supports both direct element access and slicing operations. + + :param crd: Coordinate or slice specification for accessing tensor elements + :type crd: Coord + :param loc: Source location for MLIR operation tracking, defaults to None + :type loc: Optional[Location] + :param ip: Insertion point for MLIR operation, defaults to None + :type ip: Optional[InsertionPoint] + :return: Tensor element value or sliced subtensor + :rtype: Union[TensorSSA, Numeric] + + :raises ValueError: If coordinate access is invalid for the tensor layout + + Examples: + + .. 
code-block:: python + + # Create a fragment from rmem as shape (8, 4) + layout = make_layout((8, 4)) + tensor = make_fragment(layout, Float32) + frg = tensor.load() + + # Direct element access + val = frg[0] # Returns first element of fragment + val = frg[(0, 1)] # Returns element at (0, 1) + + # Slice access + sliced = frg[(3, None)] # Returns fragment slice + """ + # short-cut to no-op + if crd is None: + return self + + if not has_underscore(crd) or depth(crd) == 0: + idx = crd2idx(crd, make_layout(self._shape)) + if is_static(idx): + res = vector.extract( + self, dynamic_position=[], static_position=[idx], loc=loc, ip=ip + ) + else: + res = vector.extract( + self, dynamic_position=[crd], static_position=[], loc=loc, ip=ip + ) + return self.dtype(res) + + if not is_static(crd): + raise ValueError("dynamic coordinate is not supported") + + flat_shp, flat_crd = self._flatten_shape_and_coord(crd) + + multi_dim_ty = ir.VectorType.get(list(flat_shp), self.type.element_type) + # vector -> vector + tmp_vect = vector.shape_cast(multi_dim_ty, self) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self._shape, crd) + if isinstance(res_shp, ir.Value): + raise TypeError( + f"expects static shape and coordinates, but got {self._shape} and {crd}" + ) + + # Offsets is index of coordinates if NOT `_` otherwise 0 + offsets = [c if c is not None else 0 for c in flat_crd] + # Sizes is size of shapes if `_` otherwise 1 + sizes = [s if c is None else 1 for s, c in zip(flat_shp, flat_crd)] + # Logic stride to index vector. 
Only support stride-1 by vector + strides = [1] * rank(flat_shp) + + # Vector slice on N-D vector + res_ty = ir.VectorType.get(list(sizes), self.type.element_type) + res_vect = vector.extract_strided_slice( + res_ty, tmp_vect, offsets=offsets, sizes=sizes, strides=strides + ) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self._shape, crd) + return self._build_result(res_vect, res_shp, loc=loc, ip=ip) + + @dsl_user_op + def to(self, dtype: Type[Numeric], *, loc=None, ip=None): + """Convert the tensor to a different numeric type. + + :param dtype: The target numeric type to cast to. + :type dtype: Type[Numeric] + :return: A new tensor with the same shape but with elements cast to the target type. + :rtype: TensorSSA + :raises TypeError: If dtype is not a subclass of Numeric. + :raises NotImplementedError: If dtype is an unsigned integer type. + """ + if dtype is ir.Value: + return self + + if not isclass(dtype) or not issubclass(dtype, Numeric): + raise TypeError(f"dtype must be a type of Numeric, but got {type(dtype)}") + + src_dtype = self.dtype + if src_dtype == dtype: + return self + + # maybe downcast can lose signedness + src = self.maybe_downcast().with_signedness(self.signed) + if src_dtype.is_float and dtype.is_float: + res_vect = cutlass_arith.cvtf(src, dtype.mlir_type, loc=loc, ip=ip) + elif src_dtype.is_float and issubclass(dtype, Integer): + res_vect = cutlass_arith.fptoi( + src, dtype.signed, dtype.mlir_type, loc=loc, ip=ip + ) + elif issubclass(src_dtype, Integer) and dtype.is_float: + res_vect = cutlass_arith.itofp( + src, src_dtype.signed, dtype.mlir_type, loc=loc, ip=ip + ) + else: + res_vect = cutlass_arith.int_to_int(src, dtype, loc=loc, ip=ip) + + return TensorSSA(res_vect, self._shape, dtype) + + def ir_value(self, *, loc=None, ip=None): + return self + + def reduce(self, op, init_val, reduction_profile: Coord, *, loc=None, ip=None): + """ + Perform reduce on selected modes with given predefined reduction op. 
+ + :param op: The reduction operator to use (operator.add or operator.mul) + :type op: operator + :param init_val: The initial value for the reduction + :type init_val: numeric + :param reduction_profile: Specifies which dimensions to reduce. Dimensions marked with '_' are kept. + :type reduction_profile: Coord + + :return: The reduced tensor + :rtype: TensorSSA + + Examples: + reduce(f32 o (4,)) + => f32 + reduce(f32 o (4, 5)) + => f32 + reduce(f32 o (4, (5, 4)), reduction_profile=(_, 1)) + => f32 o (4,) + reduce(f32 o (4, (5, 4)), reduction_profile=(_, (_, 1))) + => f32 o (4, (5,)) + """ + # short-cut to no-op + if reduction_profile is None: + return self + + if op is ReductionOp.ADD: + red_kind = vector.CombiningKind.ADD + elif op is ReductionOp.MUL: + red_kind = vector.CombiningKind.MUL + elif op is ReductionOp.MAX: + red_kind = vector.CombiningKind.MAXIMUMF + elif op is ReductionOp.MIN: + red_kind = vector.CombiningKind.MINIMUMF + else: + raise NotImplementedError( + f"{op} is not supported, expects one of " + f"{ReductionOp.ADD, ReductionOp.MUL, ReductionOp.MAX, ReductionOp.MIN}" + ) + + elem_ty = self.element_type + # Canonicalize to `Numeric` and convert into MLIR value + init_val = as_numeric(init_val).ir_value(loc=loc, ip=ip) + + if depth(reduction_profile) == 0: + return vector.reduction( + elem_ty.mlir_type, red_kind, self, acc=init_val, loc=loc, ip=ip + ) + + flat_shp, flat_prof = self._flatten_shape_and_coord( + reduction_profile, loc=loc, ip=ip + ) + assert depth(flat_shp) == 1 and depth(flat_prof) == 1 + assert rank(flat_shp) == rank(flat_prof) + + temp_ty = ir.VectorType.get(list(flat_shp), elem_ty.mlir_type) + temp_vect = vector.shape_cast(temp_ty, self, loc=loc, ip=ip) + + if isinstance(flat_prof, tuple): + red_dims = [i for i, x in enumerate(flat_prof) if x is not None] + else: + red_dims = [0] + + temp_acc_shp = slice_(flat_shp, flat_prof, loc=loc, ip=ip) + temp_acc_ty = ir.VectorType.get(list(temp_acc_shp), elem_ty.mlir_type) + + init_val = 
vector.broadcast(temp_acc_ty, init_val, loc=loc, ip=ip) + res_vect = vector.multi_reduction( + red_kind, temp_vect, acc=init_val, reduction_dims=red_dims, loc=loc, ip=ip + ) + + # Slice and keep dims matching `_` or None + res_shp = slice_(self.shape, reduction_profile, loc=loc, ip=ip) + return self._build_result(res_vect, res_shp, loc=loc, ip=ip) + + +def _get_attr_for_type(ty, value): + if isinstance(ty, ir.IntegerType): + return ir.IntegerAttr.get(ty, value.to(int)) + elif isinstance(ty, ir.FloatType): + return ir.FloatAttr.get(ty, value.to(float)) + else: + raise TypeError(f"unsupported type: {ty}") + + +def _splat(res_ty, fill_value): + elem_attr = _get_attr_for_type(res_ty.element_type, fill_value) + vect_attr = ir.DenseElementsAttr.get_splat(res_ty, elem_attr) + return arith.constant(res_ty, vect_attr) + + +@dsl_user_op +def full(shape, fill_value, dtype: Type[Numeric], *, loc=None, ip=None) -> TensorSSA: + """ + Return a new TensorSSA of given shape and type, filled with fill_value. + + :param shape: Shape of the new tensor. + :type shape: tuple + :param fill_value: Value to fill the tensor with. + :type fill_value: scalar + :param dtype: Data type of the tensor. + :type dtype: Type[Numeric] + :return: Tensor of fill_value with the specified shape and dtype. + :rtype: TensorSSA + """ + size = product(shape, loc=loc, ip=ip) + if not is_static(size): + raise ValueError("shape must be static") + + if isinstance(fill_value, (ir.Value, int, float, bool)): + fill_value = dtype(fill_value) + + res_mlir_type = T.vector(size, dtype.mlir_type) + return TensorSSA(_splat(res_mlir_type, fill_value), shape, dtype) + + +def full_like( + a: TensorSSA, + fill_value, + dtype: Union[None, Type[Numeric]] = None, + *, + loc=None, + ip=None, +) -> TensorSSA: + """ + Return a full TensorSSA with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. 
+ :type a: array_like + :param fill_value: Fill value. + :type fill_value: array_like + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Union[None, Type[Numeric]], optional + :return: Tensor of `fill_value` with the same shape and type as `a`. + :rtype: TensorSSA + + .. seealso:: + :func:`empty_like`: Return an empty array with shape and type of input. + :func:`ones_like`: Return an array of ones with shape and type of input. + :func:`zeros_like`: Return an array of zeros with shape and type of input. + :func:`full`: Return a new array of given shape filled with value. + + Examples + -------- + .. code-block:: python + + frg = cute.make_fragment(Float32, (2, 3)) + a = frg.load() + b = cute.full_like(a, 1.0) + """ + return full( + a.shape, fill_value, dtype if dtype is not None else a.dtype, loc=loc, ip=ip + ) + + +def empty_like(a, dtype=None): + """ + Return a new TensorSSA with the same shape and type as a given array, without initializing entries. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Uninitialized tensor with the same shape and type (unless overridden) as `a`. + :rtype: TensorSSA + """ + return full_like(a, 0, dtype) + + +def ones_like(a, dtype=None): + """ + Return a TensorSSA of ones with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Tensor of ones with the same shape and type (unless overridden) as `a`. 
+ :rtype: TensorSSA + """ + return full_like(a, 1, dtype) + + +def zeros_like(a, dtype=None, *, loc=None, ip=None): + """ + Return a TensorSSA of zeros with the same shape and type as a given array. + + :param a: The shape and data-type of `a` define these same attributes of the returned array. + :type a: TensorSSA + :param dtype: Overrides the data type of the result, defaults to None + :type dtype: Type[Numeric], optional + :return: Tensor of zeros with the same shape and type (unless overridden) as `a`. + :rtype: TensorSSA + """ + return full_like(a, 0, dtype, loc=loc, ip=ip) + + +def where( + cond: TensorSSA, x: TensorSSA, y: TensorSSA, *, loc=None, ip=None +) -> TensorSSA: + """ + Return elements chosen from x or y depending on condition. + + :param cond: Where True, yield x, where False, yield y. + :type cond: TensorSSA + :param x: Values from which to choose when condition is True. + :type x: TensorSSA + :param y: Values from which to choose when condition is False. + :type y: TensorSSA + :return: A tensor with elements from x where condition is True, and elements from y where condition is False. + :rtype: TensorSSA + """ + if x.dtype != y.dtype: + raise ValueError( + f"x and y must have the same dtype, but got {x.dtype} and {y.dtype}" + ) + + if cond.dtype != Boolean: + raise ValueError(f"cond must be Boolean type, but got {cond.dtype}") + + return TensorSSA( + arith.select(cond.ir_value(), x, y, loc=loc, ip=ip), x.shape, x.dtype + ) + + +def any_(x: TensorSSA, *, loc=None, ip=None) -> Boolean: + """ + Test whether any tensor element evaluates to True. + + :param x: Input tensor. + :type x: TensorSSA + :return: Returns a TensorSSA scalar containing True if any element of x is True, False otherwise. 
+ :rtype: TensorSSA + """ + is_true = x != full_like(x, 0, x.dtype, loc=loc, ip=ip) + return Boolean( + vector.reduction(T.bool(), vector.CombiningKind.OR, is_true, loc=loc, ip=ip) + ) + + +def all_(x: TensorSSA, *, loc=None, ip=None) -> Boolean: + """ + Test whether all tensor elements evaluate to True. + + :param x: Input tensor. + :type x: TensorSSA + :return: Returns a TensorSSA scalar containing True if all elements of x are True, False otherwise. + :rtype: TensorSSA + """ + is_true = x != full_like(x, 0, x.dtype, loc=loc, ip=ip) + return Boolean( + vector.reduction(T.bool(), vector.CombiningKind.AND, is_true, loc=loc, ip=ip) + ) + + +############################################################################## +# User defined struct +############################################################################## + + +class struct: + """ + Decorator to abstract C structure in Python DSL. + + Usage: + .. code-block:: + + # Supports base_dsl scalar int/float elements, array and nested struct: + @cute.struct + class complex: + real : cutlass.Float32 + imag : cutlass.Float32 + @cute.struct + class StorageA: + mbarA : cute.struct.MemRange[cutlass.Int64, stage] + compA : complex + intA : cutlass.Int16 + + # Supports aligment for its elements: + @cute.struct + class StorageB: + a: cute.struct.Align[ + cute.struct.MemRange[cutlass.Float32, size_a], 1024 + ] + b: cute.struct.Align[ + cute.struct.MemRange[cutlass.Float32, size_b], 1024 + ] + x: cute.struct.Align[cutlass.Int32, 16] + compA: cute.struct.Align[complex, 16] + + # Statically get size and alignment: + size = StorageB.__sizeof__() + align = StorageB.__alignof__() + + # Allocate and referencing elements: + storage = allocator.allocate(StorageB) + + storage.a[0] ... + storage.x ... + storage.compA.real ... + + :param cls: The struct class with annotations. + :return: The decorated struct class. 
+ """ + + # inner class for defining a continuous memory region + class _MemRangeMeta(type): + """ + A metaclass for creating MemRange classes. + + This metaclass is used to dynamically create MemRange classes with specific + data types and sizes. + + :ivar _dtype: The data type of the MemRange. + :ivar _size: The size of the MemRange. + """ + + _dtype = None + _size = None + + def __new__(cls, name, bases, dct): + new_cls = super().__new__(cls, name, bases, dct) + return new_cls + + def __getitem__(cls, params) -> Type["struct.MemRange"]: + # get params from syntax: struct.MemRange[dtype, size] + if len(params) == 2: + dtype, size = params + else: + raise TypeError("Invalid struct.MemRange Arguments") + + if not struct._is_scalar_type(dtype): + raise TypeError("MemRange only support dsl scalar type!") + + # Create new class with proper name and parameters + new_cls = type( + f"struct.MemRange[{dtype.__name__}, {size}]", + (struct.MemRange,), + {"_dtype": dtype, "_size": size}, + ) + return new_cls + + @property + def size(cls): + return cls._size + + @property + def elem_width(cls): + return cls._dtype.width + + @property + def size_in_bytes(cls): + return cls.size * cls.elem_width // 8 + + class MemRange(metaclass=_MemRangeMeta): + """ + Defines a range of memory by `MemRange[T, size]`. + """ + + pass + + class _MemRangeData: + """ + Represents a range of memory. + + :param dtype: The data type. + :param size: The size of the memory range in bytes. + :param base: The base address of the memory range. + """ + + def __init__(self, dtype, size, base): + """ + Initializes a new memory range. + + :param dtype: The data type. + :param size: The size of the memory range in bytes. + :param base: The base address of the memory range. + """ + self._dtype = dtype + self._size = size + self._base = base + + def data_ptr(self): + """ + Returns start pointer to the data in this memory range. + + :return: A pointer to the start of the memory range. 
+ :raises AssertionError: If the size of the memory range is not greater than zero. + """ + assert self._size > 0 + return recast_ptr(self._base, dtype=self._dtype) + + def get_tensor(self, layout, swizzle=None, dtype=None): + """ + Creates a tensor from the memory range. + + :param layout: The layout of the tensor. + :param swizzle: Optional swizzle pattern. + :param dtype: Optional data type; defaults to the memory range's data type if not specified. + :return: A tensor representing the memory range. + :raises TypeError: If the layout is incompatible with the swizzle. + :raises AssertionError: If the size of the memory range is not greater than zero. + """ + assert self._size > 0 + # make tensor + if isinstance(layout, ComposedLayout) and (swizzle is not None): + raise TypeError(f"incompatible layout with swizzle") + elem_type = self._dtype if dtype is None else dtype + ptr = recast_ptr(self._base, swizzle, dtype=elem_type) + res = make_tensor(ptr, layout) + return res + + def __getitem__(self, index: int) -> Any: + """ + Returns the element at the specified index in the memory range. + + :param index: The index of the element to retrieve. + :return: The element at the specified index. + :raises AssertionError: If the index is out of range. + """ + assert (index >= 0) and (index < self._size) + return self.data_ptr() + index + + # inner class for aligning a member type + class _AlignMeta(type): + """ + Aligns the given object by setting its alignment attribute. + + :param v: The object to align. Must be a struct, MemRange, or a scalar type. + :param align: The alignment value to set. + :return: A copy of the object with the specified alignment. + :raises TypeError: If the object is not a struct, MemRange, or a scalar type. 
+ """ + + def __new__(cls, name, bases, dct): + return super().__new__(cls, name, bases, dct) + + def __getitem__(cls, params) -> Any: + if len(params) == 2: + obj, align = params + else: + raise TypeError("Invalid struct.Align Arguments") + + # make a copy of type and mark alignment + if struct._is_scalar_type(obj) or isinstance( + obj, (struct, struct._MemRangeMeta) + ): + new_obj = py_copy.copy(obj) + setattr(new_obj, "_struct_alignment_", align) + return new_obj + else: + raise TypeError( + "align only can be applied to sturct/MemRange/base_dsl scalar" + ) + + class Align(metaclass=_AlignMeta): + """ + Aligns the given type by `Align[T, alignment]`. + """ + + pass + + # util func for base dsl scalar types + @staticmethod + def _is_scalar_type(dtype): + """ + Checks if the given type is a scalar numeric type. + + :param dtype: The type to check. + :return: True if the type is a subclass of Numeric, False otherwise. + """ + return isinstance(dtype, type) and issubclass(dtype, Numeric) + + # calculate size and alignment + def __init__(self, cls): + """ + Initializes a new struct decorator instance. + + :param cls: The class representing the structured data type. + :raises TypeError: If the struct is empty. 
+ """ + self._cls = cls + # Get the class annotations + self._annotations = cls.__annotations__ + # Create a dictionary to store the offsets + self._offsets: Dict[str, int] = {} + + # Calculate the offsets and alignment + offset = 0 + alignment = 1 + if len(self._annotations) == 0: + raise TypeError("Empty struct is not supported!") + for name, object in self._annotations.items(): + # get alignment of object + def alignof(object, default: int = 1): + return getattr(object, "_struct_alignment_", default) + + # alignment for the next offset + def align_offset(offset, align): + return (offset + (align - 1)) & ~(align - 1) + + # switch addition order to support dynamic size + def add_offset(val): + return val + offset if isinstance(val, ir.Value) else offset + val + + # size of scalar + if struct._is_scalar_type(object): + dtype_size = object.width // 8 + sub_align = alignof(object, dtype_size) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(dtype_size) + # size of array is size_in_bytes, alignment is elem_size + elif isinstance(object, struct._MemRangeMeta): + if object.size == 0: + continue # skip empty array + sub_align = alignof(object, max(1, object.elem_width // 8)) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(object.size_in_bytes) + # size of struct + elif isinstance(object, struct): + sub_align = max(object.__alignof__(), alignof(object)) + offset = align_offset(offset, sub_align) + self._offsets[name] = offset + offset = add_offset(object.__sizeof__()) + else: + raise TypeError( + f"Struct element only support sturct/array/base_dsl scalar, " + f"but got {object}" + ) + # Total aligment determined by the strictest requirement + alignment = max(alignment, sub_align) + # Total size determined by alignment + self._align_of = alignment + self._size_of = align_offset(offset, alignment) + + # create the __init__ method for decorated struct + def __call__(self, base: 
Any) -> None: + """ + Creates a new instance of the decorated struct. + + :param base: The base address of the struct. + :return: An instance of the decorated struct. + :raises TypeError: If the base pointer is not byte-sized. + """ + if base.type.value_type.width != 8: + raise TypeError("struct base ptr value type must be byte sized.") + # make an new object of user-defined decorated struct + # otherwise it will override same self._cls when new instance created + cls = self._cls() + setattr(cls, "_base", base) + for name, off in self._offsets.items(): + obj = self._annotations[name] + if struct._is_scalar_type(obj): + new_obj = recast_ptr(base + off, dtype=obj) + setattr(cls, name, new_obj) + elif isinstance(obj, struct._MemRangeMeta): + new_obj = struct._MemRangeData(obj._dtype, obj._size, base + off) + setattr(cls, name, new_obj) + elif isinstance(obj, struct): + new_obj = obj(base + off) + setattr(cls, name, new_obj) + else: + raise TypeError( + f"Struct element only support sturct/array/base_dsl scalar, " + f"but got {obj}" + ) + return cls + + # get size + def size_in_bytes(self) -> int: + """ + Returns the size of the struct in bytes. + + :return: The size of the struct. + """ + return self._size_of + + # get size + def __sizeof__(self) -> int: + return self._size_of + + # get alignment + def __alignof__(self) -> int: + return self._align_of diff --git a/python/CuTeDSL/cutlass/cute/math.py b/python/CuTeDSL/cutlass/cute/math.py new file mode 100644 index 00000000..3dda89c2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/math.py @@ -0,0 +1,354 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from .core import TensorSSA
from cutlass._mlir.dialects import math, arith


def _fastmath_flags(fastmath: bool):
    """Translate the boolean ``fastmath`` switch into MLIR arith fast-math flags.

    Previously every wrapper in this module accepted ``fastmath`` but always
    lowered with ``FastMathFlags.none``, silently ignoring the request; this
    helper centralizes the (now honored) mapping.

    :param fastmath: When True, permit fast, reduced-accuracy lowering.
    :type fastmath: bool
    :return: ``arith.FastMathFlags.fast`` if ``fastmath`` else ``arith.FastMathFlags.none``.
    """
    return arith.FastMathFlags.fast if fastmath else arith.FastMathFlags.none


def _unary(op, a: TensorSSA, fastmath: bool) -> TensorSSA:
    """Apply an element-wise unary math-dialect op, preserving shape and dtype."""
    return TensorSSA(op(a, fastmath=_fastmath_flags(fastmath)), a.shape, a.dtype)


def acos(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc cosine of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc cosine of each element in input tensor
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = acos(y)                     # Compute arc cosine
    """
    return _unary(math.acos, a, fastmath)


def asin(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc sine of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc sine of each element in input tensor
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = asin(y)                     # Compute arc sine
    """
    return _unary(math.asin, a, fastmath)


def atan(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc tangent of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc tangent of each element in input tensor
    :rtype: TensorSSA
    :raises NotImplementedError: Always — atan lowering is not available yet.

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = atan(y)                     # Compute arc tangent
    """
    # Fail loudly until the math.atan lowering is supported; the original
    # unreachable return after this raise has been removed as dead code.
    raise NotImplementedError("atan is not implemented")


def atan2(a: TensorSSA, b: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise arc tangent of two tensors.

    Computes atan2(a, b) element-wise. The function atan2(a, b) is the angle in radians
    between the positive x-axis and the point given by the coordinates (b, a).

    :param a: First input tensor (y-coordinates)
    :type a: TensorSSA
    :param b: Second input tensor (x-coordinates)
    :type b: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the arc tangent of a/b element-wise
    :rtype: TensorSSA

    Example:

    .. code-block::

        y = cute.make_fragment(ptr1, layout).load()  # y coordinates
        x = cute.make_fragment(ptr2, layout).load()  # x coordinates
        theta = atan2(y, x)                          # Compute angles
    """
    return TensorSSA(
        math.atan2(a, b, fastmath=_fastmath_flags(fastmath)), a.shape, a.dtype
    )


def cos(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise cosine of the input tensor.

    :param a: Input tensor (in radians)
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the cosine of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = cos(y)                      # Compute cosine
    """
    return _unary(math.cos, a, fastmath)


def erf(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise error function of the input tensor.

    The error function is defined as:
    erf(x) = 2/√π ∫[0 to x] exp(-t²) dt

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the error function value for each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = erf(y)                      # Compute error function
    """
    return _unary(math.erf, a, fastmath)


def exp2(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-2 exponential of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing 2 raised to the power of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = exp2(y)                     # Compute 2^x
    """
    return _unary(math.exp2, a, fastmath)


def log(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise natural logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the natural logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log(y)                      # Compute natural logarithm
    """
    return _unary(math.log, a, fastmath)


def log2(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-2 logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the base-2 logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log2(y)                     # Compute log base 2
    """
    return _unary(math.log2, a, fastmath)


def log10(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise base-10 logarithm of the input tensor.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the base-10 logarithm of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = log10(y)                    # Compute log base 10
    """
    return _unary(math.log10, a, fastmath)


def rsqrt(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise reciprocal square root of the input tensor.

    Computes 1/√x element-wise.

    :param a: Input tensor
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the reciprocal square root of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = rsqrt(y)                    # Compute 1/√x
    """
    return _unary(math.rsqrt, a, fastmath)


def sin(a: TensorSSA, fastmath: bool = False) -> TensorSSA:
    """Compute element-wise sine of the input tensor.

    :param a: Input tensor (in radians)
    :type a: TensorSSA
    :param fastmath: Enable fast math optimizations, defaults to False
    :type fastmath: bool, optional
    :return: Tensor containing the sine of each element
    :rtype: TensorSSA

    Example:

    .. code-block::

        x = cute.make_fragment(layout)  # Create tensor
        y = x.load()                    # Load values
        z = sin(y)                      # Compute sine
    """
    # NOTE(review): the original body runs past the visible window; this
    # return follows the identical pattern of every sibling — confirm.
    return _unary(math.sin, a, fastmath)
+ + :param a: Input tensor (in radians) + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the sine of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = sin(y) # Compute sine + """ + return TensorSSA(math.sin(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def sqrt(a: TensorSSA, fastmath: bool = False) -> TensorSSA: + """Compute element-wise square root of the input tensor. + + :param a: Input tensor + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the square root of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = sqrt(y) # Compute square root + """ + return TensorSSA(math.sqrt(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def tan(a: TensorSSA) -> TensorSSA: + """Compute element-wise tangent of the input tensor. + + :param a: Input tensor (in radians) + :type a: TensorSSA + :return: Tensor containing the tangent of each element + :rtype: TensorSSA + + Example: + + .. code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = tan(y) # Compute tangent + """ + return TensorSSA(math.tan(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +def tanh(a: TensorSSA, fastmath: bool = False) -> TensorSSA: + """Compute element-wise hyperbolic tangent of the input tensor. + + :param a: Input tensor + :type a: TensorSSA + :param fastmath: Enable fast math optimizations, defaults to False + :type fastmath: bool, optional + :return: Tensor containing the hyperbolic tangent of each element + :rtype: TensorSSA + + Example: + + .. 
code-block:: + + x = cute.make_fragment(layout) # Create tensor + y = x.load() # Load values + z = tanh(y) # Compute hyperbolic tangent + """ + return TensorSSA(math.tanh(a, fastmath=arith.FastMathFlags.none), a.shape, a.dtype) + + +__all__ = [ + "acos", + "asin", + "atan", + "atan2", + "cos", + "erf", + "exp2", + "log", + "log10", + "log2", + "rsqrt", + "sin", + "sqrt", + "tan", + "tanh", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py new file mode 100644 index 00000000..0655bb09 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/__init__.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from . import warp +from . import cpasync +from . import warpgroup +from . import tcgen05 + +from .common import * +from .helpers import * + + +# __all__ is required here for documentation generation +__all__ = [ + "OpError", + "MmaUniversalOp", + "CopyUniversalOp", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/common.py b/python/CuTeDSL/cutlass/cute/nvgpu/common.py new file mode 100644 index 00000000..c93becad --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/common.py @@ -0,0 +1,143 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type, Optional + +from cutlass.cutlass_dsl import DSLBaseError + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from .. import core +from ..typing import Float16, Float32, Float64, Numeric + + +class OpError(DSLBaseError): + """ + An exception class for Op construction errors. + """ + + def __init__( + self, op: core.Op, message: str, suggestion: Optional[str] = None + ) -> None: + if suggestion is None: + # Default suggestion + suggestion = "Check your Op construction code" + super().__init__( + message, + error_code=f"{op.__class__.__name__} error", + suggestion=suggestion, + ) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +@dataclass(frozen=True) +class MmaUniversalOp(core.MmaOp): + """ + The universal MMA Operation. + + This Operation currently expects the A/B operands as well as the accumulator to share the same + data types. 
+ + :param abacc_dtype: The data type for the A/B operands and the accumulator + :type abacc_dtype: Type[Numeric] + """ + + abacc_dtype: Type[Numeric] + + def __post_init__(self) -> None: + if self.abacc_dtype not in [Float16, Float32, Float64]: + raise OpError( + self, + f"expects the 'abacc_dtype' Op parameter to be one of Float16, Float32, or Float64", + ) + + def __str__(self) -> str: + return ( + "universal MMA Operation using FMA" + f"\n A/B/Accumulator data type = {self.abacc_dtype}" + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaUniversalTrait": + shape_mnk_attr = ir.Attribute.parse(f'#cute.shape<"(1,1,1)">') + atom_ty = _cute_nvgpu_ir.UniversalFmaAtomType.get( + shape_mnk_attr, + self.abacc_dtype.mlir_type, + self.abacc_dtype.mlir_type, + self.abacc_dtype.mlir_type, + ) + return MmaUniversalTrait(_cute_ir.atom(atom_ty, loc=loc, ip=ip)) + + +class MmaUniversalTrait(core.Trait): + pass + + +#################################################################################################### +# +# Copy Ops and Traits +# +#################################################################################################### + + +@dataclass(frozen=True) +class CopyUniversalOp(core.CopyOp): + """ + The universal Copy Operation. + + When creating a Copy Atom out of this operation, the expected usage pattern is + + .. code-block:: python + + op = cute.nvgpu.CopyUniversalOp() + atom = cute.make_copy_atom(op, tensor_dtype, num_bits_per_copy=64) + + - ``tensor_dtype`` is the data type used to build the reference TV Layout (either the source \ + or the destination TV Layout) in unit of tensor elements and is used for partitioning by \ + ``TiledCopy`` for example + - ``num_bits_per_copy`` is a kw argument specifying the number of bits to copy per Atom \ + execution. This can be larger than the width of the above data type. When not provided, \ + the compiler will do a best effort at auto-vectorizing. 
+ """ + + def __str__(self) -> str: + return "universal Copy Operation" + + def _make_trait( + self, + copy_internal_type: Type[Numeric], + *, + loc=None, + ip=None, + **kwargs, + ) -> "CopyUniversalTrait": + num_bits_per_copy = kwargs.get("num_bits_per_copy", 0) + if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy < 0): + raise ValueError( + "expects a 'num_bits_per_copy' kw argument of type int that is non-negative " + f"when creating a copy Atom for {self.__class__.__name__}" + ) + ty = _cute_nvgpu_ir.CopyAtomSIMTSyncCopyType.get( + copy_internal_type.mlir_type, num_bits_per_copy + ) + return CopyUniversalTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class CopyUniversalTrait(core.Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py new file mode 100644 index 00000000..322e8bf0 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/__init__.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .helpers import * + + +# __all__ is required here for documentation generation +__all__ = [ + # + # copy.py + # + "LoadCacheMode", + "CopyG2SOp", + "CopyBulkTensorTileG2SOp", + "CopyBulkTensorTileG2SMulticastOp", + "CopyBulkTensorTileS2GOp", + # + # helpers.py + # + "make_tma_tile_atom", + "tma_partition", + "create_tma_multicast_mask", + "prefetch_descriptor", + "copy_tensormap", + "update_tma_descriptor", + "fence_tma_desc_acquire", + "cp_fence_tma_desc_release", + "fence_tma_desc_release", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py new file mode 100644 index 00000000..8de65a72 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/copy.py @@ -0,0 +1,366 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Optional, Type + +from cutlass.cutlass_dsl import CuTeDSL, t + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ...core import CopyOp, Trait +from ...typing import Int16, Pointer, Integer, Numeric +from ..common import OpError +from ..tcgen05.mma import CtaGroup + + +#################################################################################################### +# +# Aynchronous copies +# +#################################################################################################### + + +class LoadCacheMode(enum.Enum): + """ + An enumeration for the possible cache modes of a non-bulk ``cp.async`` instruction. + + See the `PTX documentation `__. + """ + + ALWAYS = _cute_nvgpu_ir.LoadCacheMode.always + GLOBAL = _cute_nvgpu_ir.LoadCacheMode.global_ + STREAMING = _cute_nvgpu_ir.LoadCacheMode.streaming + LAST_USE = _cute_nvgpu_ir.LoadCacheMode.last_use + NONE = _cute_nvgpu_ir.LoadCacheMode.none + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_nvgpu_ir.LoadCacheMode: + return self.value + + +@dataclass(frozen=True) +class CopyG2SOp(CopyOp): + """ + Non-bulk asynchronous GMEM to SMEM Copy Operation. + + See the `PTX documentation `__. 
+ """ + + cache_mode: LoadCacheMode = LoadCacheMode.ALWAYS + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM copy Operation" + if self.cache_mode != LoadCacheMode.ALWAYS: + res += f"\n with cache mode = {self.cache_mode}" + return res + + def _make_trait( + self, + copy_internal_type: Type[t.Numeric], + *, + loc=None, + ip=None, + **kwargs, + ) -> "CopyG2STrait": + num_bits_per_copy = kwargs.get("num_bits_per_copy", None) + if not isinstance(num_bits_per_copy, int) or (num_bits_per_copy <= 0): + raise ValueError( + "expects a 'num_bits_per_copy' kw argument of type int that is positive " + f"when creating a copy Atom for {self.__class__.__name__}" + ) + # Verify that the user provided enum values + if not isinstance(self.cache_mode, LoadCacheMode): + raise OpError( + self, + "expects the 'cache_mode' Op parameter to be a LoadCacheMode instance", + ) + ty = _cute_nvgpu_ir.CopyAtomSIMTAsyncCopyType.get( + copy_internal_type.mlir_type, self.cache_mode._to_ir(), num_bits_per_copy + ) + return CopyG2STrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class CopyG2STrait(Trait): + pass + + +#################################################################################################### +# +# Bulk tensor copies a.k.a TMA copies +# +#################################################################################################### + +TMA_MBAR_PTR_FIELD_NAME = "tma_bar" +TMA_MASK_FIELD_NAME = "mcast_mask" +TMA_DESC_PTR_FIELD_NAME = "tma_descriptor_ptr" + +# +# TMA GMEM -> SMEM copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileG2SOp(CopyOp): + """ + Bulk tensor asynchrnous GMEM to SMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + cta_group: CtaGroup = CtaGroup.ONE + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self) -> None: + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, "expects the 'cta_group' parameter to be a CtaGroup instance" + ) + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90": + raise OpError( + self, + f"CTA group of 2 is tcgen05-specific and is not and is not compatible with {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM bulk tensor copy Operation" + if self.cta_group == CtaGroup.TWO: + res += f"\n CTA group = 2" + return res + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileG2SNonExecTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum: + if self.cta_group == CtaGroup.ONE: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90 + elif self.cta_group == CtaGroup.TWO: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm + else: + assert False, "unrecognized self.cta_group" + + +class CopyBulkTensorTileG2SNonExecTrait(Trait): + # We allow kw args to be dropped so that the user can write common code for non-multicast + # and multicast loads. + def unpack( + self, + *, + loc=None, + ip=None, + tma_bar_ptr: Optional[Pointer] = None, + tma_desc_ptr: Optional[Pointer] = None, + **kwargs, + ): + """ + Custom implementation of unpack for non-executable TMAs. + + The non-multicast TMA load requires a `tma_bar_ptr` keyword argument to be provided when + using `cute.copy`. 
Any other kw arguments will be ignored instead of triggering an error. + """ + if not isinstance(tma_bar_ptr, Pointer): + raise ValueError( + "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument" + ) + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_MBAR_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip + ) + if isinstance(tma_desc_ptr, Pointer): + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value + + +# +# TMA GMEM -> SMEM multicast copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileG2SMulticastOp(CopyOp): + """ + Bulk tensor asynchrnous multicast GMEM to SMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + cta_group: CtaGroup = CtaGroup.ONE + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self): + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, "expects the 'cta_group' parameter to be a CtaGroup instance" + ) + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + if (self.cta_group == CtaGroup.TWO) and arch[:5] == "sm_90": + raise OpError( + self, + f"CTA group of 2 is tcgen05-specific and is not and is not compatible with {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + res = "cp.async GMEM -> SMEM bulk tensor multicast copy Operation" + if self.cta_group == CtaGroup.TWO: + res += f"\n CTA group = 2" + return res + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileG2SMulticastNonExecTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + def _to_ir(self) -> _cute_nvgpu_ir.TiledTmaLoadEnum: + if self.cta_group == CtaGroup.ONE: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_90_multicast + elif self.cta_group == CtaGroup.TWO: + return _cute_nvgpu_ir.TiledTmaLoadEnum.sm_100_2sm_multicast + else: + assert False, "unrecognized self.cta_group" + + +class CopyBulkTensorTileG2SMulticastNonExecTrait(Trait): + def unpack( + self, + *, + loc=None, + ip=None, + tma_bar_ptr: Optional[Pointer] = None, + mcast_mask=None, + tma_desc_ptr=None, + ): + """ + Custom implementation of unpack for non-executable TMAs. + + The multicast TMA load requires a `tma_bar_ptr` and a `mcast_mask` keyword arguments to be + provided when using `cute.copy`. 
+ """ + if not isinstance(tma_bar_ptr, Pointer): + raise ValueError( + "expects a pointer to an mbarrier to be provided via the tma_bar_ptr kw argument" + ) + if not isinstance(mcast_mask, Integer): + raise ValueError( + "expects a multicast mask to be provided via the mcast_mask kw argument" + ) + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_bar_ptr.value, loc=loc, ip=ip + ) + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, Int16(mcast_mask).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + if isinstance(tma_desc_ptr, Pointer): + attr_str = f"#cute_nvgpu.atom_copy_field_tmaload<{TMA_DESC_PTR_FIELD_NAME}>" + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value + + +# +# TMA SMEM -> GMEM copies +# + + +@dataclass(frozen=True) +class CopyBulkTensorTileS2GOp(CopyOp): + """ + Bulk tensor asynchrnous SMEM to GMEM Copy Operation using the TMA unit. + + See the `PTX documentation `__. + This Operation uses TMA in the ``.tile`` mode. 
+ """ + + admissible_archs = ["sm_90", "sm_90a", "sm_100a"] + + def __post_init__(self): + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + def __str__(self) -> str: + return "cp.async SMEM -> GMEM bulk tensor copy Operation" + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "CopyBulkTensorTileS2GTrait": + raise NotImplementedError( + "Use cpasync.make_tma_tile_atom to obtain a copy Atom for TMA" + ) + + +class CopyBulkTensorTileS2GTrait(Trait): + def unpack(self, *, loc=None, ip=None, tma_desc_ptr: Optional[Pointer] = None): + """ + Custom implementation of unpack for non-executable TMAs. + """ + exec_value = _cute_nvgpu_ir.atom_make_exec_tma(self.value, loc=loc, ip=ip) + if isinstance(tma_desc_ptr, Pointer): + attr_str = ( + f"#cute_nvgpu.atom_copy_field_tmastore<{TMA_DESC_PTR_FIELD_NAME}>" + ) + attr = ir.Attribute.parse(attr_str) + exec_value = _cute_nvgpu_ir.atom_set_value( + exec_value, attr, tma_desc_ptr.value, loc=loc, ip=ip + ) + return exec_value diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py new file mode 100644 index 00000000..92f028a2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/cpasync/helpers.py @@ -0,0 +1,327 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Optional, Tuple, Type, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import llvm + +from ...typing import Coord, Layout, Tensor, Tiler, Pointer, Int16, Numeric, NumericMeta +from ... import core +from .copy import ( + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileS2GOp, + CopyBulkTensorTileG2SNonExecTrait, + CopyBulkTensorTileG2SMulticastNonExecTrait, + CopyBulkTensorTileS2GTrait, +) + + +@dsl_user_op +def make_tma_tile_atom( + op: Union[ + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileS2GOp, + ], + gmem_tensor: Tensor, + smem_layout: Layout, + cta_tiler: Tiler, + num_multicast: int = 1, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + """ + Makes a TMA Copy Atom in the ``.tile`` mode to copy tiles of a GMEM tensor to/from and SMEM + buffer with the given Layout. + + Given + + - a GMEM tensor + - a SMEM layout + - a CTA-level Tiler + + this function figures out the bulk tensor asynchronous copy instruction to use with the maximum + "TMA vector length" to copy tiles of the GMEM tensor to/from an SMEM buffer with the provided + layout and consistent with the provided Tiler. + + This function returns two results: + + 1. the Copy Atom + 2. the so-called TMA tensor used to map logical coordinates of the GMEM tensor to coordinates \ + that the TMA unit can consume. 
TMA tensors have so-called basis stride elements so that the \ + associated layout can output coordinates. Otherwise, TMA tensors can be partitioned \ + similarly to any other CuTe tensors using the algebra. + + :param op: The Copy Operation to construct an Atom for + :type op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp, CopyBulkTensorTileS2GOp] + :param gmem_tensor: The GMEM tensor involved in the Copy + :type gmem_tensor: Tensor + :param smem_layout: The SMEM layout to construct the Copy Atom for + :type smem_layout: Layout + :param cta_tiler: The CTA Tiler to use + :type cta_tiler: Tiler + :param num_multicast: The multicast factor + :type num_multicast: int + :param internal_type: An optional parameter for the internal data type to use when the actual data type is not supported by the TMA unit + :type internal_type: Type[Numeric] + :return: A Copy Atom for this Operation and the associated TMA tensor + :rtype: Tuple[core.CopyAtom, Tensor] + """ + + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + + cta_v_map = core.composition( + core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip), + cta_tiler, + loc=loc, + ip=ip, + ) + + if isinstance(op, CopyBulkTensorTileG2SOp): + if num_multicast != 1: + raise ValueError( + f"expects num_multicast to be 1 for non multicast G2S copies, " + f"but got {num_multicast}" + ) + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + elif isinstance(op, CopyBulkTensorTileG2SMulticastOp): + if num_multicast < 1: + raise ValueError( + f"expects num_multicast to be >= 1 for multicast G2S copies, " + f"but got {num_multicast}" + ) 
+ res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + elif isinstance(op, CopyBulkTensorTileS2GOp): + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_store( + gmem_tensor.value, + smem_layout, + cta_v_map, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + return core.CopyAtom(op, CopyBulkTensorTileS2GTrait(res[0])), res[1] + else: + raise ValueError(f"expects a bulk tensor (TMA) Copy Op, but got {op}") + + +@dsl_user_op +def tma_partition( + atom: core.CopyAtom, + cta_coord: Coord, + cta_layout: Layout, + smem_tensor: Tensor, + gmem_tensor: Tensor, + *, + loc=None, + ip=None, +) -> Tuple[Tensor, Tensor]: + """ + Tiles the GMEM and SMEM tensors for the provided TMA Copy Atom. + """ + cta_coord_val = core._pack_coord(cta_coord, loc=loc, ip=ip) + s, d = _cute_nvgpu_ir.atom_tma_partition( + atom._trait.value, + cta_coord=cta_coord_val, + cta_layout=cta_layout, + smem_tensor=smem_tensor.value, + gmem_tensor=gmem_tensor.value, + loc=loc, + ip=ip, + ) + return s, d + + +@dsl_user_op +def create_tma_multicast_mask( + cta_layout_vmnk: Layout, + cta_coord_vmnk: Coord, + mcast_mode: int, + *, + loc=None, + ip=None, +) -> Int16: + """ + Computes a multicast mask for a TMA load Copy. 
+ + :param cta_layout_vmnk: The VMNK layout of the cluster + :type cta_layout_vmnk: Layout + :param cta_coord_vmnk: The VMNK coordinate of the current CTA + :type cta_coord_vmnk: Coord + :param mcast_mode: The tensor mode in which to multicast + :type mcast_mode: int + :return: The resulting mask + :rtype: Int16 + """ + if core.rank(cta_layout_vmnk) != 4: + raise ValueError( + f"cta_layout_vmnk must be rank 4, but got {core.pretty_str(cta_layout_vmnk)}" + ) + if core.rank(cta_coord_vmnk) != 4: + raise ValueError( + f"cta_coord_vmnk must be rank 4, but got {core.pretty_str(cta_coord_vmnk)}" + ) + return core.make_layout_image_mask( + cta_layout_vmnk, cta_coord_vmnk, mcast_mode, loc=loc, ip=ip + ) + + +@dsl_user_op +def prefetch_descriptor(tma_atom: core.CopyAtom, *, loc=None, ip=None) -> None: + """ + Prefetches the TMA descriptor associated with the TMA Atom. + """ + _cute_nvgpu_ir.prefetch_tma_desc(tma_atom._trait.value, loc=loc, ip=ip) + + +@dsl_user_op +def copy_tensormap( + tma_atom: core.CopyAtom, tensormap_ptr: Pointer, *, loc=None, ip=None +) -> None: + """ + Copies the tensormap held by a TMA Copy Atom to the memory location pointed to by the provided + pointer. + + :param tma_atom: The TMA Copy Atom + :type tma_atom: CopyAtom + :param tensormap_ptr: The pointer to the memory location to copy the tensormap to + :type tensormap_ptr: Pointer + """ + _cute_nvgpu_ir.copy_tma_desc( + tma_atom._trait.value, tensormap_ptr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def update_tma_descriptor( + tma_atom: core.CopyAtom, + gmem_tensor: Tensor, + tma_desc_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + Updates the TMA descriptor in the memory location pointed to by the provided pointer using + information from a TMA Copy Atom and the provided GMEM tensor. + + Specifically, the following fields of the TMA descriptor will be updated: + + 1. the GMEM tensor base address + 2. the GMEM tensor shape + 3. 
the GMEM tensor stride + + Other fields of the TMA descriptor are left unchanged. + + :param tma_atom: The TMA Copy Atom + :type tma_atom: CopyAtom + :param gmem_tensor: The GMEM tensor + :type gmem_tensor: Tensor + :param tensormap_ptr: The pointer to the memory location of the descriptor to udpate + :type tensormap_ptr: Pointer + """ + _cute_nvgpu_ir.update_tma_desc( + tma_atom._trait.value, gmem_tensor.value, tma_desc_ptr.value, loc=loc, ip=ip + ) + + +@dsl_user_op +def fence_tma_desc_acquire( + tma_desc_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + See the `PTX documentation `__. + """ + tma_desc_ptr_i64 = tma_desc_ptr.toint(loc=loc, ip=ip).ir_value() + llvm.inline_asm( + None, + [tma_desc_ptr_i64], + "fence.proxy.tensormap::generic.acquire.gpu [$0], 128;", + "l", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + + +@dsl_user_op +def cp_fence_tma_desc_release( + tma_desc_global_ptr: Pointer, + tma_desc_shared_ptr: Pointer, + *, + loc=None, + ip=None, +) -> None: + """ + See the `PTX documentation `__. + """ + tma_desc_global_ptr_i64 = tma_desc_global_ptr.toint(loc=loc, ip=ip).ir_value() + tma_desc_shared_ptr_i32 = tma_desc_shared_ptr.toint(loc=loc, ip=ip).ir_value() + llvm.inline_asm( + None, + [tma_desc_global_ptr_i64, tma_desc_shared_ptr_i32], + "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [$0], [$1], 128;", + "l,r", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) + + +@dsl_user_op +def fence_tma_desc_release(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. 
+ """ + llvm.inline_asm( + None, + [], + "fence.proxy.tensormap::generic.release.gpu;", + "", + has_side_effects=True, + is_align_stack=False, + asm_dialect=llvm.AsmDialect.AD_ATT, + ) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py new file mode 100644 index 00000000..020b96d8 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/helpers.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Optional, Tuple, Type, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from .. 
import core +from ..typing import Shape, Layout, Tensor, Numeric, NumericMeta +from ...impl_utils import check_type_in +from .cpasync.copy import ( + CopyBulkTensorTileG2SOp, + CopyBulkTensorTileG2SNonExecTrait, + CopyBulkTensorTileG2SMulticastOp, + CopyBulkTensorTileG2SMulticastNonExecTrait, +) + + +#################################################################################################### +# +# TMA creation helpers for tcgen05 MMAs +# +#################################################################################################### + + +@dsl_user_op +def make_tma_tile_atom_A( + op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + gmem_tensor: Tensor, + smem_layout: Layout, + mma_tiler_mnk: Shape, + tiled_mma: core.TiledMma, + cluster_shape_vmnk: Shape, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + check_type_in( + op, + [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + "op", + "make_tma_tile_atom_A", + ) + + ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip) + mma_tiler_mk = (mma_tiler_mnk[0], *mma_tiler_mnk[2:]) + g_tile = core.composition(ident, mma_tiler_mk, loc=loc, ip=ip) + cta_v_map = tiled_mma._thrfrg_A(g_tile) + cta_v_map = core.get(cta_v_map, mode=[1]) + cta_v_map = core.dice(cta_v_map, (1, (1,) * core.rank(g_tile))) + + if isinstance(op, CopyBulkTensorTileG2SOp): + num_multicast = 1 + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + # multicast across the N-mode since those would share the same tile of A + num_multicast = core.size(cluster_shape_vmnk, mode=[2]) + + # res[0] = the IR Value for the non-executable atom instance + # res[1] = the IR Value for the associated TMA tensor + res = 
_cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, + cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + if isinstance(op, CopyBulkTensorTileG2SOp): + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + + +@dsl_user_op +def make_tma_tile_atom_B( + op: Union[CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + gmem_tensor: Tensor, + smem_layout: Layout, + mma_tiler_mnk: Shape, + tiled_mma: core.TiledMma, + cluster_shape_vmnk: Shape, + *, + internal_type: Optional[Type[Numeric]] = None, + loc=None, + ip=None, +) -> Tuple[core.CopyAtom, Tensor]: + if internal_type is not None: + if not isinstance(internal_type, NumericMeta): + raise TypeError(f"internal_type must be a Numeric, but got {internal_type}") + internal_type = internal_type.mlir_type + check_type_in( + op, + [CopyBulkTensorTileG2SOp, CopyBulkTensorTileG2SMulticastOp], + "op", + "make_tma_tile_atom_B", + ) + + ident = core.make_identity_layout(gmem_tensor.shape, loc=loc, ip=ip) + mma_tiler_nk = (mma_tiler_mnk[1], *mma_tiler_mnk[2:]) + g_tile = core.composition(ident, mma_tiler_nk, loc=loc, ip=ip) + cta_v_map = tiled_mma._thrfrg_B(g_tile) + cta_v_map = core.get(cta_v_map, mode=[1]) + cta_v_map = core.dice(cta_v_map, (1, (1,) * core.rank(g_tile))) + + if isinstance(op, CopyBulkTensorTileG2SOp): + num_multicast = 1 + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + # multicast across the M-mode since those would share the same tile of B + num_multicast = core.size(cluster_shape_vmnk, mode=[1]) + + # res[0] = the IR Value for the non-executable atom instance + # res[1] = the IR Value for the associated TMA tensor + res = _cute_nvgpu_ir.atom_make_non_exec_tiled_tma_load( + gmem_tensor.value, + smem_layout, 
+ cta_v_map, + op._to_ir(), + num_multicast=num_multicast, + internal_type=internal_type, + loc=loc, + ip=ip, + ) + if isinstance(op, CopyBulkTensorTileG2SOp): + return core.CopyAtom(op, CopyBulkTensorTileG2SNonExecTrait(res[0])), res[1] + else: + assert isinstance(op, CopyBulkTensorTileG2SMulticastOp) + return ( + core.CopyAtom(op, CopyBulkTensorTileG2SMulticastNonExecTrait(res[0])), + res[1], + ) + + +__all__ = [ + "make_tma_tile_atom_A", + "make_tma_tile_atom_B", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py new file mode 100644 index 00000000..4afeb527 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/__init__.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .mma import * +from .helpers import * + +# __all__ is required here for documentation generation +__all__ = [ + # + # copy.py + # + "Repetition", + "Pack", + "Unpack", + "Ld16x64bOp", + "Ld16x128bOp", + "Ld16x256bOp", + "Ld16x32bx2Op", + "Ld32x32bOp", + "St16x64bOp", + "St16x128bOp", + "St16x256bOp", + "St16x32bx2Op", + "St32x32bOp", + # + # mma.py + # + "OperandMajorMode", + "OperandSource", + "CtaGroup", + "Field", + "MmaTF32Op", + "MmaF16BF16Op", + "MmaI8Op", + "MmaFP8Op", + "SmemLayoutAtomKind", + # + # helpers.py + # + "make_smem_layout_atom", + "tile_to_mma_shape", + "commit", + "is_tmem_load", + "is_tmem_store", + "get_tmem_copy_properties", + "find_tmem_tensor_col_offset", + "make_tmem_copy", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py new file mode 100644 index 00000000..283cf8fb --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/copy.py @@ -0,0 +1,465 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import CopyOp, Trait +from ...typing import Numeric + + +class Repetition(enum.Enum): + """ + An enumeration for the number of repetitions of a given TMEM copy within the instruction. + """ + + x1 = 1 + x2 = 2 + x4 = 4 + x8 = 8 + x16 = 16 + x32 = 32 + x64 = 64 + x128 = 128 + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, int): + if value == 1: + return Repetition.x1 + elif value == 2: + return Repetition.x2 + elif value == 8: + return Repetition.x8 + elif value == 16: + return Repetition.x16 + elif value == 32: + return Repetition.x32 + elif value == 64: + return Repetition.x64 + elif value == 128: + return Repetition.x128 + + +class Pack(enum.Enum): + """ + An enumeration for the possible packing patterns for TMEM to RMEM copies. + """ + + NONE = enum.auto() + PACK_16b_IN_32b = enum.auto() + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +class Unpack(enum.Enum): + """ + An enumeration for the possible unpacking patterns for RMEM to TMEM copies. 
+ """ + + NONE = enum.auto() + UNPACK_32b_IN_16b = enum.auto() + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +@dataclass(frozen=True) +class _LdBase(CopyOp): + repeat: Repetition = Repetition.x1 + pack: Pack = Pack.NONE + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + if not isinstance(self.repeat, Repetition): + raise OpError( + self, + "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance", + ) + if not isinstance(self.pack, Pack): + raise OpError( + self, + "expects the 'pack' Op parameter to be a tcgen05.Pack instance", + ) + + def __str__(self) -> str: + res = ( + f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of repetitions = {self.repeat.value}" + ) + if self.pack == Pack.PACK_16b_IN_32b: + res += f"\n with 2x 16-bit to 32b packing" + return res + + +@dataclass(frozen=True) +class Ld16x64bOp(_LdBase): + """ + 16x64b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x64b`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x64bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 64, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x64bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x128bOp(_LdBase): + """ + 16x128b TMEM load Operation. + + See the `PTX documentation `__. 
+ This Operation corresponds to the ``.16x128b`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat == Repetition.x128: + raise OpError( + self, + "x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32, x64", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x128bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 128, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x128bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x256bOp(_LdBase): + """ + 16x256b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x256b`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat in (Repetition.x128, Repetition.x64): + raise OpError( + self, + "x64 and x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x256bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 256, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x256bTrait(Trait): + pass + + +@dataclass(frozen=True) +class Ld16x32bx2Op(_LdBase): + """ + 16x32bx2 TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x32bx2`` qualifier. 
+ """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld16x32bx2Trait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 16, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld16x32bx2Trait(Trait): + pass + + +@dataclass(frozen=True) +class Ld32x32bOp(_LdBase): + """ + 32x32b TMEM load Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.32x32`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "Ld32x32bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemLoadType.get( + copy_internal_type.mlir_type, + 32, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.pack == Pack.PACK_16b_IN_32b else None, + ) + return Ld32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class Ld32x32bTrait(Trait): + pass + + +@dataclass(frozen=True) +class _StBase(CopyOp): + repeat: Repetition + unpack: Unpack = Unpack.NONE + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Arch verification + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + + if not isinstance(self.repeat, Repetition): + raise OpError( + self, + "expects the 'repeat' Op parameter to be a tcgen05.Repetition instance", + ) + if not isinstance(self.unpack, Unpack): + raise OpError( + self, + "expects the 'pack' Op parameter to be a tcgen05.Unpack instance", + ) + + def __str__(self) -> str: + res = ( + f"tcgen05 {self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of repetitions = {self.repeat.value}" + ) + if self.unpack == Unpack.UNPACK_32b_IN_16b: + res += f"\n with 32-bit to 2x 16b 
unpacking" + return res + + +@dataclass(frozen=True) +class St16x64bOp(_StBase): + """ + 16x64b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x64`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x64bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 64, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x64bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x64bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x128bOp(_StBase): + """ + 16x128b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x128`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat == Repetition.x128: + raise OpError( + self, + "x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32, x64", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x128bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 128, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x128bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x128bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x256bOp(_StBase): + """ + 16x256b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x256`` qualifier. 
+ """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.repeat in (Repetition.x128, Repetition.x64): + raise OpError( + self, + "x64 and x128 repetition is not supported", + suggestion="choose one of x1, x2, x4, x8, x16, x32", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x256bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 256, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x256bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x256bTrait(Trait): + pass + + +@dataclass(frozen=True) +class St16x32bx2Op(_StBase): + """ + 16x32x2b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.16x32x2`` qualifier. + """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St16x32bx2Trait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 16, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St16x32bx2Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St16x32bx2Trait(Trait): + pass + + +@dataclass(frozen=True) +class St32x32bOp(_StBase): + """ + 32x32b TMEM store Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.32x32`` qualifier. 
+ """ + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "St32x32bTrait": + ty = _cute_nvgpu_ir.CopyAtomSM100TmemStoreType.get( + copy_internal_type.mlir_type, + 32, + 32, + self.repeat.value, + ir.UnitAttr.get() if self.unpack == Unpack.UNPACK_32b_IN_16b else None, + ) + return St32x32bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class St32x32bTrait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py new file mode 100644 index 00000000..cac64131 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/helpers.py @@ -0,0 +1,301 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import overload, Type, Tuple, Union + +from cutlass.cutlass_dsl import dsl_user_op + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir.dialects import nvvm + +from ...typing import ( + Shape, + IntTuple, + Layout, + Tensor, + Int, + Numeric, + NumericMeta, + Int16, + Int32, +) +from ... 
import core +from .mma import SmemLayoutAtomKind, CtaGroup +from .copy import ( + Pack, + Unpack, + Ld16x64bOp, + Ld16x128bOp, + Ld16x256bOp, + Ld16x32bx2Op, + Ld32x32bOp, + St16x64bOp, + St16x128bOp, + St16x256bOp, + St16x32bx2Op, + St32x32bOp, +) + + +#################################################################################################### +# +# Helper functions for MMA +# +#################################################################################################### + + +@dsl_user_op +def make_smem_layout_atom( + kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None +) -> core.ComposedLayout: + """ + Makes a SMEM layout Atom. + + This function creates a composed layout in unit of elements consistent with the requested layout + Atom kind and element data type. + + :param kind: The kind of layout Atom + :type kind: SmemLayoutAtomKind + :param element_type: The element data type to construct the layout for + :type element_type: Type[Numeric] + :return: The SMEM layout atom + :rtype: core.ComposedLayout + """ + if not isinstance(element_type, NumericMeta): + raise TypeError(f"element_type must be a Numeric, but got {element_type}") + + if kind in (SmemLayoutAtomKind.MN_INTER, SmemLayoutAtomKind.K_INTER): + num_contiguous_bits = 128 + sw = core.make_swizzle(0, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW32, SmemLayoutAtomKind.K_SW32): + num_contiguous_bits = 256 + sw = core.make_swizzle(1, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW64, SmemLayoutAtomKind.K_SW64): + num_contiguous_bits = 512 + sw = core.make_swizzle(2, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW128, SmemLayoutAtomKind.K_SW128): + num_contiguous_bits = 1024 + sw = core.make_swizzle(3, 4, 3) + elif kind == SmemLayoutAtomKind.MN_SW128_32B: + num_contiguous_bits = 1024 + sw = core.make_swizzle(2, 5, 2) + else: + raise ValueError("unrecognized SMEM layout atom kind") + num_contiguous_elems = num_contiguous_bits // element_type.width + + if kind in ( + 
SmemLayoutAtomKind.MN_INTER, + SmemLayoutAtomKind.MN_SW32, + SmemLayoutAtomKind.MN_SW64, + SmemLayoutAtomKind.MN_SW128, + SmemLayoutAtomKind.MN_SW128_32B, + ): + # M/N-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (num_contiguous_elems, 8), stride=(1, num_contiguous_elems) + ), + loc=loc, + ip=ip, + ) + else: + # K-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (8, num_contiguous_elems), stride=(num_contiguous_elems, 1) + ), + loc=loc, + ip=ip, + ) + + +@overload +def tile_to_mma_shape( + atom: Layout, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None +) -> Layout: ... + + +@overload +def tile_to_mma_shape( + atom: core.ComposedLayout, + mma_tile_shape: Shape, + order: IntTuple = None, + *, + loc=None, + ip=None, +) -> core.ComposedLayout: ... + + +@dsl_user_op +def tile_to_mma_shape( + atom, mma_tile_shape: Shape, order: IntTuple = None, *, loc=None, ip=None +): + """ + Tiles a layout to an MMA shape. 
+ """ + # Default order is colexicographical + if order is None: + order = tuple(range(core.rank(mma_tile_shape) - 1)) + if core.rank(order) != core.rank(mma_tile_shape) - 1: + raise ValueError( + f"rank(order)={core.rank(order)} must be equal to " + f"rank(mma_tile_shape)-1={core.rank(mma_tile_shape)-1}" + ) + order_val = core._pack_int_tuple(order, loc=loc, ip=ip) + mma_tile_shape_val = core._pack_shape(mma_tile_shape, loc=loc, ip=ip) + + if not ( + core.is_static(atom) + and core.is_static(mma_tile_shape_val) + and core.is_static(order_val) + ): + raise ValueError("tile_to_mma_shape only supports static inputs") + + res_ty = _cute_nvgpu_ir.tile_to_mma_shape(atom, mma_tile_shape_val, order_val) + return _cute_ir.static(res_ty, loc=loc, ip=ip) + + +@dsl_user_op +def commit( + mbar_ptr: core.Pointer, + mask=None, + cta_group: CtaGroup = CtaGroup.ONE, + *, + loc=None, + ip=None, +) -> None: + """ + Perform an arrive operation on a mbarrier upon completion of previous MMA operations. + + :param mbar_ptr: A pointer to the mbarrier in SMEM + :type mbar_ptr: Pointer + :param mask: An optional multicast mask for the CTAs in the cluster to signal arrival to + :type mask: Int + """ + if cta_group == CtaGroup.ONE: + group = nvvm.Tcgen05GroupKind.CTA_1 + else: + assert cta_group == CtaGroup.TWO + group = nvvm.Tcgen05GroupKind.CTA_2 + + mbar_ptr = mbar_ptr.llvm_ptr + if mask is not None: + mask = Int16(mask).ir_value(loc=loc, ip=ip) + nvvm.tcgen05_commit_arrive( + mbar_ptr, multicast_mask=mask, group=group, loc=loc, ip=ip + ) + else: + nvvm.tcgen05_commit_arrive(mbar_ptr, group=group, loc=loc, ip=ip) + return + + +#################################################################################################### +# +# Helper functions for Copies +# +#################################################################################################### + + +def is_tmem_load(atom: core.CopyAtom) -> bool: + """ + Returns whether a CopyAtom instance is a TMEM load. 
+ """ + return isinstance( + atom.op, + ( + Ld16x64bOp, + Ld16x128bOp, + Ld16x256bOp, + Ld16x32bx2Op, + Ld32x32bOp, + ), + ) + + +def is_tmem_store(atom: core.CopyAtom) -> bool: + """ + Returns whether a CopyAtom instance is a TMEM store. + """ + return isinstance( + atom.op, + ( + St16x64bOp, + St16x128bOp, + St16x256bOp, + St16x32bx2Op, + St32x32bOp, + ), + ) + + +def get_tmem_copy_properties( + atom: core.CopyAtom, +) -> Tuple[int, int, int, Union[Pack, Unpack]]: + """ + Returns the properties of a TMEM copy atom (number of data paths, bits, repetitions, + and whether packing/unpacking is used). + """ + if isinstance(atom.op, (Ld16x64bOp, St16x64bOp)): + num_dp, num_bits = 16, 64 + elif isinstance(atom.op, (Ld16x128bOp, St16x128bOp)): + num_dp, num_bits = 16, 128 + elif isinstance(atom.op, (Ld16x256bOp, St16x256bOp)): + num_dp, num_bits = 16, 256 + elif isinstance(atom.op, (Ld16x32bx2Op, St16x32bx2Op)): + num_dp, num_bits = 16, 32 + elif isinstance(atom.op, (Ld32x32bOp, St32x32bOp)): + num_dp, num_bits = 32, 32 + else: + raise ValueError(f"expects 'atom' to be a TMEM copy, but got {atom}") + if is_tmem_load(atom): + return num_dp, num_bits, atom.op.repeat.value, atom.op.pack + else: + assert is_tmem_store(atom), "atom must be a TMEM store" + return num_dp, num_bits, atom.op.repeat.value, atom.op.unpack + + +@dsl_user_op +def find_tmem_tensor_col_offset(tmem_tensor: Tensor, *, loc=None, ip=None) -> Int: + """ + Computes the TMEM column offset given a TMEM tensor. 
+ + :param tmem_tensor: The TMEM tensor to use to compute the columns offset + :type tmem_tensor: Tensor + :return: The columns offset + :rtype: Int + """ + tmem_col_mask = 0x0000FFFF + offset = ( + core.cosize(core.recast_tensor(tmem_tensor, Int32).layout, loc=loc, ip=ip) + & tmem_col_mask + ) + if isinstance(offset, int): + return offset + return Int32(offset, loc=loc, ip=ip) + + +@dsl_user_op +def make_tmem_copy( + atom: core.CopyAtom, tmem_tensor: Tensor, *, loc=None, ip=None +) -> core.TiledCopy: + """ + Makes a Tiled Copy instance from a TMEM Copy Atom and a TMEM tensor. + """ + tiled_copy_val = _cute_nvgpu_ir.atom_make_tmem_copy( + atom._trait.value, tmem_tensor.value, loc=loc, ip=ip + ) + new_trait = type(atom._trait)(tiled_copy_val) + return core.TiledCopy(atom.op, new_trait) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py new file mode 100644 index 00000000..096a4e12 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/tcgen05/mma.py @@ -0,0 +1,603 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL, T + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape, rank, depth +from ...typing import ( + Shape, + Float8E5M2, + Float8E4M3FN, + Float16, + BFloat16, + Float32, + TFloat32, + Boolean, + Int8, + Uint8, + Int32, + Numeric, +) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +class OperandMajorMode(enum.Enum): + """ + An enumeration for the majorness of the input operands of the MMA. + """ + + MN = _cute_ir.MajorMode.mn + K = _cute_ir.MajorMode.k + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value == "MN": + return OperandMajorMode.MN + elif value == "K": + return OperandMajorMode.K + + def _to_ir(self) -> _cute_ir.MajorMode: + return self.value + + +class OperandSource(enum.Enum): + """ + An enumeration for the source memory location of the A input operand of the MMA. + """ + + TMEM = _cute_ir.MmaFragKind.tmem + SMEM = _cute_ir.MmaFragKind.smem_desc + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_ir.MmaFragKind: + return self.value + + +class CtaGroup(enum.Enum): + """ + An enumeration for the ``cta_group`` qualifier of the MMA. 
+ """ + + ONE = 1 + TWO = 2 + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + +class Field(enum.Enum): + """ + An enumeration for the fields of the MMA Atom that can be modified at runtime. + """ + + NEGATE_A = "neg_a" + NEGATE_B = "neg_b" + ACCUMULATE = "accum_c" + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir_field_name(self) -> str: + return self.value + + +# Base class for all tcgen05 MMA Ops used to factor out some internal code +@dataclass(frozen=True) +class MmaOp(MmaOp): + a_dtype: Type[Numeric] + b_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + cta_group: CtaGroup + a_src: OperandSource + a_major_mode: OperandMajorMode + b_major_mode: OperandMajorMode + + admissible_archs = ["sm_100a"] + + def __post_init__(self) -> None: + # Verify arch + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + # Verify that the user provided enum values + if not isinstance(self.cta_group, CtaGroup): + raise OpError( + self, + "expects the 'cta_group' Op parameter to be a tcgen05.CtaGroup instance", + ) + if not isinstance(self.a_src, OperandSource): + raise OpError( + self, + "expects the 'a_src' Op parameter to be a tcgen05.OperandSource instance", + ) + if not isinstance(self.a_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'a_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance", + ) + if not isinstance(self.b_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'b_major_mode' Op parameter to be a tcgen05.OperandMajorMode instance", + ) + # Verify the 
instruction shape + if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1): + raise OpError( + self, + f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, " + f"but got {self.shape_mnk}", + ) + m, n = self.shape_mnk[0], self.shape_mnk[1] + if self.cta_group == CtaGroup.ONE: + if m not in [64, 128]: + raise OpError(self, f"expects the M-mode to be 64 or 128, but got {m}") + if m == 64: + if (n < 8) or (n > 256) or (n % 8 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0, but got {n}", + ) + elif m == 128: + if (n < 16) or (n > 256) or (n % 16 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 16 == 0, but got {n}", + ) + else: + if m not in [128, 256]: + raise OpError(self, f"expects the M-mode to be 128 or 256, but got {m}") + if (n < 32) or (n > 256) or (n % 32 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 32 <= N <= 256 and N % 32 == 0, but got {n}", + ) + + def __str__(self) -> str: + return ( + self.__class__.descriptive_name # type: ignore + + f"\n A data type = {self.a_dtype}" + + f"\n B data type = {self.b_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n CTA group = {self.cta_group}" + + f"\n A source location = {self.a_src}" + + f"\n A major mode = {self.a_major_mode}" + + f"\n B major mode = {self.b_major_mode}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaTrait(Trait): + admissible_fields = [Field.ACCUMULATE, Field.NEGATE_A, Field.NEGATE_B] + + def set(self, field, value, *, loc=None, ip=None) -> None: + if field not in self.admissible_fields: + raise ValueError( + f"expects field to be one of {self.admissible_fields}, but got {field}" + ) + field_name = f"#cute_nvgpu.atom_mma_field_sm100<{field._to_ir_field_name()}>" + attr = ir.Attribute.parse(field_name) + self.value = _cute_nvgpu_ir.atom_set_value( + self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + 
) + + +# +# TF32 MMA +# + + +@dataclass(frozen=True) +class MmaTF32Op(MmaOp): + """ + TF32 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::tf32`` qualifier. + """ + + descriptive_name = "tcgen05 TF32 MMA Operation" + + def __init__( + self, + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + TFloat32, + TFloat32, + Float32, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Verify the instruction shape + instruction_k = 8 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaTF32Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaTF32Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaTF32Trait(MmaTrait): + pass + + +# +# F16/BF16 MMA +# + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::f16`` qualifier. 
+ """ + + descriptive_name = "tcgen05 F16/BF16 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + # Instruction shape verification + instruction_k = 16 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaF16BF16Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaF16BF16Trait(MmaTrait): + pass + + +# +# I8 MMA +# + + +@dataclass(frozen=True) 
+class MmaI8Op(MmaOp): + """ + I8 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation corresponds to the ``.kind::i8`` qualifier. + """ + + descriptive_name = "tcgen05 I8 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + Int32, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Int8, Uint8]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Int8 or Uint8", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Instruction shape verification + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaI8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + (T.si8() if self.a_dtype.signed else T.ui8()), + (T.si8() if self.b_dtype.signed else T.ui8()), + T.si32(), + self.a_src._to_ir(), + 0, + ) + return MmaI8Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaI8Trait(MmaTrait): + pass + + +# +# F8F6F4 MMA +# + + +@dataclass(frozen=True) +class MmaFP8Op(MmaOp): + """ + F8 tcgen05 MMA Operation. 
+ + See the `PTX documentation `__. + """ + + descriptive_name = "tcgen05 F8 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + cta_group: CtaGroup, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + cta_group, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + # Instruction shape verification + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaFP8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM100UMMAType.get( + shape_mnk.type.attribute, + self.cta_group.value, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + 0, + ) + return MmaFP8Trait( + _cute_nvgpu_ir.make_sm100_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaFP8Trait(MmaTrait): + pass + + 
+#################################################################################################### +# +# SMEM layout atoms +# +#################################################################################################### + + +class SmemLayoutAtomKind(enum.Enum): + """ + Enum class for the kinds of SMEM layout atoms for SM100. + + Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can be + used to construct an SMEM layout using blocked product for operand A or B such that the + resulting layout is legal for both TMA and UMMA. + + Note that there are other ways of creating legal layouts for operand A and B. + """ + + MN_INTER = enum.auto() + MN_SW32 = enum.auto() + MN_SW64 = enum.auto() + MN_SW128 = enum.auto() + MN_SW128_32B = enum.auto() + K_INTER = enum.auto() + K_SW32 = enum.auto() + K_SW64 = enum.auto() + K_SW128 = enum.auto() diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py new file mode 100644 index 00000000..c2b3f7cf --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/__init__.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from .copy import * +from .mma import * + + +# __all__ is required here for documentation generation +__all__ = [ + # mma.py + "MmaF16BF16Op", + # copy.py + "LdMatrix8x8x16bOp", + "LdMatrix16x16x8bOp", + "StMatrix8x8x16bOp", + "StMatrix16x8x8bOp", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py new file mode 100644 index 00000000..a6ad4ca8 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/copy.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import CopyOp, Trait, _pack_shape +from ...typing import Numeric + + +@dataclass(frozen=True) +class BaseOp(CopyOp): + transpose: bool = False + num_matrices: int = 1 + + def __post_init__(self) -> None: + if not isinstance(self.transpose, bool): + raise OpError( + self, + "expects the 'transpose' Op parameter to be a bool instance", + ) + + def __str__(self) -> str: + res = ( + f"{self.__class__.__name__[:-2]} Copy Operation" + + f"\n number of matrices = {self.num_matrices}" + ) + if self.transpose: + res += f"\n transposed" + return res + + +@dataclass(frozen=True) +class LdMatrix8x8x16bOp(BaseOp): + """ + 8x8 ``ldmatrix`` Operation. + + See the `PTX documentation `__. 
+ This operation corresponds to the ``.m8n8`` qualifier. + """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.num_matrices not in [1, 2, 4]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "LdMatrix8x8x16bTrait": + mode = _pack_shape((8, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomLdsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + _cute_nvgpu_ir.LdsmSzPattern.u16, + self.num_matrices, + ir.UnitAttr.get() if self.transpose else None, + ) + return LdMatrix8x8x16bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class LdMatrix8x8x16bTrait(Trait): + pass + + +@dataclass(frozen=True) +class LdMatrix16x16x8bOp(BaseOp): + """ + 16x16 8-bit ``ldmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``.m16n16`` and the ``.b16`` qualifiers. + """ + + def __init__(self, num_matrices: int) -> None: + super().__init__(transpose=True, num_matrices=num_matrices) + self._verify() + + def _verify(self): + assert self.transpose, "transpose must be True" + if self.num_matrices not in [1, 2]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "LdMatrix16x16x8bTrait": + mode = _pack_shape((16, 16), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomLdsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + _cute_nvgpu_ir.LdsmSzPattern.u8, + self.num_matrices, + ir.UnitAttr.get(), + ) + return LdMatrix16x16x8bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class LdMatrix16x16x8bTrait(Trait): + pass + + +@dataclass(frozen=True) +class StMatrix8x8x16bOp(BaseOp): + """ + 8x8 ``stmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``m8n8`` qualifier. 
+ """ + + def __post_init__(self) -> None: + super().__post_init__() + if self.num_matrices not in [1, 2, 4]: + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "StMatrix8x8x16bTrait": + mode = _pack_shape((8, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomStsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + self.num_matrices, + ir.UnitAttr.get() if self.transpose else None, + ) + return StMatrix8x8x16bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class StMatrix8x8x16bTrait(Trait): + pass + + +@dataclass(frozen=True) +class StMatrix16x8x8bOp(BaseOp): + """ + 16x8 ``stmatrix`` Operation. + + See the `PTX documentation `__. + This operation corresponds to the ``m16n8`` qualifier. + """ + + def __init__(self, num_matrices: int) -> None: + super().__init__(transpose=True, num_matrices=num_matrices) + self._verify() + + def _verify(self): + if self.num_matrices not in [1, 2, 4]: + assert self.transpose, "transpose must be True" + raise OpError( + self, + "expects the 'num_matrices' Op parameter to be one of [1,2,4]", + ) + + def _make_trait( + self, copy_internal_type: Type[Numeric], *, loc=None, ip=None, **kwargs + ) -> "StMatrix16x8x8bTrait": + mode = _pack_shape((16, 8), loc=loc, ip=ip) + ty = _cute_nvgpu_ir.CopyAtomStsmType.get( + copy_internal_type.mlir_type, + mode.type.attribute, + self.num_matrices, + ir.UnitAttr.get(), + ) + return StMatrix16x8x8bTrait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + +class StMatrix16x8x8bTrait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py new file mode 100644 index 00000000..d7fe3b3b --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warp/mma.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from dataclasses import dataclass +from typing import Type + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape +from ...typing import Shape, Float16, BFloat16, Float32, Numeric + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 tcgen05 MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands. + """ + + ab_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + + def __post_init__(self) -> None: + if self.ab_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + if (self.ab_dtype == BFloat16) and (self.acc_dtype != Float32): + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16", + ) + if self.shape_mnk not in [(16, 8, 8), (16, 8, 16)]: + raise OpError( + self, + "expects the 'shape_mnk' Op parameter to be one of (16,8,8) or (16,8,16)", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM80Type.get( + shape_mnk.type.attribute, + self.ab_dtype.mlir_type, + 
self.ab_dtype.mlir_type, + self.acc_dtype.mlir_type, + ) + return MmaF16BF16Trait(_cute_ir.atom(ty, loc=loc, ip=ip)) + + def __str__(self) -> str: + return ( + "warp-level F16/BF16 MMA Operation" + + f"\n A/B data type = {self.ab_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaF16BF16Trait(Trait): + pass diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py new file mode 100644 index 00000000..49a40165 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from .mma import * +from .helpers import * + +# __all__ is required here for documentation generation +__all__ = [ + # mma.py + "OperandMajorMode", + "OperandSource", + "Field", + "MmaF16BF16Op", + "MmaF8Op", + "SmemLayoutAtomKind", + # helpers.py + "make_smem_layout_atom", + "fence", + "commit_group", + "wait_group", +] diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py new file mode 100644 index 00000000..f6284134 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/helpers.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Type + +from cutlass.cutlass_dsl import dsl_user_op + +from cutlass._mlir.dialects import nvvm + +from ...typing import Numeric, NumericMeta +from ... import core +from .mma import SmemLayoutAtomKind + + +@dsl_user_op +def make_smem_layout_atom( + kind: SmemLayoutAtomKind, element_type: Type[Numeric], *, loc=None, ip=None +) -> core.ComposedLayout: + """ + Makes a SMEM layout Atom. + + This function creates a composed layout in unit of elements consistent with the requested layout + Atom kind and element data type. + + :param kind: The kind of layout Atom + :type kind: SmemLayoutAtomKind + :param element_type: The element data type to construct the layout for + :type element_type: Type[Numeric] + :return: The SMEM layout atom + :rtype: core.ComposedLayout + """ + if not isinstance(element_type, NumericMeta): + raise TypeError(f"element_type must be a Numeric, but got {element_type}") + + if kind in (SmemLayoutAtomKind.MN_INTER, SmemLayoutAtomKind.K_INTER): + num_contiguous_bits = 128 + sw = core.make_swizzle(0, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW32, SmemLayoutAtomKind.K_SW32): + num_contiguous_bits = 256 + sw = core.make_swizzle(1, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW64, SmemLayoutAtomKind.K_SW64): + num_contiguous_bits = 512 + sw = core.make_swizzle(2, 4, 3) + elif kind in (SmemLayoutAtomKind.MN_SW128, SmemLayoutAtomKind.K_SW128): + num_contiguous_bits = 1024 + sw = core.make_swizzle(3, 4, 3) + else: + raise ValueError("unrecognized SMEM layout atom kind") + num_contiguous_elems = num_contiguous_bits 
// element_type.width + + if kind in ( + SmemLayoutAtomKind.MN_INTER, + SmemLayoutAtomKind.MN_SW32, + SmemLayoutAtomKind.MN_SW64, + SmemLayoutAtomKind.MN_SW128, + ): + # M/N-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (num_contiguous_elems, 8), stride=(1, num_contiguous_elems) + ), + loc=loc, + ip=ip, + ) + else: + # K-major layout + return core.make_composed_layout( + sw, + 0, + core.make_layout( + (8, num_contiguous_elems), stride=(num_contiguous_elems, 1) + ), + loc=loc, + ip=ip, + ) + + +@dsl_user_op +def fence(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_fence_aligned(loc=loc, ip=ip) + + +@dsl_user_op +def commit_group(*, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_commit_group_sync_aligned(loc=loc, ip=ip) + + +@dsl_user_op +def wait_group(group, *, loc=None, ip=None) -> None: + """ + See the `PTX documentation `__. + """ + nvvm.wgmma_wait_group_sync_aligned(group, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py new file mode 100644 index 00000000..b3749574 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/nvgpu/warpgroup/mma.py @@ -0,0 +1,380 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import enum +from dataclasses import dataclass +from typing import Type + +from cutlass.cutlass_dsl import CuTeDSL + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir +from cutlass._mlir import ir + +from ..common import OpError +from ...core import MmaOp, Trait, _pack_shape, rank, depth +from ...typing import ( + Shape, + Float16, + BFloat16, + Float32, + Boolean, + Float8E5M2, + Float8E4M3FN, + Numeric, +) + + +#################################################################################################### +# +# MMA Ops and Traits +# +#################################################################################################### + + +class OperandMajorMode(enum.Enum): + """ + An enumeration for the majorness of the input operands of the MMA. + """ + + MN = _cute_ir.MajorMode.mn + K = _cute_ir.MajorMode.k + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.upper() + if value == "MN": + return OperandMajorMode.MN + elif value == "K": + return OperandMajorMode.K + + def _to_ir(self) -> _cute_ir.MajorMode: + return self.value + + +class OperandSource(enum.Enum): + """ + An enumeration for the source memory location of the A input operand of the MMA. + """ + + RMEM = _cute_ir.MmaFragKind.rmem + SMEM = _cute_ir.MmaFragKind.smem_desc + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir(self) -> _cute_ir.MmaFragKind: + return self.value + + +class Field(enum.Enum): + """ + An enumeration for the fields of the MMA Atom that can be modified at runtime. 
+ """ + + ACCUMULATE = "accum_c" + + def __str__(self) -> str: + return f"{self.__class__.__name__}.{self.name}" + + def __repr__(self) -> str: + return f"<{self.__class__.__name__}.{self.name}>" + + def _to_ir_field_name(self) -> str: + return self.value + + +@dataclass(frozen=True) +class MmaOp(MmaOp): + a_dtype: Type[Numeric] + b_dtype: Type[Numeric] + acc_dtype: Type[Numeric] + shape_mnk: Shape + a_src: OperandSource + a_major_mode: OperandMajorMode + b_major_mode: OperandMajorMode + + admissible_archs = ["sm_90a"] + + def __post_init__(self) -> None: + # Verify arch + arch = CuTeDSL._get_dsl().envar.arch + if arch not in self.admissible_archs: + raise OpError( + self, + f"expects arch to be one of {self.admissible_archs}, but got {arch}", + suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture", + ) + # Verify that the user provided enum values + if not isinstance(self.a_src, OperandSource): + raise OpError( + self, + "expects the 'a_src' Op parameter to be a warpgroup.OperandSource instance", + ) + if not isinstance(self.a_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'a_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance", + ) + if not isinstance(self.b_major_mode, OperandMajorMode): + raise OpError( + self, + "expects the 'b_major_mode' Op parameter to be a warpgroup.OperandMajorMode instance", + ) + # Verify instruction shape + if (rank(self.shape_mnk) not in [2, 3]) or (depth(self.shape_mnk) != 1): + raise OpError( + self, + f"expected a flat rank 2 or 3 tuple for the 'shape_mnk' Op parameter, " + f"but got {self.shape_mnk}", + ) + m, n = self.shape_mnk[0], self.shape_mnk[1] + if m != 64: + raise OpError(self, f"expects the M-mode to be 64, but got {m}") + if (n < 8) or (n > 256) or (n % 8 != 0): + raise OpError( + self, + f"expects the N-mode to satisfy 8 <= N <= 256 and N % 8 == 0. 
but got {n}", + ) + + def __str__(self) -> str: + return ( + self.__class__.descriptive_name # type: ignore + + f"\n A data type = {self.a_dtype}" + + f"\n B data type = {self.b_dtype}" + + f"\n Accumulator data type = {self.acc_dtype}" + + f"\n A source location = {self.a_src}" + + f"\n A major mode = {self.a_major_mode}" + + f"\n B major mode = {self.b_major_mode}" + + f"\n Instruction shape MNK = {self.shape_mnk}" + ) + + +class MmaTrait(Trait): + admissible_fields = [Field.ACCUMULATE] + + def set(self, field, value, *, loc=None, ip=None) -> None: + if field not in self.admissible_fields: + raise ValueError( + f"invalid field, must be {Field.ACCUMULATE}, but got {field}" + ) + field_name = f"#cute_nvgpu.atom_mma_field_sm90<{field._to_ir_field_name()}>" + attr = ir.Attribute.parse(field_name) + self.value = _cute_nvgpu_ir.atom_set_value( + self.value, attr, Boolean(value).ir_value(loc=loc, ip=ip), loc=loc, ip=ip + ) + + +@dataclass(frozen=True) +class MmaF16BF16Op(MmaOp): + """ + F16/BF16 warpgroup MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.f16`` or ``.bf16`` qualifiers for the input operands. 
+ """ + + descriptive_name = "warpgroup F16/BF16 MMA Operation" + + def __init__( + self, + ab_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + ab_dtype, + ab_dtype, + acc_dtype, + instruction_shape, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self) -> None: + # Input data type verification + if self.a_dtype not in [Float16, BFloat16]: + raise OpError( + self, + "expects the 'ab_dtype' Op parameter to be one of Float16 or BFloat16", + ) + assert self.b_dtype == self.a_dtype, "a_dtype and b_dtype must be the same" + # Accumulator data type verification + if self.acc_dtype not in [Float16, Float32]: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be one of Float16 or Float32", + ) + if (self.a_dtype == BFloat16) and (self.acc_dtype != Float32): + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32 when 'ab_dtype' is BFloat16", + ) + # Verify the instruction shape + instruction_k = 16 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF16BF16Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM90Type.get( + shape_mnk.type.attribute, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + ) + return MmaF16BF16Trait( + _cute_nvgpu_ir.make_sm90_mma( + ty, + Boolean(False).ir_value(loc=loc, ip=ip), + loc=loc, + ip=ip, + ) + ) + + +class MmaF16BF16Trait(MmaTrait): + pass + + +@dataclass(frozen=True) 
+class MmaF8Op(MmaOp): + """ + F8 warpgroup MMA Operation. + + See the `PTX documentation `__. + This Operation covers the instructions using the ``.e4m3`` or ``.e5m2`` qualifiers for the input operands. + """ + + descriptive_name = "warpgroup F8 MMA Operation" + + def __init__( + self, + a_dtype: Type[Numeric], + b_dtype: Type[Numeric], + acc_dtype: Type[Numeric], + instruction_shape: Shape, + a_src: OperandSource, + a_major_mode: OperandMajorMode, + b_major_mode: OperandMajorMode, + ) -> None: + super().__init__( + a_dtype, + b_dtype, + acc_dtype, + instruction_shape, + a_src, + a_major_mode, + b_major_mode, + ) + self._verify() + + def _verify(self): + # Input data type verification + if self.a_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'a_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + if self.b_dtype not in [Float8E5M2, Float8E4M3FN]: + raise OpError( + self, + "expects the 'b_dtype' Op parameter to be one of Float8E5M2 or Float8E4M3FN", + ) + # Accumulator data type verification + if self.acc_dtype != Float32: + raise OpError( + self, + "expects the 'acc_dtype' Op parameter to be Float32", + ) + # Verify the instruction shape + instruction_k = 32 + if rank(self.shape_mnk) == 2: + object.__setattr__(self, "shape_mnk", (*self.shape_mnk, instruction_k)) + if self.shape_mnk[2] != instruction_k: + raise OpError( + self, + f"expects the instruction extent in the K-mode to be {instruction_k}, " + f"but got {self.shape_mnk[2]}", + ) + + def _make_trait(self, *, loc=None, ip=None, **kwargs) -> "MmaF8Trait": + shape_mnk = _pack_shape(self.shape_mnk, loc=loc, ip=ip) + ty = _cute_nvgpu_ir.MmaAtomSM90Type.get( + shape_mnk.type.attribute, + self.a_major_mode._to_ir(), + self.b_major_mode._to_ir(), + self.a_dtype.mlir_type, + self.b_dtype.mlir_type, + self.acc_dtype.mlir_type, + self.a_src._to_ir(), + ) + return MmaF8Trait( + _cute_nvgpu_ir.make_sm90_mma( + ty, Boolean(False).ir_value(loc=loc, ip=ip), loc=loc, 
ip=ip + ) + ) + + +class MmaF8Trait(MmaTrait): + pass + + +#################################################################################################### +# +# SMEM layout atoms +# +#################################################################################################### + + +class SmemLayoutAtomKind(enum.Enum): + """ + Enum class for the kinds of SMEM layout atoms for SM90. + + Given a swizzle kind, an SMEM layout atom is the compact layout of smallest size that can + be used to construct an SMEM layout using blocked product for operand A or B such that the + resulting layout is legal for both TMA and UMMA. + + Note that there are other ways of creating legal layouts for operand A and B. + """ + + MN_INTER = enum.auto() + MN_SW32 = enum.auto() + MN_SW64 = enum.auto() + MN_SW128 = enum.auto() + K_INTER = enum.auto() + K_SW32 = enum.auto() + K_SW64 = enum.auto() + K_SW128 = enum.auto() diff --git a/python/CuTeDSL/cutlass/cute/runtime.py b/python/CuTeDSL/cutlass/cute/runtime.py new file mode 100644 index 00000000..47e67b88 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/runtime.py @@ -0,0 +1,515 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import ctypes +from functools import lru_cache +import itertools +import operator +from time import time +from typing import Union + +# MLIR modules imports +from cutlass._mlir import ir +import cutlass._mlir.dialects.cute as _cute_ir + +from cutlass.cutlass_dsl import TensorFormat, JitArgAdapterRegistry + +# Local modules imports +from .typing import ( + AddressSpace, + Tensor, + Type, + Pointer, + Boolean, + Numeric, + Float4E2M1FN, + Int64, + Int32, + Int16, + Int8, + Uint64, + Uint32, + Uint16, + Uint8, + Float64, + Float32, + Float16, + BFloat16, + Float8E5M2, +) +from .core import find, _Tensor as CoreTensor + + +class _Pointer(Pointer): + """Runtime representation of a pointer that can inter-operate with various data structures, + including numpy arrays and device memory. + + :param pointer: The pointer to the data + :type pointer: int or pointer-like object + :param dtype: Data type of the elements pointed to + :type dtype: Type + :param mem_space: Memory space where the pointer resides, defaults to generic + :type mem_space: _cute_ir.AddressSpace, optional + :param assumed_align: Assumed alignment of input pointer in bytes, defaults to None + :type assumed_align: int, optional + + :ivar _pointer: The underlying pointer + :ivar _dtype: Data type of the elements + :ivar _addr_space: Memory space of the pointer + :ivar _assumed_align: Alignment of the pointer in bytes + :ivar _desc: C-type descriptor for the pointer + :ivar _c_pointer: C-compatible pointer representation + """ + + def __init__( + self, + pointer, + dtype, + mem_space: _cute_ir.AddressSpace = _cute_ir.AddressSpace.generic, + assumed_align=None, + ): + self._pointer = pointer + self._dtype = dtype + self._addr_space = mem_space + + is_in_device = mem_space == _cute_ir.AddressSpace.gmem + if assumed_align is None: + if is_in_device: + self._assumed_align = 32 + else: + self._assumed_align = dtype.width // 8 + else: + self._assumed_align = assumed_align + + class 
PtrDescriptor(ctypes.Structure): + """A ctype descriptor for CuTe memref ptr""" + + _fields_ = [("ptr", ctypes.c_void_p)] + + def __str__(self): + return f"0x{self.ptr:016x}" + + self._desc = PtrDescriptor(int(self._pointer)) + self._c_pointer = ctypes.cast(ctypes.pointer(self._desc), ctypes.c_void_p) + assert ( + self._desc.ptr % self._assumed_align == 0 + ), f"pointer must be {self._assumed_align} bytes aligned" + + def size_in_bytes(self) -> int: + return ctypes.sizeof(self._desc) + + def __get_mlir_types__(self): + return [self.mlir_type] + + def __c_pointers__(self): + return [self._c_pointer] + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + return values[0] + + # Move mlir Type out of __init__ to decouple with mlir Context + @property + def mlir_type(self) -> ir.Type: + return _cute_ir.PtrType.get( + self._dtype.mlir_type, self._addr_space, self._assumed_align + ) + + @property + def element_type(self) -> Type[Numeric]: + return self._dtype + + @property + def memspace(self): + return self._addr_space + + def verify(self, expected_py_type): + if expected_py_type is Pointer: + return True + elif isinstance(expected_py_type, ir.Value) and expected_py_type.ty is Pointer: + return True + + return False + + def __str__(self) -> str: + return f"Ptr<0x{self._desc.ptr:016x}@{self._addr_space}>" + + def __repr__(self): + return self.__str__() + + +class _Tensor(Tensor): + def __init__( + self, + tensor, + assumed_align=None, + ): + # If tensor is already a DLPack object, use it directly + if hasattr(tensor, "__dlpack_device__") and not hasattr(tensor, "__dlpack__"): + self._dlpack_data = tensor + else: + self._dlpack_data = tensor.__dlpack__() + self._dltensor_wrapper = None + self._assumed_align = assumed_align + self._is_dynamic = False + self._memref_desc = None + self._dtype = None + + @property + def __class__(self) -> Type[Tensor]: + # Cheat to let `type(_Tensor())` to return cute.Tensor + return Tensor + + @staticmethod + def 
lazily_load_dltensor(func): + """Decorator to lazily load the DLTensorWrapper. + + This decorator loads the DLTensorWrapper when needed, + avoiding overhead in the critical path of calling JIT functions. + """ + + def wrapper(self, *args, **kwargs): + if self._dltensor_wrapper is None: + self._dltensor_wrapper = _cute_ir.DLTensorWrapper(self._dlpack_data) + return func(self, *args, **kwargs) + + return wrapper + + @lazily_load_dltensor + def mark_layout_dynamic(self, leading_dim: int | None = None): + """Marks the tensor layout as dynamic based on the leading dimension. + + :param leading_dim: The leading dimension of the layout, defaults to None + :type leading_dim: int, optional + + When ``leading_dim`` is None, automatically deduces the leading dimension from the tensor layout. + The layout can be deduced only when exactly one dimension has a stride of 1. Raises an error + if the layout cannot be automatically deduced. + + When ``leading_dim`` is explicitly specified, marks the layout as dynamic while setting the + stride at ``leading_dim`` to 1. Also validates that the specified ``leading_dim`` is consistent + with the existing layout by checking that the corresponding stride of that dimension is 1. + + Limitation: only support flat layout for now. Will work on supporting nested layout in the future. + + :return: The tensor with dynamic layout + :rtype: _Tensor + """ + self._dltensor_wrapper.mark_layout_dynamic(leading_dim) + return self + + @lazily_load_dltensor + def mark_compact_shape_dynamic( + self, + mode: int, + stride_order: tuple[int, ...] | None = None, + divisibility: int = 1, + ): + """Marks the tensor shape as dynamic and propagates dynamic and divisibility information to the corresponding strides. + + :param mode: The mode of the compact shape, defaults to 0 + :type mode: int + :param stride_order: Consistent with `torch.Tensor.dim_order`. Defaults to None. 
+ Indicates the order of the modes (dimensions) if the current layout were converted to row-major order. + It starts from the outermost to the innermost dimension. + :type stride_order: tuple[int, ...], optional + :param divisibility: The divisibility constraint for the compact shape, defaults to 1 + :type divisibility: int, optional + :return: The tensor with dynamic compact shape + :rtype: _Tensor + + If ``stride_order`` is not provided, the stride ordering will be automatically deduced from the layout. + Automatic deduction is only possible when exactly one dimension has a stride of 1 (compact layout). + An error is raised if automatic deduction fails. + + If ``stride_order`` is explicitly specified, it does the consistency check with the layout. + + For example: + - Layout: (4,2):(1,4) has stride_order: (1,0) indicates the innermost dimension is 0(`4:1`), the outermost dimension is 1(`2:4`) + - Layout: (5,3,2,4):(3,1,15,30) has stride_order: (3,2,0,1) indicates the innermost dimension is 1(`3:1`), the outermost dimension is 3(`4:30`). + + Using `torch.Tensor.dim_order()` to get the stride order of the torch tensor. + .. code-block:: python + a = torch.empty(3, 4) + t = cute.runtime.from_dlpack(a) + t = t.mark_compact_shape_dynamic(mode=0, stride_order=a.dim_order()) + """ + self._dltensor_wrapper.mark_compact_shape_dynamic( + mode, stride_order, divisibility + ) + return self + + @property + @lazily_load_dltensor + def element_type(self) -> Type[Numeric]: + if self._dtype is None: + self._dtype = self._dltensor_wrapper.dtype + return self._dtype + + @element_type.setter + def element_type(self, new_type): + """Set the element type of the tensor. + + :warning: This API is added for narrow precision before we have a clean `recast_tensor` story. + + :note: It is only used for the case that frameworks don't natively support narrow precision but we get tensor + from frameworks with storage type like uint8. + + **Example**: + + .. 
code-block:: python + + # Create a tensor from a numpy array + import numpy as np + from cutlass.cute import from_dlpack + + # Create a tensor with Float32 elements + a = np.zeros(shape, dtype=np.uint8) + tensor = from_dlpack(a) + + # Change the element type to Float4E2M1FN even storage type is uint8 + tensor.element_type = cutlass.Float4E2M1FN + + src = from_dlpack(... data tensor ...) + # convert and initialize narrow precision tensor + cute.testing.convert(src, tensor) + """ + self._dtype = new_type + + @property + @lazily_load_dltensor + def memspace(self): + return self._dltensor_wrapper.address_space + + @property + @lazily_load_dltensor + def size_in_bytes(self) -> int: + return self._dltensor_wrapper.size_in_bytes() + + @property + @lazily_load_dltensor + def mlir_type(self) -> ir.Type: + return self._dltensor_wrapper.get_type( + self.element_type.mlir_type, self._assumed_align + ) + + @lazily_load_dltensor + def __str__(self) -> str: + return f"Tensor<0x{self._dltensor_wrapper.str}>" + + def __repr__(self): + return self.__str__() + + def __setitem__(self, crd, value): + raise TypeError(f"runtime._Tensor is not indexable") + + def __getitem__(self, crd): + raise TypeError(f"runtime._Tensor is not indexable") + + @property + @lazily_load_dltensor + def iterator(self): + return _Pointer( + self._dltensor_wrapper.data_ptr, + self.element_type, + self.memspace, + self._assumed_align, + ) + + @property + def layout(self): + raise NotImplementedError( + f"layout property is not supported in runtime, support in future" + ) + + @property + @lazily_load_dltensor + def shape(self): + return self._dltensor_wrapper.shape + + @property + @lazily_load_dltensor + def stride(self): + strides = self._dltensor_wrapper.stride + if strides is None: + strides = itertools.accumulate( + reversed(self.shape), func=operator.mul, initial=1 + ) + strides = tuple(reversed(list(strides)[:-1])) + + return strides + + @property + @lru_cache(maxsize=128, typed=True) + def 
leading_dim(self): + """Get the leading dimension of this Tensor. + + :return: The leading dimension index or indices + :rtype: int or tuple or None + + The return value depends on the tensor's stride pattern: + + * If a single leading dimension is found, returns an integer index + * If nested leading dimensions are found, returns a tuple of indices + * If no leading dimension is found, returns None + """ + return find(1, self.stride, exclude_when=(1, self.shape)) + + def fill(self, value: Numeric): + raise TypeError(f"fill function is not supported in runtime") + + @property + @lazily_load_dltensor + def data_ptr(self): + return self._dltensor_wrapper.data_ptr + + @lazily_load_dltensor + def __c_pointers__(self): + self._memref_desc = self._dltensor_wrapper.build_memref_desc( + self._assumed_align + ) + return [_cute_ir.pycapsule_get_pointer(self._memref_desc)] + + def __get_mlir_types__(self): + return [self.mlir_type] + + def __new_from_mlir_values__(self, values): + assert len(values) == 1 + assert isinstance(values[0], CoreTensor) + return CoreTensor(values[0].value, self._dtype) + + +def from_dlpack( + tensor_dlpack, + assumed_align=None, +) -> Tensor: + """Convert from tensor object supporting __dlpack__() to a CuTe Tensor. + + :param tensor_dlpack: Tensor object that supports the DLPack protocol + :type tensor_dlpack: object + :param assumed_align: Assumed alignment of the tensor (bytes), defaults to None, + if None, will use the element size bytes as the assumed alignment. + :type assumed_align: int, optional + :return: A CuTe Tensor object + :rtype: Tensor + + Examples: + .. 
code-block:: python + + import torch + from cutlass.cute.runtime import from_dlpack + x = torch.randn(100, 100) + y = from_dlpack(x) + y.shape + # (100, 100) + type(y) + # + """ + return _Tensor( + tensor_dlpack, + assumed_align=assumed_align, + ) + + +def make_ptr( + dtype: Type[Numeric], + value: Union[int, ctypes._Pointer], + mem_space: AddressSpace = AddressSpace.generic, + assumed_align=None, +) -> Pointer: + """Create a pointer from a memory address + + :param dtype: Data type of the pointer elements + :type dtype: Type[Numeric] + :param value: Memory address as integer or ctypes pointer + :type value: Union[int, ctypes._Pointer] + :param mem_space: Memory address space, defaults to AddressSpace.generic + :type mem_space: AddressSpace, optional + :param align_bytes: Alignment in bytes, defaults to None + :type align_bytes: int, optional + :return: A pointer object + :rtype: Pointer + + .. code-block:: python + + import numpy as np + import ctypes + + from cutlass import Float32 + from cutlass.cute.runtime import make_ptr + + # Create a numpy array + a = np.random.randn(16, 32).astype(np.float32) + + # Get pointer address as integer + ptr_address = a.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) + + # Create pointer from address + y = make_ptr(cutlass.Float32, ptr_address) + + # Check properties + print(y.element_type) + print(type(y)) # + """ + # check if value is int or ctypes.POINTER + if isinstance(value, int): + address_value = value + elif isinstance(value, ctypes._Pointer): + # get address value + address_value = ctypes.cast(value, ctypes.c_void_p).value + assert address_value is not None, "Pointer address is None" + else: + raise TypeError( + f"Expect int or ctypes.POINTER for value but got {type(value)=}" + ) + + return _Pointer(address_value, dtype, mem_space, assumed_align=assumed_align) + + +class TensorAdapter: + """ + Convert a DLPack protocol supported tensor/array to a cute tensor. 
+ """ + + # Need reference these capsules to avoid being garbage collected + tensor_capsules = [] + + def __init__(self, arg): + self._arg = from_dlpack(arg).mark_layout_dynamic() + self.tensor_capsules.append(self._arg) + + def __new_from_mlir_values__(self, values): + return self._arg.__new_from_mlir_values__(values) + + def __c_pointers__(self): + return self._arg.__c_pointers__() + + def __get_mlir_types__(self): + return self._arg.__get_mlir_types__() + + +# ------------------------------------------------------------------------- +# Try to register_jit_arg_adapter for TensorAdapter +# ------------------------------------------------------------------------- + +try: # Register for numpy.ndarray + import numpy + + JitArgAdapterRegistry.register_jit_arg_adapter(numpy.ndarray)(TensorAdapter) +except ImportError: + pass # silent attempt, suppress error + +try: # Register for torch.Tensor + import torch + + JitArgAdapterRegistry.register_jit_arg_adapter(torch.Tensor)(TensorAdapter) +except ImportError: + pass # silent attempt, suppress error diff --git a/python/CuTeDSL/cutlass/cute/testing.py b/python/CuTeDSL/cutlass/cute/testing.py new file mode 100644 index 00000000..90fb1fb2 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/testing.py @@ -0,0 +1,285 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +import random +import numpy as np +import functools +import hashlib + +from cutlass.cutlass_dsl import ( + const, + T, + CuTeDSL, + BaseDSL, + t, + Constexpr, + detect_gpu_arch, +) + +import cutlass._mlir.dialects.cute as _cute_ir +import cutlass._mlir.ir as ir +from cutlass._mlir.dialects import nvvm, cf, vector, builtin + +from cutlass.cute import core +from cutlass.cute import nvgpu +from typing import Type +from inspect import isclass + + +def assert_(cond, msg=None): + if isinstance(cond, ir.Value): + if ir.VectorType.isinstance(cond.type): + assert ( + cond.type.element_type == T.bool() + ), f"only expects vector type with boolean elements, but got {cond.type}" + cond_val = vector.multi_reduction( + vector.CombiningKind.AND, cond, const(True), range(cond.type.rank) + ) + else: + cond_val = cond + else: + cond_val = const(cond, t.Boolean) + + cf.assert_(cond_val, msg if msg else "") + + +def _maybe_recast_tensor_from_f4(src: core.Tensor, tv_layout: core.Layout): + if src.element_type.width == 4: + tv_layout = core.recast_layout(8, 4, tv_layout) + src = core.recast_tensor(src, dtype=t.Int8) + return src, tv_layout + + +def _maybe_recast_to_f4(input: core.TensorSSA, dtype: Type[core.Numeric]): + """Conditionally recasts the tensor to 4-bit type if the destination type is 4-bit. + + :param input: The input tensor to recast. + :param dtype: The target numeric type to potentially recast to. + :raises TypeError: If dtype is not a subclass of Numeric. + :return: A new tensor recast to 4-bit if dtype is 4-bit, otherwise returns self unchanged. 
+ """ + if not isclass(dtype) or not issubclass(dtype, core.Numeric): + raise TypeError(f"dst_ty must be a type of Numeric, but got {dtype}") + + if dtype.width == 4: + recast_shape = core.recast_layout(4, 8, core.make_layout(input.shape)).shape + i4_vec = vector.bitcast( + T.vector(input.type.shape[0] * 2, T.i(4)), input.maybe_downcast() + ) + res_vect = builtin.unrealized_conversion_cast( + [T.vector(i4_vec.type.shape[0], dtype.mlir_type)], [i4_vec] + ) + return core.TensorSSA(res_vect, recast_shape, dtype) + return input + + +def _maybe_recast_from_f4(input: core.TensorSSA, src_dtype: Type[core.Numeric]): + """Conditionally recasts the tensor from 4-bit type if the source type is 4-bit. + + :param input: The input tensor to recast. + :param src_dtype: The source numeric type to potentially recast from. + :raises TypeError: If src_dtype is not a subclass of Numeric. + :return: A new tensor recast from 4-bit if src_dtype is 4-bit, otherwise returns self unchanged. + """ + if not isclass(src_dtype) or not issubclass(src_dtype, core.Numeric): + raise TypeError(f"src_ty must be a type of Numeric, but got {src_dtype}") + + if src_dtype.width == 4: + recast_shape = core.recast_layout(8, 4, core.make_layout(input.shape)).shape + i4_vec = builtin.unrealized_conversion_cast( + [T.vector(input.type.shape[0], T.i(4))], [input.maybe_downcast()] + ) + res_vect = vector.bitcast(T.vector(i4_vec.type.shape[0] // 2, T.i8()), i4_vec) + return core.TensorSSA(res_vect, recast_shape, core.Int8) + return input + + +@CuTeDSL.kernel +def _convert_kernel( + gSrc: core.Tensor, + gDst: core.Tensor, + cSrc: core.Tensor, + src_tv_layout: core.Layout, + dst_tv_layout: core.Layout, + src_shape: core.Shape, + src_ty, + dst_ty, +): + tidx = nvvm.read_ptx_sreg_tid_x(T.i32()) + bidx = nvvm.read_ptx_sreg_ctaid_x(T.i32()) + + cta_coord = (None, bidx) + # logical idx -> address + ctaSrc = gSrc[cta_coord] # (...,TileV,...) + ctaDst = gDst[cta_coord] # (...,TileV,...) 
+ ctaCSrc = cSrc[cta_coord] # (...,TileV,...) + # print(f"ctaSrc = {ctaSrc.type}") + + # compose with CTA TV layout + # tid, vid -> address + tidfrgSrc = core.composition(ctaSrc, src_tv_layout) # (T,V) + tidfrgDst = core.composition(ctaDst, dst_tv_layout) # (T,V) + tidfrgCSrc = core.composition(ctaCSrc, src_tv_layout) # (T,V) + # print(f"tidfrgSrc = {tidfrgSrc.type}") + + # slice for threads + thr_coord = (tidx, None) + thrSrc = tidfrgSrc[thr_coord] # (V) + thrDst = tidfrgDst[thr_coord] # (V) + thrCSrc = tidfrgCSrc[thr_coord] # (V) + # print(f"thrSrc = {thrSrc.type}") + + # predicate + if core.elem_less(thrCSrc[0], src_shape): + # allocate fragments for gmem->rmem + frgSrc = core.make_fragment( + core.get(src_tv_layout, mode=[1]), gSrc.element_type + ) # (V) + frgDst = core.make_fragment( + core.get(dst_tv_layout, mode=[1]), gDst.element_type + ) # (V) + # print(f"frgSrc = {frgSrc.type}") + + # Move data to reg address space + copy_atom_load = core.make_copy_atom(nvgpu.CopyUniversalOp(), gSrc.element_type) + core.copy(copy_atom_load, thrSrc, frgSrc) + + vec_src = frgSrc.load() + vec_src = _maybe_recast_to_f4(vec_src, src_ty) + vec_dst = vec_src.to(dst_ty) + vec_dst = _maybe_recast_from_f4(vec_dst, dst_ty) + frgDst.store(vec_dst) + + # Copy the results back to c + copy_atom_stg = core.make_copy_atom(nvgpu.CopyUniversalOp(), gDst.element_type) + core.copy(copy_atom_stg, frgDst, thrDst) + + +@CuTeDSL.jit(preprocess=False) +def _convert( + src: core.Tensor, + dst: core.Tensor, + leading_mode: Constexpr, + elem_per_copy: Constexpr, +): + + # Step 1. figure proper tv_layout + src_ty = src.element_type + dst_ty = dst.element_type + + tv_layout = core.make_layout((128, elem_per_copy), stride=(elem_per_copy, 1)) + + # Step 2. 
maybe recast from f4 tensor + src, src_tv_layout = _maybe_recast_tensor_from_f4(src, tv_layout) + dst, dst_tv_layout = _maybe_recast_tensor_from_f4(dst, tv_layout) + src_shape = src.shape + # predicate tensor + idA = core.make_identity_tensor(src.shape) + + # Step 3. select a proper tiling pattern as (...,TileV, ...) + src_cta_tiler = [ + 1, + ] * core.rank(src.layout) + src_cta_tiler[leading_mode] = core.size(src_tv_layout) # (...,TileV,...) + dst_cta_tiler = [ + 1, + ] * core.rank(dst.layout) + dst_cta_tiler[leading_mode] = core.size(dst_tv_layout) # (...,TileV,...) + + # Step 4. partition input and output tensor by cta tiler. + gS = core.zipped_divide( + src, tuple(src_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + cS = core.zipped_divide( + idA, tuple(src_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + gD = core.zipped_divide( + dst, tuple(dst_cta_tiler) + ) # ((...,TileV,...),(...,RestV,...)) + # print(f"{gS.type=}") + + _convert_kernel( + gS, + gD, + cS, + src_tv_layout, + dst_tv_layout, + src_shape, + src_ty, + dst_ty, + ).launch( + grid=[core.size(gS, mode=[1]), 1, 1], + block=[core.size(src_tv_layout, mode=[0]), 1, 1], + ) + + +# Converts from src tensor to dst tensor, their logical shape are required to be the same. +# And when src or dst dtype is narrow precision(Float4E2M1FN/Float8E8M0FNU/Float8E4M3FN), the shape of +# their leading dimension should be 4(fp8)/8(fp4) element align. (nvgpu.cvt_fptrunc/cvt_fpext +# needs 32-bits aligned input/output) +def convert(src: core.Tensor, dst: core.Tensor): + assert len(src.shape) == len( + dst.shape + ), "Shape of src and dst tensors should be the same rank." 
+ # find leading mode + leading_mode = np.argmin([np.min(s) for s in src.stride]) + + elem_per_copy = 2 + + if src.element_type.width == 4 or dst.element_type.width == 4: + elem_per_copy = 8 + elif src.element_type.width == 8 or dst.element_type.width == 8: + elem_per_copy = 4 + assert ( + src.shape[leading_mode] % elem_per_copy == 0 + and dst.shape[leading_mode] % elem_per_copy == 0 + ) + _convert(src, dst, leading_mode, elem_per_copy) + + +######################################### +# Testing utilities +######################################### + + +def sample_pytest(rand_cfg=None): + """ + Decorator to randomly sample pytest parametrized tests. + rand_cfg: Tuple[int, float] - (random_seed, sample_ratio) + Sampling is disabled when: + - A specific test is selected (via -k or direct test path) + - Not running under pytest + """ + import functools + import os + import random + import pytest + import sys + + seed, sample_ratio = rand_cfg + random.seed(seed) + + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + if rand_cfg is not None and "PYTEST_CURRENT_TEST" in os.environ: + # Check if test was explicitly selected like ::test_name[param1-param2-...] + if "-k" in sys.argv or any(".py::" in arg for arg in sys.argv): + # Test was explicitly selected, don't skip + return func(*args, **kwargs) + + if random.uniform(0.0, 1.0) > sample_ratio: + pytest.skip(f"Randomly skipped (sampling ratio: {sample_ratio})") + return func(*args, **kwargs) + + return wrapper + + return decorator diff --git a/python/CuTeDSL/cutlass/cute/typing.py b/python/CuTeDSL/cutlass/cute/typing.py new file mode 100644 index 00000000..48ac76c4 --- /dev/null +++ b/python/CuTeDSL/cutlass/cute/typing.py @@ -0,0 +1,193 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from abc import ABC, abstractmethod +from typing import ForwardRef, Tuple, Union, Any, Type, List + +from cutlass.base_dsl.typing import * + +from cutlass._mlir import ir +import cutlass._mlir.extras.types as T +from cutlass._mlir.dialects.cute import AddressSpace + + +Int = Union[int, Integer] + + +ScaledBasis = ForwardRef("ScaledBasis") + + +IntTuple = Union[Int, Tuple["IntTuple", ...]] +Shape = Union[Int, Tuple["Shape", ...]] +Stride = Union[Int, ScaledBasis, Tuple["Stride", ...]] +Coord = Union[Int, None, Tuple["Coord", ...]] + + +class Layout(ir.Value): + def __init__(self, op_result): + super().__init__(op_result) + + def __str__(self): ... + + def get_hier_coord(self, idx) -> Coord: + """Return the (hierarchical) ND logical coordinate corresponding to the linear index""" + ... + + @property + def shape(self, *, loc=None, ip=None) -> Shape: ... + + @property + def stride(self, *, loc=None, ip=None) -> Stride: ... + + +Tile = Union[Int, None, Layout, Tuple["Tile", ...]] + +# XTuple is super set of above types +XTuple = Union[IntTuple, Shape, Stride, Coord, Tile] + +Tiler = Union[Shape, Layout, Tile] + + +class Pointer: + """ + Abstract base class for CuTe jit function and runtime _Pointer + """ + + def __extract_mlir_values__(self): + # Doesn't matter just return a value + return [self] + + +class Tensor(ABC): + """ + Abstract base class for CuTe jit function and runtime _Tensor + + A CuTe Tensor is iterator with layout + + :Examples: + + Create tensor from torch.tensor with Host Runtime: + + .. 
code-block:: python + + >>> import torch + >>> from cutlass.cute.runtime import from_dlpack + >>> mA = from_dlpack(torch.tensor([1, 3, 5], dtype=torch.int32)) + >>> mA.shape + (3,) + >>> mA.stride + (1,) + >>> mA.layout + (3,):(1,) + + Define JIT function: + + .. code-block:: python + + @cute.jit + def add(a: Tensor, b: Tensor, res: Tensor): ... + + Call JIT function from python: + + .. code-block:: python + + >>> import torch + >>> a = torch.tensor([1, 3, 5], dtype=torch.int32) + >>> b = torch.tensor([2, 4, 6], dtype=torch.int32) + >>> c = torch.zeros([3], dtype=torch.int32) + >>> mA = from_dlpack(a) + >>> mB = from_dlpack(b) + >>> mC = from_dlpack(c) + >>> add(mA, mB, mC) + >>> c + tensor([3, 7, 11], dtype=torch.int32) + """ + + def __str__(self): ... + + @abstractmethod + def __getitem__(self, idx) -> Union["Tensor", ir.Value, IntTuple]: ... + + @abstractmethod + def __setitem__(self, idx, value): ... + + @property + @abstractmethod + def element_type(self) -> Union[Type[Numeric], Type[IntTuple]]: ... + + @element_type.setter + def element_type(self, new_type): ... + + @property + @abstractmethod + def memspace(self) -> AddressSpace: ... + + @property + @abstractmethod + def iterator(self): ... + + @property + def layout(self) -> Union[Layout, "ComposedLayout"]: ... + + @property + def shape(self) -> Shape: ... + + def load(self, *, loc=None, ip=None) -> "TensorSSA": ... + + def store(self, data: "TensorSSA", *, loc=None, ip=None): ... + + def mark_layout_dynamic(self, leading_dim: int|None = None) -> "Tensor": ... + + def mark_compact_shape_dynamic( + self, mode: int, stride_order: tuple[int, ...]|None = None, divisibility: int = 1 + ) -> "Tensor": ... + + @abstractmethod + def fill(self, value: Numeric) -> None: ... 
+ + +__all__ = [ + "Coord", + "Numeric", + "Integer", + "Boolean", + "Int8", + "Int16", + "Int32", + "Int64", + "Uint8", + "Uint16", + "Uint32", + "Uint64", + "Float", + "Float16", + "BFloat16", + "TFloat32", + "Float32", + "Float64", + "Float8E5M2", + "Float8E4M3FN", + "Float8E4M3B11FNUZ", + "Float8E4M3", + "Float8E8M0FNU", + "Float4E2M1FN", + "Float6E2M3FN", + "Float6E3M2FN", + "IntTuple", + "Layout", + "Pointer", + "Shape", + "Stride", + "Tensor", + "Tile", + "Tiler", + "XTuple", +] diff --git a/python/CuTeDSL/cutlass/impl_utils.py b/python/CuTeDSL/cutlass/impl_utils.py new file mode 100644 index 00000000..0bb9b520 --- /dev/null +++ b/python/CuTeDSL/cutlass/impl_utils.py @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ + +def check_value_in( + value, possible_values: list, value_description: str, prefix="" +) -> None: + if value not in possible_values: + err_msg = prefix + if err_msg != "": + err_msg += ": " + err_msg += f"invalid {value_description}, got {value}, must be one of {possible_values}" + raise ValueError(err_msg) + + +def check_type_in(ty, possible_types: list, type_description: str, prefix="") -> None: + if not isinstance(ty, type): + ty = type(ty) + if ty not in possible_types: + err_msg = prefix + if err_msg != "": + err_msg += ": " + err_msg += f"invalid type for {type_description}, got {ty}, must be one of {possible_types}" + raise TypeError(err_msg) diff --git a/python/CuTeDSL/cutlass/torch.py b/python/CuTeDSL/cutlass/torch.py new file mode 100644 index 00000000..0126fb04 --- /dev/null +++ b/python/CuTeDSL/cutlass/torch.py @@ -0,0 +1,169 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Type, Union + +from cutlass.cute.typing import ( + Numeric, + Boolean, + Float, + Integer, + TFloat32, + Float8E4M3B11FNUZ, + Float8E4M3FN, + Float8E5M2, + Float8E8M0FNU, + Float4E2M1FN, + Tensor, +) +from cutlass.cute.runtime import from_dlpack +import cutlass.cute as cute +import torch + + +def dtype(ty: Type[Numeric]): + """ + Return the corresponding torch.dtype per the given DSL type + """ + torch_dtype = getattr(torch, ty.__name__.lower(), None) + + torch_type_map = { + Boolean: torch.bool, + # TFloat32 is just alias of float32 + TFloat32: torch.float32, + Float8E5M2: torch.float8_e5m2, + Float8E4M3FN: torch.float8_e4m3fn, + Float8E4M3B11FNUZ: torch.float8_e4m3fnuz, + } + if torch_dtype is None: + torch_dtype = torch_type_map.get(ty) + + if torch_dtype is None: + raise TypeError(f"{ty} is not supported by torch") + return torch_dtype + + +@dataclass +class ScalarInitConfig: + """Configuration for scalar initialization""" + + value: float = 0.0 + + +@dataclass +class RandomInitConfig: + """Configuration for random initialization""" + + min_val: int = -2 + max_val: int = 2 + + +@dataclass +class GaussianInitConfig: + """Configuration for Gaussian initialization""" + + mean: float = 0.0 + std: float = 1.0 + scale: float = 1.0 + + +class TensorInitType(Enum): + """Enumeration of tensor initialization types""" + + SKIP = "skip" + SCALAR = "scalar" + RANDOM = "random" + GAUSSIAN = "gaussian" + + +def create_and_permute_torch_tensor( + shape, + dtype: "torch.dtype", + permute_order=None, + init_type: TensorInitType = TensorInitType.RANDOM, + init_config: Optional[ + Union[RandomInitConfig, ScalarInitConfig, GaussianInitConfig] + ] = None, +) -> "torch.Tensor": + """ + Create a torch tensor with specified shape and dtype. 
Optionally permute it and initialize it with specified init type and config + """ + init_dtype = torch.int32 if init_type == TensorInitType.RANDOM else torch.float32 + init_torch_tensor = torch.empty(*shape, dtype=init_dtype) + if init_type == TensorInitType.SKIP: + assert init_config is None + f32_torch_tensor = init_torch_tensor + elif init_type == TensorInitType.SCALAR: + if init_config is None: + init_config = ScalarInitConfig() + else: + if not isinstance(init_config, ScalarInitConfig): + raise ValueError("init_config must be ScalarInitConfig()") + f32_torch_tensor = init_torch_tensor.fill_(init_config.value) + elif init_type == TensorInitType.RANDOM: + if init_config is None: + init_config = RandomInitConfig() + else: + if not isinstance(init_config, RandomInitConfig): + raise ValueError("init_config must be RandomInitConfig()") + f32_torch_tensor = init_torch_tensor.random_( + init_config.min_val, init_config.max_val + ).to(dtype=torch.float32) + elif init_type == TensorInitType.GAUSSIAN: + if init_config is None: + init_config = GaussianInitConfig() + else: + if not isinstance(init_config, GaussianInitConfig): + raise ValueError("init_config must be GaussianInitConfig()") + f32_torch_tensor = init_torch_tensor.normal_(init_config.mean, init_config.std) + f32_torch_tensor = f32_torch_tensor * (1 << init_config.scale) + else: + raise ValueError(f"Invalid init type: {init_type}") + + if permute_order is not None: + f32_torch_tensor = f32_torch_tensor.permute(permute_order) + + dtype_torch_tensor = f32_torch_tensor.to(dtype=dtype) + + return dtype_torch_tensor + + +def convert_cute_tensor( + f32_torch_tensor: "torch.Tensor", + cute_tensor: Tensor, + dtype: Type[Numeric], + is_dynamic_layout: bool = True, +) -> Tensor: + """ + Change the value of the cute tensor to make its value converted from a fp32 torch tensor. + Used for fp8 types tensor creatation now. 
+ """ + # if torch_tensor is on cpu, create a gpu copy + if f32_torch_tensor.device.type == "cpu": + f32_torch_tensor = f32_torch_tensor.cuda() + + # Fp8 type need explicit type conversion + if dtype in { + Float8E5M2, + Float8E4M3FN, + Float8E8M0FNU, + Float4E2M1FN, + }: + fp32_cute_tensor = from_dlpack(f32_torch_tensor) + if is_dynamic_layout: + fp32_cute_tensor = fp32_cute_tensor.mark_layout_dynamic( + f32_torch_tensor.dim_order()[-1] + ) + # Copy and convert from f32 cute tensor to dtype cute tensor + cute.testing.convert(fp32_cute_tensor, cute_tensor) + return cute_tensor diff --git a/python/CuTeDSL/cutlass/utils/README.md b/python/CuTeDSL/cutlass/utils/README.md new file mode 100644 index 00000000..3a583ed4 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/README.md @@ -0,0 +1,9 @@ +# Utilities + +This folder contains various utilties for kernel authoring. Specifically, the implementation of the +followings can be considered experimental and subject to breaking changes: + +- static persistent tile scheduler defined in [`static_persistent_tile_scheduler.py`](./static_persistent_tile_scheduler.py) +- pipeline abstractions defined in [`pipeline.py`](./pipeline.py) +- grouped GEMM utilties defined [`grouped_gemm_tile_scheduler_helper.py`](./grouped_gemm_tile_scheduler_helper.py) + and [`tensormap_manager.py`](./tensormap_manager.py) diff --git a/python/CuTeDSL/cutlass/utils/__init__.py b/python/CuTeDSL/cutlass/utils/__init__.py new file mode 100644 index 00000000..dc3fdbcd --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/__init__.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from .static_persistent_tile_scheduler import (
    WorkTileInfo,
    PersistentTileSchedulerParams,
    StaticPersistentTileScheduler,
)

from .pipeline import (
    Agent,
    CooperativeGroup,
    PipelineUserType,
    PipelineState,
    make_pipeline_state,
    PipelineAsync,
    PipelineTmaAsync,
    PipelineTmaUmma,
    PipelineUmmaAsync,
    PipelineTmaStore,
    pipeline_init_wait,
)

from .hardware_info import (
    HardwareInfo,
)

from .blackwell_helpers import (
    compute_epilogue_tile_shape,
    get_smem_store_op,
    get_tmem_load_op,
    get_num_tmem_alloc_cols,
    make_smem_layout_a,
    make_smem_layout_b,
    make_smem_layout_epi,
    make_trivial_tiled_mma,
)

from .hopper_helpers import (
    sm90_get_smem_store_op,
)

from .grouped_gemm_tile_scheduler_helper import (
    GroupSearchResult,
    GroupedGemmGroupSearchState,
    GroupedGemmTileSchedulerHelper,
    create_initial_search_state,
)

from .tensormap_manager import (
    TensorMapUpdateMode,
    TensorMapManager,
)

from .smem_allocator import SmemAllocator

from .layout import LayoutEnum

# CONSISTENCY FIX: __all__ previously listed only a subset of the names
# imported above, so ``from cutlass.utils import *`` silently dropped the
# pipeline abstractions, hardware info, Blackwell/Hopper helpers,
# SmemAllocator and LayoutEnum. List every re-exported public name.
__all__ = [
    # static persistent tile scheduler
    "WorkTileInfo",
    "PersistentTileSchedulerParams",
    "StaticPersistentTileScheduler",
    # pipeline abstractions
    "Agent",
    "CooperativeGroup",
    "PipelineUserType",
    "PipelineState",
    "make_pipeline_state",
    "PipelineAsync",
    "PipelineTmaAsync",
    "PipelineTmaUmma",
    "PipelineUmmaAsync",
    "PipelineTmaStore",
    "pipeline_init_wait",
    # hardware info
    "HardwareInfo",
    # Blackwell helpers
    "compute_epilogue_tile_shape",
    "get_smem_store_op",
    "get_tmem_load_op",
    "get_num_tmem_alloc_cols",
    "make_smem_layout_a",
    "make_smem_layout_b",
    "make_smem_layout_epi",
    "make_trivial_tiled_mma",
    # Hopper helpers
    "sm90_get_smem_store_op",
    # grouped GEMM tile scheduler
    "GroupSearchResult",
    "GroupedGemmGroupSearchState",
    "create_initial_search_state",
    "GroupedGemmTileSchedulerHelper",
    # tensormap management
    "TensorMapUpdateMode",
    "TensorMapManager",
    # misc
    "SmemAllocator",
    "LayoutEnum",
]
from enum import Enum


class SmemCapacity(Enum):
    """Usable shared-memory capacity per SM, in bytes, by compute capability.

    Each value holds back 1 KiB from the architectural maximum — presumably
    reserved by the driver/runtime; confirm per architecture. SM86 and SM89
    share a value, so ``SM89_SMEM_CAPACITY_BYTES`` is an Enum *alias* of
    ``SM86_SMEM_CAPACITY_BYTES`` (attribute access still works for both).
    """

    SM80_SMEM_CAPACITY_BYTES = (164 - 1) * 1024
    SM86_SMEM_CAPACITY_BYTES = (100 - 1) * 1024
    SM89_SMEM_CAPACITY_BYTES = (100 - 1) * 1024


# Map a compute-capability tag ("sm80", ...) to its SMEM capacity in bytes.
# Built from the member names via __members__ (rather than iterating the
# enum) so that value-aliases such as SM89 are still included as keys.
SMEM_CAPACITY = {
    name.split("_")[0].lower(): member.value
    for name, member in SmemCapacity.__members__.items()
}
from enum import Enum
from math import log2, ceil
from typing import List, Type, Union, Tuple

from cutlass.cutlass_dsl import (
    Float16,
    BFloat16,
    TFloat32,
    Float32,
    Uint8,
    Int8,
    Float8E4M3FN,
    Float8E5M2,
    Numeric,
    NumericMeta,
    dsl_user_op,
)
import cutlass.cute as cute
from cutlass.cute.nvgpu.common import CopyUniversalOp
from cutlass.cute.nvgpu.warp import StMatrix8x8x16bOp, StMatrix16x8x8bOp
from cutlass.cute.nvgpu.tcgen05 import (
    MmaF16BF16Op,
    MmaTF32Op,
    MmaI8Op,
    MmaFP8Op,
    OperandSource,
    OperandMajorMode,
    CtaGroup,
    Ld16x64bOp,
    Ld16x128bOp,
    Ld16x256bOp,
    Ld16x32bx2Op,
    Ld32x32bOp,
    Repetition,
    Pack,
    find_tmem_tensor_col_offset,
    SmemLayoutAtomKind,
    make_smem_layout_atom,
    tile_to_mma_shape,
    is_tmem_load,
    get_tmem_copy_properties,
)
from cutlass.utils.layout import LayoutEnum


@dsl_user_op
def compute_epilogue_tile_shape(
    cta_tile_shape: cute.Shape,
    use_2cta_instrs: bool,
    layout_d: LayoutEnum,
    elem_ty_d: Type[Numeric],
    *,
    layout_c: LayoutEnum = None,
    elem_ty_c: Union[Type[Numeric], None] = None,
    loc=None,
    ip=None,
) -> cute.Tile:
    """Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one.

    :param cta_tile_shape: A tuple or list representing the dimensions of the CTA tile, where
                           cta_tile_shape[0] corresponds to the height (M) and cta_tile_shape[1]
                           corresponds to the width (N) of the tile.
    :type cta_tile_shape: cute.Shape
    :param use_2cta_instrs: A flag indicating whether the configuration is for a 2SM setup.
    :type use_2cta_instrs: bool
    :param layout_d: The layout enum of the output tensor D.
    :type layout_d: LayoutEnum
    :param elem_ty_d: The element type of output tensor D.
    :type elem_ty_d: Type[Numeric]
    :param layout_c: The layout enum of the input tensor C. Defaults to None.
    :type layout_c: LayoutEnum, optional
    :param elem_ty_c: The element type for input tensor C. Defaults to None.
    :type elem_ty_c: Union[Type[Numeric], None], optional

    :return: Returns epilog tiler, which is used in subsequent epilog partitions.
    :rtype: cute.Tile

    :raises ValueError: If the computed tile cute.size does not meet minimum requirements based on CTA dimensions.
    """

    def validate_type(ty, ty_name):
        if not isinstance(ty, NumericMeta):
            raise TypeError(f"{ty_name} must be Numeric, but got {ty}")

    validate_type(elem_ty_d, "elem_ty_d")
    if elem_ty_c is not None:
        validate_type(elem_ty_c, "elem_ty_c")

    cta_m, cta_n = cta_tile_shape[:2]
    # Warp grid shape depends on CTA-M and 2-CTA mode: (2, 2) for the
    # 64-row/2SM case, (4, 1) otherwise.
    (warp_m, warp_n) = (2, 2) if (cta_m == 64 and use_2cta_instrs) else (4, 1)
    # BUGFIX: identity check instead of ``elem_ty_c == None`` — equality goes
    # through the Numeric metaclass ``__eq__`` and is not a plain None test.
    disable_source = elem_ty_c is None
    max_bits = (
        elem_ty_d.width if disable_source else max(elem_ty_c.width, elem_ty_d.width)
    )

    dp_full = 32
    tile_m = min(cta_m, dp_full * warp_m)
    # Heuristic "performant" N extent, depending on whether a source (C) is read.
    if disable_source:
        compute_elts = 8192 if max_bits == 4 else 4096
        n_perf = compute_elts // tile_m
    else:
        if max_bits == 32:
            n_perf = 16 if (cta_m > 64 and cta_n <= 128) else 32
        elif max_bits == 16:
            n_perf = 32 if cta_n <= 128 else 64
        else:
            n_perf = 64

    d_is_m_major = layout_d.is_m_major_c()
    c_is_m_major = True if layout_c is None else layout_c.is_m_major_c()

    # Minimum N extents imposed by D and C element widths / majorness.
    n_min_d = (
        8 * warp_n
        if d_is_m_major
        else (128 * warp_n if elem_ty_d.width == 6 else 128 // elem_ty_d.width * warp_n)
    )
    n_min_c = (
        8 * warp_n
        if (c_is_m_major or disable_source)
        else (128 * warp_n if elem_ty_c.width == 6 else 128 // elem_ty_c.width * warp_n)
    )
    tile_n = min(cta_n, max(n_perf, n_min_c, n_min_d))

    if cta_n < n_min_c or cta_n < n_min_d:
        raise ValueError(f"CTA tile too small: {cta_tile_shape=}")

    # stride by tmem warp layout and return a by-mode tiler
    tile_m_layout = cute.make_layout(tile_m, loc=loc, ip=ip)
    tile_n_layout = cute.make_layout(
        (tile_n // warp_n, warp_n), stride=(1, cta_n // warp_n), loc=loc, ip=ip
    )
    return (tile_m_layout, cute.coalesce(tile_n_layout, loc=loc, ip=ip))
+ + +@dsl_user_op +def get_smem_store_op( + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + tiled_tmem_load: cute.TiledCopy, + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """Selects the largest vectorized smem store atom available subject to + constraint of gmem layout and chosen TMEM_LOAD's thread-value ownership. + + :param layout_d: The layout enum of the output tensor D. + :type layout_d: LayoutEnum + :param elem_ty_d: The element type for output tensor D. + :type elem_ty_d: Type[Numeric] + :param elem_ty_acc: The element type for accumulator. + :type elem_ty_acc: Type[Numeric] + :param tiled_tmem_load: An instance of TiledCopy that represents the tmem load operation. + :type tiled_tmem_load: cute.TiledCopy + + :return: Either SmemStoreMatrix or SimtSyncCopy, based on the input parameters. + :rtype: cute.CopyAtom + """ + + def validate_type(ty, ty_name): + if not isinstance(ty, NumericMeta): + raise TypeError(f"{ty_name} must be a Numeric, but got {ty}") + + validate_type(elem_ty_d, "elem_ty_d") + validate_type(elem_ty_acc, "elem_ty_acc") + + is_m_major = layout_d.is_m_major_c() + is_n_major = layout_d.is_n_major_c() + + if not is_tmem_load(tiled_tmem_load): + return cute.make_copy_atom(CopyUniversalOp(), elem_ty_d, loc=loc, ip=ip) + + num_dp, num_bits, num_rep, pack = get_tmem_copy_properties(tiled_tmem_load) + + use_stmatrix_m8n8_4x = ( + all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 32, + is_n_major, + num_dp == 16, + num_bits == 128, + num_rep in (2, 4, 8, 16, 32, 64), + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 256, + num_rep in (2, 4, 8, 16, 32), + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 16, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 128, + num_rep in (2, 4, 8, 16, 32, 64), + pack == Pack.PACK_16b_IN_32b, + ] + ) + ) + use_stmatrix_m16n8_4x = all( + [ + elem_ty_acc.width == 32, + 
elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep in (4, 8, 16, 32), + pack == Pack.NONE, + ] + ) + use_stmatrix_m8n8_2x = ( + all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 32, + is_n_major, + num_dp == 16, + num_bits == 128, + num_rep == 1, + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 256, + num_rep == 1, + pack == Pack.NONE, + ] + ) + or all( + [ + elem_ty_acc.width == 16, + elem_ty_d.width == 16, + num_dp == 16, + num_bits == 128, + num_rep == 1, + pack == Pack.PACK_16b_IN_32b, + ] + ) + ) + use_stmatrix_m16n8_2x = all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep == 2, + pack == Pack.NONE, + ] + ) + use_stmatrix_m16n8_1x = all( + [ + elem_ty_acc.width == 32, + elem_ty_d.width == 8, + is_m_major, + num_dp == 16, + num_bits == 256, + num_rep == 1, + pack == Pack.NONE, + ] + ) + + if use_stmatrix_m8n8_4x: + op = StMatrix8x8x16bOp(is_m_major, 4) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m8n8_2x: + op = StMatrix8x8x16bOp(is_m_major, 2) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_4x: + op = StMatrix16x8x8bOp(4) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_2x: + op = StMatrix16x8x8bOp(2) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + elif use_stmatrix_m16n8_1x: + op = StMatrix16x8x8bOp(1) + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + else: + op = CopyUniversalOp() + return cute.make_copy_atom(op, elem_ty_d, loc=loc, ip=ip) + + +@dsl_user_op +def get_tmem_load_op( + cta_tile_shape: cute.Shape, + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + epi_tile: cute.Tile, + use_2cta_instrs: bool, + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """Finds a performant TMEM_LOAD copy op for the selected 
epilogue + tile (epi_tile), element types, and tcgen05.mma instruction used. + + :param cta_tile_shape: A tuple or list representing the dimensions of the CTA tile. + :type cta_tile_shape: cute.Shape + :param layout_d: The layout enum of the output tensor D. + :type layout_d: LayoutEnum + :param elem_ty_d: The element type for output tensor D. + :type elem_ty_d: Type[Numeric] + :param elem_ty_acc: The element type for accumulation. + :type elem_ty_acc: Type[Numeric] + :param epi_tile: The epilogue tile configuration. + :type epi_tile: cute.Tile + :param use_2cta_instrs: A flag indicating whether the configuration is for 2 SMs. + :type use_2cta_instrs: bool + + :return: An instance of Sm100TmemLoad with the computed configuration. + :rtype: cute.CopyAtom + + :raises ValueError: If the function cannot handle the given combination of accumulation + and dimension types, or if it cannot determine the appropriate configuration based on + the input parameters. + """ + is_m_major = layout_d.is_m_major_c() + + acc_bits = elem_ty_acc.width + d_bits = elem_ty_d.width + + tmem_warp_shape_mn = ( + (2, 2) if (cta_tile_shape[0] == 64 and use_2cta_instrs) else (4, 1) + ) + epilog_tile_shape_mn = cute.product_each( + cute.shape(epi_tile, loc=loc, ip=ip), loc=loc, ip=ip + ) + epilog_warp_tile_shape_mn = cute.shape_div( + epilog_tile_shape_mn, tmem_warp_shape_mn, loc=loc, ip=ip + ) + + num_dp = cute.size(epilog_warp_tile_shape_mn[0], loc=loc, ip=ip) + if num_dp not in {16, 32}: + raise ValueError("Cta tile and 2sm config does not generate correct num dp.") + + num_col_bits = cute.size(epilog_warp_tile_shape_mn[1], loc=loc, ip=ip) * acc_bits + + tmem_dp = 0 + tmem_bit = 0 + tmem_rep = 0 + tmem_pack16b = False + if acc_bits == 32 and d_bits == 32: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 16: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + 
tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 256 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 8: + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 16 + tmem_bit = 32 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 256 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 16 and d_bits == 16: + tmem_pack16b = True + if num_dp == 16: + if is_m_major: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 16 + tmem_bit = 128 + else: + if is_m_major: + tmem_dp = 16 + tmem_bit = 128 + else: + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 6: + if not num_dp == 32: + raise ValueError("Num dp must be 32.") + tmem_dp = 32 + tmem_bit = 32 + elif acc_bits == 32 and d_bits == 4: + if not num_dp == 32: + raise ValueError("Num dp must be 32.") + tmem_dp = 32 + tmem_bit = 32 + else: + raise ValueError( + f"Can not handle acc/d type combination: {elem_ty_acc=}, {elem_ty_d=}" + ) + + num_bit_div = tmem_bit + if tmem_dp == 16 and tmem_bit == 32: + num_bit_div = 64 + + if (num_col_bits % (num_bit_div * 128) == 0) and ( + (tmem_dp == 16 and tmem_bit == 64) + or (tmem_dp == 16 and tmem_bit == 32) + or (tmem_dp == 32 and tmem_bit == 32) + ): + tmem_rep = 128 + elif (num_col_bits % (num_bit_div * 64) == 0) and ( + (tmem_dp == 16 and tmem_bit == 128) + or (tmem_dp == 16 and tmem_bit == 64) + or (tmem_dp == 16 and tmem_bit == 32) + or (tmem_dp == 32 and tmem_bit == 32) + ): + tmem_rep = 64 + elif num_col_bits % (num_bit_div * 32) == 0: + tmem_rep = 32 + elif num_col_bits % (num_bit_div * 16) == 0: + tmem_rep = 16 + elif num_col_bits % (num_bit_div * 8) == 0: + tmem_rep = 8 + elif num_col_bits % (num_bit_div * 4) == 0: + tmem_rep = 4 + elif num_col_bits % (num_bit_div * 2) == 0: + tmem_rep = 2 + elif num_col_bits % (num_bit_div * 1) == 0: + tmem_rep = 1 + else: + raise ValueError("Can not pick tmem_rep based on cta tile shape and tmem 
atom.") + + if tmem_dp == 16 and tmem_bit == 64: + op = Ld16x64bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 128: + op = Ld16x128bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 256: + op = Ld16x256bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + elif tmem_dp == 16 and tmem_bit == 32: + op = Ld16x32bx2Op( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + + elif tmem_dp == 32 and tmem_bit == 32: + op = Ld32x32bOp( + Repetition(tmem_rep), Pack.PACK_16b_IN_32b if tmem_pack16b else Pack.NONE + ) + return cute.make_copy_atom(op, elem_ty_acc, loc=loc, ip=ip) + else: + raise ValueError() + + +def get_num_tmem_alloc_cols( + tmem_tensors: Union[cute.Tensor, List[cute.Tensor]], rounding=True +) -> int: + """Get the total number of TMEM allocation columns for the given TMEM tensors. + + :param tmem_tensors: The TMEM tensors to get the number of allocation columns for. + :type tmem_tensors: Union[cute.Tensor, List[cute.Tensor]] + :param rounding: Whether to round up the number of allocation columns to the nearest power of 2. + :type rounding: bool + + :return: The total number of TMEM allocation columns. + :rtype: int + + :raises ValueError: If the number of TMEM allocation columns exceeds the maximum capacity of 512 or is less than 32. 
+ """ + # Turn tmem_tensors into a list + if isinstance(tmem_tensors, cute.Tensor): + tmem_tensors = [tmem_tensors] + + # For each tensor in tmem_tensors, find the tmem_tensor_col_offset + num_tmem_alloc_cols_per_tensor = [ + find_tmem_tensor_col_offset(t) for t in tmem_tensors + ] + + # Sum up the num_tmem_alloc_cols_per_tensor + num_tmem_alloc_cols = sum(num_tmem_alloc_cols_per_tensor) + + # Round up num_tmem_cols_total to the nearest power of 2 + if rounding: + num_tmem_alloc_cols = 1 << ceil(log2(num_tmem_alloc_cols)) + + # Validate the number of TMEM allocation columns + SM100_TMEM_CAPACITY_COLUMNS = 512 + SM100_TMEM_MIN_ALLOC_COLUMNS = 32 + if ( + num_tmem_alloc_cols > SM100_TMEM_CAPACITY_COLUMNS + or num_tmem_alloc_cols < SM100_TMEM_MIN_ALLOC_COLUMNS + ): + raise ValueError( + f"TMEM allocation columns {num_tmem_alloc_cols} exceeds the maximum capacity of {SM100_TMEM_CAPACITY_COLUMNS} or less than {SM100_TMEM_MIN_ALLOC_COLUMNS}" + ) + return num_tmem_alloc_cols + + +def get_smem_layout_atom_ab( + major_mode: OperandMajorMode, + element_type: Type[Numeric], + smem_shape_mn_k: Tuple[int, int], + *, + loc=None, + ip=None, +) -> SmemLayoutAtomKind: + """Simple heuristics to select the optimal SMEM layout atom based on the + majorness, the data type, and the major mode size. + + :param major_mode: The major mode for the SMEM tensor is K major. + :type major_mode: OperandMajorMode + :param element_type: The element type for the SMEM tensor. + :type element_type: Type[Numeric] + :param smem_shape_mn_k: The shape of the SMEM tensor. 
+ :type smem_shape_mn_k: Tuple[int, int] + + :return: The SMEM layout atom kind + :rtype: SmemLayoutAtomKind + """ + is_k_major = major_mode == OperandMajorMode.K + major_mode_size = smem_shape_mn_k[1] if is_k_major else smem_shape_mn_k[0] + + assert major_mode_size % 8 == 0 + sw128_num_contiguous_bits = 1024 + sw64_num_contiguous_bits = 512 + sw32_num_contiguous_bits = 256 + inter_num_contiguous_bits = 128 + major_mode_size_bits = major_mode_size * element_type.width + assert major_mode_size_bits % inter_num_contiguous_bits == 0 + + if not is_k_major: + if (element_type.width == 32) and ( + major_mode_size_bits % sw128_num_contiguous_bits == 0 + ): + return SmemLayoutAtomKind.MN_SW128_32B + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return SmemLayoutAtomKind.MN_SW32 + return SmemLayoutAtomKind.MN_INTER + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return SmemLayoutAtomKind.K_SW32 + return SmemLayoutAtomKind.K_INTER + + +@dsl_user_op +def make_smem_layout_a( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: cute.Tile, + a_dtype: Type[Numeric], + num_stages: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps with: + 1. Get the partitioned shape of the A tensor based on the tiled_mma & MMA tiler. + 2. Select the heuristic SMEM layout atom based on the A tensor's majorness, the data type, and the major mode size. + 3. cute.Tile the SMEM layout atom to the MMA tile shape. + 4. Stage the SMEM layout based on the number of stages. 
+ + :param tiled_mma: The tiled MMA used to partition tensor A + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The MMA tile shape + :type mma_tiler_mnk: cute.cute.Tile + :param a_dtype: The element type for tensor A + :type a_dtype: Type[Numeric] + :param num_stages: The number of pipeline stages for tensor A + :type num_stages: int + + :return: SMEM layout for tensor A + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + is_k_major = tiled_mma.op.a_major_mode == OperandMajorMode.K + a_smem_shape = tiled_mma.partition_shape_A( + cute.dice(mma_tiler_mnk, (1, None, 1), loc=loc, ip=ip) + ) + a_smem_shape_mn_k = ( + cute.size(a_smem_shape[0][0], loc=loc, ip=ip) * a_smem_shape[1], + cute.size(a_smem_shape[0][1], loc=loc, ip=ip) * a_smem_shape[2], + ) + a_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_ab( + tiled_mma.op.a_major_mode, + a_dtype, + a_smem_shape_mn_k, + loc=loc, + ip=ip, + ), + a_dtype, + loc=loc, + ip=ip, + ) + a_smem_layout_staged = tile_to_mma_shape( + a_smem_layout_atom, + cute.append(a_smem_shape, num_stages, loc=loc, ip=ip), + order=((1, 0, 2) if not is_k_major else (0, 1, 2)), + loc=loc, + ip=ip, + ) + return a_smem_layout_staged + + +@dsl_user_op +def make_smem_layout_b( + tiled_mma: cute.TiledMma, + mma_tiler_mnk: cute.Tile, + b_dtype: Type[Numeric], + num_stages: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps: + 1. Get the partitioned shape of the B tensor based on the tiled_mma & MMA tiler. + 2. Select the heuristic SMEM layout atom based on the B tensor's majorness, the data type, and the major mode size. + 3. cute.Tile the SMEM layout atom to the MMA tile shape. + 4. Stage the SMEM layout based on the number of stages. + + :param tiled_mma: The tiled MMA which is used to partition the B tensor. + :type tiled_mma: cute.TiledMma + :param mma_tiler_mnk: The MMA tile shape. 
+ :type mma_tiler_mnk: cute.cute.Tile + :param b_dtype: The element type for the B tensor. + :type b_dtype: Type[Numeric] + :param num_stages: The stage of the B tensor. + :type num_stages: int + + :return: SMEM layout for the B tensor. + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + is_k_major = tiled_mma.op.b_major_mode == OperandMajorMode.K + b_smem_shape = tiled_mma.partition_shape_B( + cute.dice(mma_tiler_mnk, (None, 1, 1), loc=loc, ip=ip) + ) + b_smem_shape_nk = ( + cute.size(b_smem_shape[0][0], loc=loc, ip=ip) * b_smem_shape[1], + cute.size(b_smem_shape[0][1], loc=loc, ip=ip) * b_smem_shape[2], + ) + b_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_ab( + tiled_mma.op.b_major_mode, + b_dtype, + b_smem_shape_nk, + loc=loc, + ip=ip, + ), + b_dtype, + loc=loc, + ip=ip, + ) + b_smem_layout_staged = tile_to_mma_shape( + b_smem_layout_atom, + cute.append(b_smem_shape, num_stages, loc=loc, ip=ip), + order=((1, 0, 2) if not is_k_major else (0, 1, 2)), + loc=loc, + ip=ip, + ) + + return b_smem_layout_staged + +@dsl_user_op +def get_smem_layout_atom_epi( + layout: LayoutEnum, + element_type: Type[Numeric], + epi_tile: cute.Tile, + *, + loc=None, + ip=None, +) -> SmemLayoutAtomKind: + """Simple heuristics to select the optimal SMEM layout atom for epilog tensors. + + :param layout: The layout enum for the SMEM tensor. + :type layout: LayoutEnum + :param element_type: The element type for the SMEM tensor. + :type element_type: Type[Numeric] + :param epi_tile: The epilogue tile shape. 
+ :type epi_tile: cute.Tile + + :return: The SMEM layout atom kind + :rtype: SmemLayoutAtomKind + """ + # Get the max contiguous tile usable by TMA + tma_shape = tuple( + ( + # assumes get<0>(epi_tile) is coalesced and unit stride + cute.coalesce(cute.right_inverse(x, loc=loc, ip=ip), loc=loc, ip=ip).shape + if isinstance(x, cute.Layout) + else x + ) + for x in epi_tile + ) + + if layout.is_m_major_c(): + # ColMajor C/D (M-major) + return get_smem_layout_atom_ab( + OperandMajorMode.MN, element_type, tma_shape, loc=loc, ip=ip + ) + else: + # RowMajor C/D (N-major) + return get_smem_layout_atom_ab( + OperandMajorMode.K, element_type, tma_shape, loc=loc, ip=ip + ) + + +@dsl_user_op +def make_smem_layout_epi( + epi_dtype: Type[Numeric], + epi_layout: LayoutEnum, + epi_tile: cute.Tile, + epi_stage: int, + *, + loc=None, + ip=None, +) -> Union[cute.Layout, cute.ComposedLayout]: + """This function helps: + 1. Select the heuristic SMEM layout atom based on the epilog tile shape, + the epilog tensor's majorness, and the element type. + 2. cute.Tile the SMEM layout atom to the epilog tile shape. + 3. Stage the SMEM layout based on the number of stages. + + :param epi_dtype: The element type for the epilog tensor. + :type epi_dtype: Type[Numeric] + :param epi_layout: The layout enum for the epilog tensor. + :type epi_layout: LayoutEnum + :param epi_tile: The epilogue tile shape. + :type epi_tile: cute.cute.Tile + :param epi_stage: The stage of the epilog tensor. 
+ :type epi_stage: int + + :return: SMEM layout for epilog tensors (usually C & D which are processed in the epilog) + :rtype: Union[cute.Layout, cute.ComposedLayout] + """ + + epilog_shape = cute.product_each( + cute.shape(epi_tile, loc=loc, ip=ip), loc=loc, ip=ip + ) + + c_smem_layout_atom = make_smem_layout_atom( + get_smem_layout_atom_epi( + epi_layout, + epi_dtype, + epi_tile, + loc=loc, + ip=ip, + ), + epi_dtype, + loc=loc, + ip=ip, + ) + epi_smem_layout_staged = cute.tile_to_shape( + c_smem_layout_atom, + cute.append(epilog_shape, epi_stage, loc=loc, ip=ip), + order=((1, 0, 2) if not epi_layout.is_n_major_c() else (0, 1, 2)), + loc=loc, + ip=ip, + ) + + return epi_smem_layout_staged + + +class SmemCapacity(Enum): + SM100_SMEM_CAPACITY_BYTES = (228 - 1) * 1024 + SM120_SMEM_CAPACITY_BYTES = (100 - 1) * 1024 + + +# Dictionary to map compute capability to SMEM capacity +SMEM_CAPACITY = { + "sm100": SmemCapacity.SM100_SMEM_CAPACITY_BYTES.value, + "sm120": SmemCapacity.SM120_SMEM_CAPACITY_BYTES.value, +} + +@dsl_user_op +def make_trivial_tiled_mma( + ab_dtype: Type[Numeric], + a_leading_mode: OperandMajorMode, + b_leading_mode: OperandMajorMode, + acc_dtype: Type[Numeric], + cta_group: CtaGroup, + mma_tiler_mn: Tuple[int, int], + a_source: OperandSource = OperandSource.SMEM, + *, + loc=None, + ip=None, +) -> cute.TiledMma: + """Make a tiled MMA atom with given data type, leading dimension, cta group and mma tile shape. + By default, the MMA atom is created with SMEM operand source for A. + + :param ab_dtype: Data type of operands A and B. + :type ab_dtype: type[Numeric] + :param a_leading_mode: Leading dimension of operand A (1 for K, 0 for M/N). + :type a_leading_mode: tcgen05.OperandMajorMode + :param b_leading_mode: Leading dimension of operand B (1 for K, 0 for M/N). + :type b_leading_mode: tcgen05.OperandMajorMode + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[Numeric] + :param cta_group: The CTA group to use. 
+ :type cta_group: tcgen05.CtaGroup + :param mma_tiler_mn: The shape (M, N, K) of the MMA tiler. + :type mma_tiler_mn: Tuple[int, int] + :param a_source: The source of operand A (SMEM by default or TMEM). + :type a_source: OperandSource + + :return: A tiled MMA atom. + :rtype: cute.TiledMma + + :raises TypeError: If the data type is not supported. + """ + + if ab_dtype in {Float16, BFloat16}: + mma_op = MmaF16BF16Op( + ab_dtype, + acc_dtype, + (*mma_tiler_mn, 16), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in {TFloat32, Float32}: + mma_op = MmaTF32Op( + (*mma_tiler_mn, 8), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in { + Uint8, + Int8, + }: + mma_op = MmaI8Op( + ab_dtype, + (*mma_tiler_mn, 32), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + elif ab_dtype in {Float8E4M3FN, Float8E5M2}: + mma_op = MmaFP8Op( + ab_dtype, + acc_dtype, + (*mma_tiler_mn, 32), + cta_group, + a_source, + a_leading_mode, + b_leading_mode, + ) + else: + raise TypeError(f"unsupported ab_dtype, got {ab_dtype}") + + return cute.make_tiled_mma(cute.make_mma_atom(mma_op)) diff --git a/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py b/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py new file mode 100644 index 00000000..a51bae62 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/grouped_gemm_tile_scheduler_helper.py @@ -0,0 +1,466 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. 
+ +from typing import List, Tuple + +import cutlass.cute as cute +from cutlass.cutlass_dsl import Int32, extract_mlir_values, new_from_mlir_values +from cutlass._mlir import ir + +from cutlass.utils.static_persistent_tile_scheduler import PersistentTileSchedulerParams + + +class GroupSearchResult: + """ + The result of the group search for grouped gemm. + + :param group_idx: The result group index + :type group_idx: Int32 + :param cta_tile_idx_m: CTA tile index along M dimension after rasterization + :type cta_tile_idx_m: Int32 + :param cta_tile_idx_n: CTA tile index along N dimension after rasterization + :type cta_tile_idx_n: Int32 + :param problem_shape_m: The M dimension of the gemm problem + :type problem_shape_m: Int32 + :param problem_shape_n: The N dimension of the gemm problem + :type problem_shape_n: Int32 + :param problem_shape_k: The K dimension of the gemm problem + :type problem_shape_k: Int32 + :param cta_tile_count_k: Number of tiles along K dimension + :type cta_tile_count_k: Int32 + """ + + def __init__( + self, + group_idx: Int32, + cta_tile_idx_m: Int32, + cta_tile_idx_n: Int32, + problem_shape_m: Int32, + problem_shape_n: Int32, + problem_shape_k: Int32, + cta_tile_count_k: Int32, + ) -> None: + self.group_idx = group_idx + self.cta_tile_idx_m = cta_tile_idx_m + self.cta_tile_idx_n = cta_tile_idx_n + self.problem_shape_m = problem_shape_m + self.problem_shape_n = problem_shape_n + self.problem_shape_k = problem_shape_k + self.cta_tile_count_k = cta_tile_count_k + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.group_idx) + values.extend(extract_mlir_values(self.cta_tile_idx_m)) + values.extend(extract_mlir_values(self.cta_tile_idx_n)) + values.extend(extract_mlir_values(self.problem_shape_m)) + values.extend(extract_mlir_values(self.problem_shape_n)) + values.extend(extract_mlir_values(self.problem_shape_k)) + values.extend(extract_mlir_values(self.cta_tile_count_k)) + return values + + def 
__new_from_mlir_values__(self, values: List[ir.Value]) -> "GroupSearchResult": + assert len(values) == 7 + return GroupSearchResult(*tuple(values)) + + +class GroupedGemmGroupSearchState: + """ + The state of group index search for grouped gemm. + + The state will be initialized once and updated in every round of group index search. + + :param start_group_idx: The group idx to start the search with + :type start_group_idx: Int32 + :param tile_count_prev_group: Number of tiles before the matched group + :type tile_count_prev_group: Int32 + :param tile_count_searched: Number of tiles we have searched. When the matched group is found, + it records the number of tiles including the matched group + :type tile_count_searched: Int32 + """ + + def __init__( + self, + start_group_idx: Int32, + tile_count_prev_group: Int32, + tile_count_searched: Int32, + ) -> None: + self.start_group_idx = start_group_idx + self.tile_count_prev_group = tile_count_prev_group + self.tile_count_searched = tile_count_searched + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.start_group_idx) + values.extend(extract_mlir_values(self.tile_count_prev_group)) + values.extend(extract_mlir_values(self.tile_count_searched)) + return values + + def __new_from_mlir_values__( + self, values: List[ir.Value] + ) -> "GroupedGemmGroupSearchState": + start_group_idx = new_from_mlir_values(self.start_group_idx, [values[0]]) + tile_count_prev_group = new_from_mlir_values( + self.tile_count_prev_group, [values[1]] + ) + tile_count_searched = new_from_mlir_values( + self.tile_count_searched, [values[2]] + ) + return GroupedGemmGroupSearchState( + start_group_idx, tile_count_prev_group, tile_count_searched + ) + + +def create_initial_search_state() -> GroupedGemmGroupSearchState: + """ + Create an initial search state for grouped gemm. 
+ + :return: A new search state with initial values + :rtype: GroupedGemmGroupSearchState + """ + return GroupedGemmGroupSearchState( + start_group_idx=Int32(0), + tile_count_prev_group=Int32(0), + tile_count_searched=Int32(0), + ) + + +class GroupedGemmTileSchedulerHelper: + """ + A helper to translate the raw block index (x, y, z) from tile scheduler to real CTA tile index for grouped gemm. + + :param group_count: Number of groups in current grouped gemm problem + :type group_count: int + :param tile_sched_params: Parameter used to create the tile scheduler this helper works with + :type tile_sched_params: PersistentTileSchedulerParams + :param cluster_tile_shape_mnk: The shape of cluster tile as (m, n, k) + :type cluster_tile_shape_mnk: tuple[int, int, int] + :param search_state: The initial search state + :type search_state: GroupedGemmGroupSearchState + """ + + def __init__( + self, + group_count: int, + tile_sched_params: PersistentTileSchedulerParams, + cluster_tile_shape_mnk: tuple[int, int, int], + search_state: GroupedGemmGroupSearchState, + ) -> None: + self.tile_sched_params = tile_sched_params + self.group_count = group_count + self.lane_idx = cute.arch.lane_idx() + self.cluster_tile_shape_mnk = cluster_tile_shape_mnk + self.search_state = search_state + + def __extract_mlir_values__(self) -> List[ir.Value]: + values = extract_mlir_values(self.tile_sched_params) + values.extend(extract_mlir_values(self.search_state)) + return values + + def __new_from_mlir_values__( + self, values: List[ir.Value] + ) -> "GroupedGemmTileSchedulerHelper": + tile_sched_params = new_from_mlir_values(self.tile_sched_params, values) + search_state = new_from_mlir_values(self.search_state, values[1:]) + return GroupedGemmTileSchedulerHelper( + self.group_count, + tile_sched_params, + self.cluster_tile_shape_mnk, + search_state, + ) + + def delinearize_z( + self, + cta_tile_coord: tuple, + problem_shape_mnkl: cute.Tensor, + ) -> GroupSearchResult: + """ + Delinearize the 
linear z index and return GroupSearchResult. + + This function should be used by warps that need to know the CTA tile index on M and N dimensions. + + :param cta_tile_coord: The raw CTA coordinate from tile scheduler + :type cta_tile_coord: tuple of Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for each group + :type problem_shape_mnkl: cute.Tensor + :return: The search result containing group index and tile coordinates + :rtype: GroupSearchResult + """ + # delinear the z coord + linear_idx = cta_tile_coord[2] + group_idx, problem_mnkl = self._group_search_and_load_problem_shape( + linear_idx, + problem_shape_mnkl, + self.search_state.start_group_idx, + self.search_state.tile_count_prev_group, + ) + # linear index local to current group + cluster_tile_idx_in_current_group = ( + linear_idx - self.search_state.tile_count_prev_group + ) + cluster_count_m, cluster_count_n, cluster_count_k = cute.ceil_div( + (problem_mnkl[0], problem_mnkl[1], problem_mnkl[2]), + ( + self.cluster_tile_shape_mnk[0], + self.cluster_tile_shape_mnk[1], + self.cluster_tile_shape_mnk[2], + ), + ) + # decompose to get indices on M and N + cta_tile_idx_m, cta_tile_idx_n = self._compute_cta_tile_coord( + cluster_tile_idx_in_current_group, + cta_tile_coord, + cluster_count_m, + cluster_count_n, + ) + return GroupSearchResult( + group_idx, + cta_tile_idx_m, + cta_tile_idx_n, + problem_mnkl[0], + problem_mnkl[1], + problem_mnkl[2], + cluster_count_k, + ) + + def search_cluster_tile_count_k( + self, + cta_tile_coord: tuple, + problem_shape_mnkl: cute.Tensor, + ) -> Tuple[Int32, Int32]: + """ + Search the matched group for given linear index and compute the number of tiles along K dimension for the matched group. + + This function should be used by warps that are only interested in the number of tiles along K dimension. 
+ + :param cta_tile_coord: The raw CTA coordinate from tile scheduler + :type cta_tile_coord: tuple of Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :return: A tuple containing cluster count along K dimension and the group index + :rtype: Tuple[Int32, Int32] + """ + group_idx, problem_mnk = self._group_search_and_load_problem_shape( + cta_tile_coord[2], + problem_shape_mnkl, + self.search_state.start_group_idx, + self.search_state.tile_count_prev_group, + ) + cluster_count_k = ( + problem_mnk[2] + self.cluster_tile_shape_mnk[2] - 1 + ) // self.cluster_tile_shape_mnk[2] + return cluster_count_k, group_idx + + @cute.jit + def _prefix_sum(self, value_per_thread: Int32) -> Int32: + """ + Perform prefix sum within a full warp. + + :param value_per_thread: The value for this thread to contribute to the prefix sum + :type value_per_thread: Int32 + :return: The prefix sum result for this thread + :rtype: Int32 + """ + clamp_value = 0 + idx = 1 + sum_per_thread = value_per_thread + while idx < cute.arch.WARP_SIZE: + value = cute.arch.shuffle_sync_up( + sum_per_thread, idx, mask_and_clamp=clamp_value + ) + if self.lane_idx >= idx: + sum_per_thread += value + idx = idx << 1 + return sum_per_thread + + def _get_problem_for_group( + self, problem_shape_mnkl: cute.Tensor, group_idx: Int32 + ) -> cute.Tensor: + """ + Load gemm problem (m,n,k,l) for the specified group from global memory to register. 
+ + :param problem_shape_mnkl: Tensor in global memory with layout (group_count, 4):(4, 1) + :type problem_shape_mnkl: cute.Tensor + :param group_idx: The index of the group to load + :type group_idx: Int32 + :return: The problem shape tensor for the specified group + :rtype: cute.Tensor + """ + cur_problem_mnkl = cute.make_fragment( + cute.make_layout(4), problem_shape_mnkl.element_type + ) + cute.autovec_copy(problem_shape_mnkl[(group_idx, None)], cur_problem_mnkl) + return cur_problem_mnkl + + def _get_cluster_tile_count_mn(self, problem_shape: cute.Tensor) -> Int32: + """ + Compute total cluster count. + + :param problem_shape: Tensor containing problem shape (m, n, k, l) + :type problem_shape: cute.Tensor + :return: The total cluster tile count for M and N dimensions + :rtype: Int32 + """ + cur_ntile_m = ( + problem_shape[0] + self.cluster_tile_shape_mnk[0] - 1 + ) // self.cluster_tile_shape_mnk[0] + cur_ntile_n = ( + problem_shape[1] + self.cluster_tile_shape_mnk[1] - 1 + ) // self.cluster_tile_shape_mnk[1] + cur_ntile_mn = cur_ntile_m * cur_ntile_n + return cur_ntile_mn + + def _compute_cta_tile_coord( + self, + cluster_tile_idx: Int32, + cta_tile_coord_in_cluster: tuple, + cluster_tile_count_m: Int32, + cluster_tile_count_n: Int32, + ) -> tuple: + """ + Compute CTA tile indices along M and N dimensions based on the linear index within a group. + + It uses the AlongM mode to decompose the linear index onto M and N dimensions. 
+ + :param cluster_tile_idx: The linear index within a group + :type cluster_tile_idx: Int32 + :param cta_tile_coord_in_cluster: CTA indices along M and N dimensions within a cluster + :type cta_tile_coord_in_cluster: tuple of Int32 + :param cluster_tile_count_m: The number of clusters along M dimension of the matched group + :type cluster_tile_count_m: Int32 + :param cluster_tile_count_n: The number of clusters along N dimension of the matched group + :type cluster_tile_count_n: Int32 + :return: A tuple containing CTA tile indices along M and N dimensions + :rtype: tuple of (Int32, Int32) + """ + cluster_layout_mn = cute.make_layout( + (cluster_tile_count_m, cluster_tile_count_n) + ) + (mi, ni) = cluster_layout_mn.get_hier_coord(cluster_tile_idx) + cta_tile_idx_m = ( + mi * self.tile_sched_params.cluster_shape_mn[0] + + cta_tile_coord_in_cluster[0] + ) + cta_tile_idx_n = ( + ni * self.tile_sched_params.cluster_shape_mn[1] + + cta_tile_coord_in_cluster[1] + ) + return (cta_tile_idx_m, cta_tile_idx_n) + + @cute.jit + def _group_search( + self, + linear_idx: Int32, + problem_shape_mnkl: cute.Tensor, + init_group_idx: Int32, + init_tile_count_searched: Int32, + ) -> GroupedGemmGroupSearchState: + """ + Search which group the linear index belongs to. 
+ + :param linear_idx: The linear index to be decomposed + :type linear_idx: Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :param init_group_idx: The group idx to start the search with + :type init_group_idx: Int32 + :param init_tile_count_searched: The number of tiles we have searched + :type init_tile_count_searched: Int32 + :return: The updated search state + :rtype: GroupedGemmGroupSearchState + """ + c_0 = Int32(0).ir_value() + last_lane_idx = cute.arch.WARP_SIZE - 1 + + tile_count_searched = init_tile_count_searched + start_group_idx = init_group_idx + not_found = linear_idx >= tile_count_searched + tile_count_prev_group = self.search_state.tile_count_prev_group + while not_found: + # get group to search for current lane + cur_group_idx = start_group_idx + self.lane_idx + # check if the group to be checked is out of range + inside_group_bound = cur_group_idx < self.group_count + cur_ntile_mn = c_0 + if inside_group_bound: + # get problem size of current group + cur_problem_mnkl = self._get_problem_for_group( + problem_shape_mnkl, cur_group_idx + ) + cur_ntile_mn = self._get_cluster_tile_count_mn(cur_problem_mnkl) + # compute tile count from beginning to current group(included) + total_cluster_tile_count_ps_per_thread = self._prefix_sum(cur_ntile_mn) + cluster_tile_count_end_per_thread = ( + total_cluster_tile_count_ps_per_thread + tile_count_searched + ) + + group_not_in_window = linear_idx >= cluster_tile_count_end_per_thread + hitted_group_idx_in_search_window = cute.arch.popc( + cute.arch.vote_ballot_sync(group_not_in_window) + ) + not_found = hitted_group_idx_in_search_window == cute.arch.WARP_SIZE + start_group_idx = hitted_group_idx_in_search_window + start_group_idx + hit_the_1st_problem_in_search_window = ( + hitted_group_idx_in_search_window == c_0 + ) + tile_count_prev_group = tile_count_searched + if hit_the_1st_problem_in_search_window == False: + 
tile_count_prev_group = cute.arch.shuffle_sync( + cluster_tile_count_end_per_thread, + hitted_group_idx_in_search_window - 1, + ) + + # If no matched group, then get new_cluster_tile_count_end from last lane + # Otherwise, get new_cluster_tile_count_end from the hitted group + lane_idx_for_cluster_tile_count_end = hitted_group_idx_in_search_window + if not_found: + lane_idx_for_cluster_tile_count_end = last_lane_idx + tile_count_searched = cute.arch.shuffle_sync( + cluster_tile_count_end_per_thread, + lane_idx_for_cluster_tile_count_end, + ) + + return GroupedGemmGroupSearchState( + start_group_idx, + tile_count_prev_group, + tile_count_searched, + ) + + def _group_search_and_load_problem_shape( + self, + linear_idx: Int32, + problem_shape_mnkl: cute.Tensor, + start_group_idx: Int32, + tile_count_searched: Int32, + ) -> Tuple[Int32, cute.Tensor]: + """ + Perform group search and load problem shape for the matched group. + + :param linear_idx: The linear index to be decomposed + :type linear_idx: Int32 + :param problem_shape_mnkl: Tensor containing gemm problem size (M, N, K, L) for all groups + :type problem_shape_mnkl: cute.Tensor + :param start_group_idx: The group idx to start the search with + :type start_group_idx: Int32 + :param tile_count_searched: The number of tiles we have searched + :type tile_count_searched: Int32 + :return: A tuple containing the final group index and the problem shape tensor + :rtype: Tuple[Int32, cute.Tensor] + """ + self.search_state = self._group_search( + linear_idx, + problem_shape_mnkl, + start_group_idx, + tile_count_searched, + ) + # get final group search state + final_group_idx = self.search_state.start_group_idx + # let's revisit if it's better to broadcast problem_shape_mnk in group_search + problem_mnkl = self._get_problem_for_group(problem_shape_mnkl, final_group_idx) + return final_group_idx, problem_mnkl diff --git a/python/CuTeDSL/cutlass/utils/hardware_info.py b/python/CuTeDSL/cutlass/utils/hardware_info.py new file 
mode 100644 index 00000000..e86fcbef --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/hardware_info.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from cuda.bindings import driver, nvrtc + +import cutlass.cute as cute + +""" +This class is used to get the hardware info of given GPU device. +It provides methods to get the max active clusters for given cluster size. + +Prerequisite: +- CUDA driver is initialized via `driver.cuInit` or other CUDA APIs. +- CUDA context is created via `driver.cuCtxCreate` or other CUDA APIs. + +""" + + +class HardwareInfo: + """ + device_id: CUDA device ID to get the hardware info. 
+ """ + + def __init__(self, device_id: int = 0): + count = self._checkCudaErrors(driver.cuDeviceGetCount()) + if device_id >= count: + raise ValueError( + f"Device ID {device_id} is out of range for device count {count}" + ) + self.device_id = device_id + self.device = self._checkCudaErrors(driver.cuDeviceGet(device_id)) + self.context = self._checkCudaErrors(driver.cuCtxGetCurrent()) + self.driver_version = self._checkCudaErrors(driver.cuDriverGetVersion()) + + # Getting the max active clusters for a given cluster size + def get_max_active_clusters(self, cluster_size: int) -> int: + self._get_device_function() + if self._cuda_driver_version_lt(11, 8): + raise RuntimeError( + "CUDA Driver version < 11.8, cannot get _max_active_clusters" + ) + if cluster_size <= 0 or cluster_size > 32: + raise ValueError( + f"Cluster size must be between 1 and 32, {cluster_size} is not supported" + ) + + max_shared_memory_per_block = self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + self.device, + ) + ) + self._checkCudaErrors( + driver.cuFuncSetAttribute( + self.kernel, + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, + max_shared_memory_per_block, + ) + ) + max_dynamic_shared_memory = self._checkCudaErrors( + driver.cuOccupancyAvailableDynamicSMemPerBlock( + self.kernel, 1, 1 # numBlocks # blockSize + ) + ) + max_active_blocks = self._checkCudaErrors( + driver.cuOccupancyMaxActiveBlocksPerMultiprocessor( + self.kernel, 1, max_dynamic_shared_memory # blockSize, + ) + ) + # allow non-portable cluster size to support detection of non-portable cluster size + self._checkCudaErrors( + driver.cuFuncSetAttribute( + self.kernel, + driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED, + 1, + ) + ) + # prepare launch configuration + launch_config = driver.CUlaunchConfig() + launch_config.blockDimX = 128 + launch_config.blockDimY = 1 + 
launch_config.blockDimZ = 1 + launch_config.sharedMemBytes = max_dynamic_shared_memory + launch_config.numAttrs = 1 + # max possible cluster size is 32 + cluster_dims_attr = driver.CUlaunchAttribute() + cluster_dims_attr.id = ( + driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + ) + value = driver.CUlaunchAttributeValue() + value.clusterDim.x = cluster_size + value.clusterDim.y = 1 + value.clusterDim.z = 1 + cluster_dims_attr.value = value + launch_config.attrs = [cluster_dims_attr] + launch_config.gridDimX = cluster_size + launch_config.gridDimY = max_active_blocks + launch_config.gridDimZ = 1 + + num_clusters = self._checkCudaErrors( + driver.cuOccupancyMaxActiveClusters(self.kernel, launch_config) + ) + return num_clusters + + def get_l2_cache_size_in_bytes(self) -> int: + return self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, + self.device, + ) + ) + + def get_device_multiprocessor_count(self) -> int: + return self._checkCudaErrors( + driver.cuDeviceGetAttribute( + driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + self.device, + ) + ) + + def _checkCudaErrors(self, result) -> None: + if result[0].value: + raise RuntimeError( + "CUDA error code={}({})".format( + result[0].value, self._cudaGetErrorEnum(result[0]) + ) + ) + # CUDA APIs always return the status as the first element of the result tuple + if len(result) == 1: + return None + elif len(result) == 2: + return result[1] + else: + return result[1:] + + def _cudaGetErrorEnum(self, error) -> str: + if isinstance(error, driver.CUresult): + err, name = driver.cuGetErrorName(error) + return name if err == driver.CUresult.CUDA_SUCCESS else "" + elif isinstance(error, nvrtc.nvrtcResult): + return nvrtc.nvrtcGetErrorString(error)[1] + else: + raise RuntimeError("Unknown error type: {}".format(error)) + + def _cuda_driver_version_ge(self, major: int, minor: int) -> bool: + return self.driver_version >= (major 
* 1000 + 10 * minor)
+
+    def _cuda_driver_version_lt(self, major: int, minor: int) -> bool:
+        return not self._cuda_driver_version_ge(major, minor)
+
+    @cute.kernel
+    def _empty_kernel(self):
+        return
+
+    @cute.jit
+    def _host_function(self):
+        self._empty_kernel().launch(
+            grid=[1, 1, 1],
+            block=[1, 1, 1],
+        )
+
+    # get an empty kernel to compute occupancy
+    def _get_device_function(self) -> None:
+        self.compiled_kernel = cute.compile(self._host_function)
+        self.module = next(iter(self.compiled_kernel.cuda_modules.modules)).cuda_module
+        self.kernel = next(iter(self.compiled_kernel.cuda_modules.modules)).kernel_ptr
diff --git a/python/CuTeDSL/cutlass/utils/hopper_helpers.py b/python/CuTeDSL/cutlass/utils/hopper_helpers.py
new file mode 100644
index 00000000..d29daf50
--- /dev/null
+++ b/python/CuTeDSL/cutlass/utils/hopper_helpers.py
@@ -0,0 +1,195 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
+#
+# Use of this software is governed by the terms and conditions of the
+# NVIDIA End User License Agreement (EULA), available at:
+# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
+#
+# Any use, reproduction, disclosure, or distribution of this software
+# and related documentation outside the scope permitted by the EULA
+# is strictly prohibited.
+ +from typing import Type, Tuple +from enum import Enum + +from cutlass.utils.layout import LayoutEnum +from cutlass.cutlass_dsl import ( + Float16, + BFloat16, + Float8E5M2, + Float8E4M3FN, + Numeric, + NumericMeta, + dsl_user_op, +) + +import cutlass +import cutlass.cute as cute +from cutlass.cute.nvgpu.common import CopyUniversalOp +from cutlass.cute.nvgpu.warp import StMatrix8x8x16bOp +from cutlass.cute.nvgpu.warpgroup import ( + MmaF16BF16Op, + MmaF8Op, + OperandMajorMode, + OperandSource, +) + +@dsl_user_op +def sm90_get_smem_store_op( + layout_d: LayoutEnum, + elem_ty_d: Type[Numeric], + elem_ty_acc: Type[Numeric], + *, + loc=None, + ip=None, +) -> cute.CopyAtom: + """ + Selects the largest vectorized smem store atom available subject to constraint of gmem layout. + + Parameters: + ----------- + layout_d : LayoutEnum + The layout enum of the output tensor D. + + elem_ty_d : Type[Numeric] + The element type for output tensor D. + + elem_ty_acc : Type[Numeric] + The element type for accumulator. + + Returns: + -------- + Either SmemStoreMatrix or SimtSyncCopy, based on the input parameters. 
+ """ + + def validate_type(ty, ty_name): + if not isinstance(ty, NumericMeta): + raise TypeError(f"{ty_name} must be a Numeric, but got {ty}") + + validate_type(elem_ty_d, "elem_ty_d") + validate_type(elem_ty_acc, "elem_ty_acc") + + is_m_major = layout_d.is_m_major_c() + + if elem_ty_d.width == 16: + return cute.make_copy_atom( + StMatrix8x8x16bOp(is_m_major, 4), elem_ty_d, loc=loc, ip=ip + ) + else: + return cute.make_copy_atom(CopyUniversalOp(), elem_ty_d, loc=loc, ip=ip) + + +class SmemCapacity(Enum): + SM90_SMEM_CAPACITY_BYTES = (228 - 1) * 1024 + + +# Dictionary to map compute capability to SMEM capacity +SMEM_CAPACITY = { + "sm90": SmemCapacity.SM90_SMEM_CAPACITY_BYTES.value, +} + +def make_trivial_tiled_mma( + a_dtype: Type[Numeric], + b_dtype: Type[Numeric], + a_leading_mode: OperandMajorMode, + b_leading_mode: OperandMajorMode, + acc_dtype: Type[Numeric], + atom_layout_mnk: Tuple[int, int, int], + tiler_mn: Tuple[int, int], +) -> cute.TiledMma: + """Make a tiled MMA atom with given data type, leading dimension, cta group and mma tile shape. + By default, the MMA atom is created with SMEM operand source for A. + + :param a_dtype: Data type of operand A. + :type a_dtype: type[Numeric] + :param b_dtype: Data type of operand B. + :type b_dtype: type[Numeric] + :param a_leading_mode: Leading dimension of operand A (1 for K, 0 for M/N). + :type a_leading_mode: warpgroup.OperandMajorMode + :param b_leading_mode: Leading dimension of operand B (1 for K, 0 for M/N). + :type b_leading_mode: warpgroup.OperandMajorMode + :param acc_dtype: Data type of the accumulator. + :type acc_dtype: type[Numeric] + :param atom_layout_mnk: A integer tuple describing the tiling of Atom across threads. + :type atom_layout_mnk: Tuple[int, int, int] + :param tiler_mn: The shape (M, N) of the cta tiler. + :type tiler_mn: Tuple[int, int] + + :return: A tiled MMA atom. + :rtype: cute.TiledMma + + :raises TypeError: If the data type is not supported. 
+ """ + + if a_dtype in {Float16, BFloat16}: + if cutlass.const_expr(a_dtype != b_dtype): + raise TypeError(f"Type mismatch: {a_dtype} != {b_dtype}") + if cutlass.const_expr(a_dtype.width != b_dtype.width): + raise TypeError(f"Type width mismatch: {a_dtype.width} != {b_dtype.width}") + + mma_op = MmaF16BF16Op( + a_dtype, + acc_dtype, + (*tiler_mn, 16), + OperandSource.SMEM, + a_leading_mode, + b_leading_mode, + ) + elif a_dtype in {Float8E4M3FN, Float8E5M2} and b_dtype in { + Float8E4M3FN, + Float8E5M2, + }: + mma_op = MmaF8Op( + a_dtype, + b_dtype, + acc_dtype, + (*tiler_mn, 32), + OperandSource.SMEM, + a_leading_mode, + b_leading_mode, + ) + else: + raise TypeError(f"unsupported a_dtype and b_dtype, got {a_dtype} and {b_dtype}") + + return cute.make_tiled_mma(cute.make_mma_atom(mma_op), atom_layout_mnk) + +def get_smem_layout_atom( + layout: LayoutEnum, + element_type: Type[Numeric], + major_mode_size: int, + *, + loc=None, + ip=None, +): + """Select the optimal shared memory layout atom based on parameters. 
+ + :param layout: Layout enum of the tensor + :type layout: LayoutEnum + :param element_type: Data type of the elements + :type element_type: type[cutlass.Numeric] + :param major_mode_size: Size of the major mode dimension + :type major_mode_size: int + + :return: Selected shared memory layout atom kind + :rtype: cute.nvgpu.warpgroup.SmemLayoutAtomKind + """ + assert major_mode_size % 8 == 0 + sw128_num_contiguous_bits = 1024 + sw64_num_contiguous_bits = 512 + sw32_num_contiguous_bits = 256 + major_mode_size_bits = major_mode_size * element_type.width + if layout.sm90_mma_major_mode() == OperandMajorMode.MN: + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_SW32 + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.MN_INTER + if major_mode_size_bits % sw128_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW128 + if major_mode_size_bits % sw64_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW64 + if major_mode_size_bits % sw32_num_contiguous_bits == 0: + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_SW32 + return cute.nvgpu.warpgroup.SmemLayoutAtomKind.K_INTER diff --git a/python/CuTeDSL/cutlass/utils/layout.py b/python/CuTeDSL/cutlass/utils/layout.py new file mode 100644 index 00000000..a1261d4d --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/layout.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from enum import Enum + +import cutlass.cute as cute +from cutlass.cute.nvgpu import warpgroup +from cutlass.cute.nvgpu import tcgen05 + + +class LayoutEnum(Enum): + ROW_MAJOR = "row_major" + COL_MAJOR = "col_major" + + def mma_major_mode(self): + return ( + tcgen05.OperandMajorMode.K + if self == LayoutEnum.ROW_MAJOR + else tcgen05.OperandMajorMode.MN + ) + + def sm90_mma_major_mode(self): + return ( + warpgroup.OperandMajorMode.K + if self == LayoutEnum.ROW_MAJOR + else warpgroup.OperandMajorMode.MN + ) + + def is_k_major_a(self): + return self == LayoutEnum.ROW_MAJOR + + def is_m_major_a(self): + return self == LayoutEnum.COL_MAJOR + + def is_k_major_b(self): + return self == LayoutEnum.COL_MAJOR + + def is_n_major_b(self): + return self == LayoutEnum.ROW_MAJOR + + def is_n_major_c(self): + return self == LayoutEnum.ROW_MAJOR + + def is_m_major_c(self): + return self == LayoutEnum.COL_MAJOR + + @staticmethod + def from_tensor(tensor: cute.Tensor) -> "LayoutEnum": + ret = None + if tensor.leading_dim == 1: + ret = LayoutEnum.ROW_MAJOR + elif tensor.leading_dim == 0: + ret = LayoutEnum.COL_MAJOR + else: + raise ValueError(f"Invalid leading dimension: {tensor.leading_dim}") + + return ret + + +__all__ = ["LayoutEnum"] diff --git a/python/CuTeDSL/cutlass/utils/pipeline.py b/python/CuTeDSL/cutlass/utils/pipeline.py new file mode 100644 index 00000000..a339a3e3 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/pipeline.py @@ -0,0 +1,984 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +import enum +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional + +from cutlass.cutlass_dsl import Boolean, Int32, Int64, T, if_generate, and_, or_ + +import cutlass._mlir.dialects.cute as _cute_ir + +import cutlass.cute as cute + + +############################################################################## +# Agent class +############################################################################## + + +class Agent(enum.Enum): + """ + Agent indicates what is participating in the pipeline synchronization. + """ + # Arbitrary grouping of N threads + Thread = enum.auto() + # Same as AsyncThread, but includes all threads in the block + ThreadBlock = enum.auto() + # Same as AsyncThread, but includes all threads in the cluster + ThreadBlockCluster = enum.auto() + + +class CooperativeGroup: + """ + CooperativeGroup contains size and alignment restrictions for an Agent. + """ + def __init__(self, agent: Agent, size: int = 1, alignment: int = 1): + if agent is Agent.Thread: + assert size > 0 + if size == 32: + assert ( + size == alignment + ), "Error: Alignment does not match number of threads in a warp." + elif size == 128: + assert ( + size == alignment + ), "Error: Alignment does not match number of threads in a warpgroup." + elif agent is Agent.ThreadBlock: + assert False, "Error: Not yet supported." + elif agent is Agent.ThreadBlockCluster: + assert False, "Error: Not yet supported." 
+ else: + # Should never reach this state + size = 0 + + if size <= 0: + raise ValueError( + "Error: The number of threads in a CooperativeGroup must be more than 0." + ) + + # Size indicates how many threads are participating in this CooperativeGroup + self.size = size + # Agent indicates the type of thread group + self.agent = agent + + +class _PipelineOp(enum.Enum): + """ + PipelineOp assigns an operation to an agent corresponding to a specific hardware feature. + """ + # async-threads + AsyncThread = enum.auto() + # Blackwell (SM100a) MMA instruction + TCGen05Mma = enum.auto() + # Tensor Memory Accelerator load + TmaLoad = enum.auto() + # TMA Store consuming smem produced by AsyncThread + TmaStore = enum.auto() + + +def _get_pipeline_op(type_str): + return _PipelineOp(type_str) + + +############################################################################## +# SyncObjectArray class +############################################################################## + + +class SyncObjectArray(ABC): + """ + SyncObjectArray is an abstract base class for different types of hardware synchronizations (e.g. smem barriers, named barriers, fences) + """ + + @abstractmethod + def wait(self): + pass + + @abstractmethod + def arrive(self): + pass + + @abstractmethod + def get_barrier(self): + pass + + +class MbarrierArray(SyncObjectArray): + """ + MbarrierArray implements an abstraction for an array of smem barriers. 
+ """ + + def __init__( + self, + barrier_storage: cute.Pointer, + num_stages: int, + agent: tuple[_PipelineOp, CooperativeGroup], + tx_count: int = 0, + ): + self.barrier_storage = barrier_storage + self.tx_count = tx_count + self.num_stages = num_stages + self.op_type, self.cg = agent + self.arrive_count = self.cg.size + + if self.num_stages <= 0: + raise ValueError("Error: Mbarrier stage count must be greater than 0.") + if self.arrive_count <= 0: + raise ValueError("Error: Mbarrier arrive count must be greater than 0.") + if self.op_type is _PipelineOp.TmaLoad and self.tx_count <= 0: + raise ValueError( + "Error: Mbarrier tx count must be greater than 0 for TMA ops." + ) + + # Using a tensor to store mbarrier i64 ptrs + self.mbarrier_array = cute.make_fragment(cute.make_layout(num_stages), Int64) + for i in range(num_stages): + self.mbarrier_array[i] = _cute_ir.ptrtoint( + T.i64(), (self.barrier_storage + i).value + ) + + # Mbarrier initialization in constructor + self.mbarrier_init() + + # Mbarrier initialization + def mbarrier_init(self): + """ + Initializes an array of mbarriers using warp 0. + """ + def then_body(): + for index in range(self.num_stages): + cute.arch.mbarrier_init_arrive_cnt( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), self.arrive_count + ) + + warp_idx = cute.arch.warp_idx() + warp_idx = cute.arch.make_warp_uniform(warp_idx) + + if_generate(warp_idx == 0, then_body) + + def arrive(self, index: int, dst: int): + """ + Select the arrive corresponding to this MbarrierArray's PipelineOp + :param index: Index of the mbarrier in the array to arrive on + :type index: int + :param dst: Destination parameter for selective arrival, which can be either a mask or destination cta rank. When None, both TCGen05Mma and AsyncThread will arrive on their local mbarrier. + - For TCGen05Mma, dst serves as a multicast mask (e.g., 0b1011 allows arrive signal to be multicast to CTAs in the cluster with rank = 0, 1, and 3). 
+ - For AsyncThread, dst serves as a destination cta rank (e.g., 3 means threads will arrive on the mbarrier with rank = 3 in the cluster). + :type dst: int | None + """ + if self.op_type is _PipelineOp.AsyncThread: + self.arrive_mbarrier(index, dst) + elif self.op_type is _PipelineOp.TCGen05Mma: + self.arrive_tcgen05mma(index, dst) + elif self.op_type in [_PipelineOp.TmaLoad]: + self.arrive_and_expect_tx(index, self.tx_count) + else: + print(_get_pipeline_op(self.op_type)) + assert False, "Error: MbarrierArray is not supported for this PipelineOp." + + def arrive_mbarrier(self, index: int, dst_rank: int): + if dst_rank is None: + cute.arch.mbarrier_arrive(_mbarrier_i64_to_ptr(self.mbarrier_array[index])) + else: + cute.arch.mbarrier_arrive( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), dst_rank + ) + + def arrive_tcgen05mma(self, index: int, mask: int): + if mask is None: + with cute.arch.elect_one(): + cute.nvgpu.tcgen05.commit( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]) + ) + else: + with cute.arch.elect_one(): + cute.nvgpu.tcgen05.commit( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), + mask, + cute.nvgpu.tcgen05.CtaGroup.TWO, + ) + + def arrive_and_expect_tx(self, index: int, tx_count: int): + with cute.arch.elect_one(): + cute.arch.mbarrier_init_tx_bytes( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), tx_count + ) + + def try_wait(self, index: int, phase: int): + return cute.arch.mbarrier_try_wait( + _mbarrier_i64_to_ptr(self.mbarrier_array[index]), phase + ) + + def wait(self, index: int, phase: int): + cute.arch.mbarrier_wait(_mbarrier_i64_to_ptr(self.mbarrier_array[index]), phase) + + def get_barrier(self, index: int) -> cute.Pointer: + return _mbarrier_i64_to_ptr(self.mbarrier_array[index]) + + +class TmaStoreFence(SyncObjectArray): + """ + TmaStoreFence is used for a multi-stage epilogue buffer. 
+ """ + + def __init__( + self, + num_stages: int = 0, + ): + if num_stages <= 0: + raise ValueError("Mbarrier stage count must be greater than 0.") + + self.num_stages = num_stages + + def arrive(self): + cute.arch.cp_async_bulk_commit_group() + + def wait(self): + cute.arch.cp_async_bulk_wait_group(self.num_stages - 1, read=True) + + # TmaStoreFence doesn't have mbarriers + def get_barrier(self): + assert ( + False + ), "Error: TmaStoreFence doesn't use mbarriers and cannot return a barrier." + + def tail(self): + cute.arch.cp_async_bulk_wait_group(0, read=True) + + +############################################################################## +# PipelineState class +############################################################################## + + +class PipelineUserType(enum.Enum): + Producer = enum.auto() + Consumer = enum.auto() + + +class PipelineState: + """ + Pipeline state contains an index and phase bit corresponding to the current position in the circular buffer. + """ + + def __init__(self, stages: int, count, index, phase): + self._stages = stages + self._count = count + self._index = index + self._phase = phase + + def clone(self) -> "PipelineState": + return PipelineState(self.stages, self._count, self.index, self.phase) + + @property + def index(self) -> Int32: + return self._index + + @property + def count(self) -> Int32: + return self._count + + @property + def stages(self) -> int: + return self._stages + + @property + def phase(self) -> Int32: + return self._phase + + def reset_count(self): + self._count = Int32(0) + + def advance(self): + self._index += 1 + self._count += 1 + + def then_body(index, phase): + new_index = Int32(0) + new_phase = phase ^ 1 + return new_index, new_phase + + def else_body(index, phase): + return index, phase + + self._index, self._phase = if_generate( + self._index == self.stages, + then_body, + else_body, + [self.index, self.phase], + [Int32, Int32], + ) + + def reverse(self): + self._index -= 1 + self._count -= 1 
+ + def then_body(index, phase): + new_index = Int32(self.stages - 1) + new_phase = phase ^ 1 + return new_index, new_phase + + def else_body(index, phase): + return index, phase + + self._index, self._phase = if_generate( + self._index == -1, + then_body, + else_body, + [self.index, self.phase], + [Int32, Int32], + ) + + def __get_mlir_types__(self): + return [self._count.type, self._index.type, self._phase.type] + + def __extract_mlir_values__(self): + count = self._count + index = self._index + phase = self._phase + return [count.ir_value(), index.ir_value(), phase.ir_value()] + + # This can be overridden by derived classes + def __new_from_mlir_values__(self, values): + return PipelineState( + self.stages, Int32(values[0]), Int32(values[1]), Int32(values[2]) + ) + + +def make_pipeline_state(type: PipelineUserType, stages: int): + """ + Creates a pipeline state. Producers are assumed to start with an empty buffer and have a flipped phase bit of 1. + """ + if type is PipelineUserType.Producer: + return PipelineState( + stages, + Int32(0), + Int32(0), + Int32(1), + ) + elif type is PipelineUserType.Consumer: + return PipelineState( + stages, + Int32(0), + Int32(0), + Int32(0), + ) + else: + assert ( + False + ), "Error: invalid PipelineUserType specified for make_pipeline_state." + + +############################################################################## +# Pipeline classes +############################################################################## + + +@dataclass(frozen=True) +class PipelineAsync: + """ + PipelineAsync is a generic pipeline class where both the producer and consumer are + AsyncThreads. It also serves as a base class for specialized pipeline classes. 
+ """ + sync_object_array_full: SyncObjectArray + sync_object_array_empty: SyncObjectArray + num_stages: Int32 + producer_mask: Int32 + consumer_mask: Int32 + + @staticmethod + def _make_sync_object_array( + barrier_storage: cute.Pointer, + num_stages: Int32, + agent: tuple[_PipelineOp, CooperativeGroup], + tx_count: int = 0, + ) -> SyncObjectArray: + """ + Returns a SyncObjectArray corresponding to an agent's PipelineOp. + """ + if agent[0] in [ + _PipelineOp.AsyncThread, + _PipelineOp.TmaLoad, + _PipelineOp.TCGen05Mma, + ]: + return MbarrierArray( + barrier_storage=barrier_storage, + num_stages=num_stages, + agent=agent, + tx_count=tx_count, + ) + elif agent[0] is _PipelineOp.TmaStore: + # Path taken for AsyncTmaStore + return TmaStoreFence(num_stages=num_stages) + else: + assert False, "Error: Invalid PipelineOp specified." + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + producer_mask: Int32 = None, + consumer_mask: Int32 = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param producer_mask: Mask for signaling arrives for the producer agent + :type producer_mask: Int32 | None + :param consumer_mask: Mask for signaling arrives for the consumer agent + :type consumer_mask: Int32 | None + """ + producer_type = _PipelineOp.AsyncThread + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + pipeline_init_wait() + + return PipelineAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + + def producer_try_acquire(self, state: PipelineState): + return self.sync_object_array_empty.try_wait(state.index, state.phase) + + def producer_commit(self, state: PipelineState): + self.sync_object_array_full.arrive(state.index, self.producer_mask) + + def consumer_wait( + self, state: PipelineState, try_wait_token: Optional[Boolean] = None + ): + if_generate( + try_wait_token is None or try_wait_token == 0, + lambda: self.sync_object_array_full.wait(state.index, state.phase), + ) + + 
def consumer_try_wait(self, state: PipelineState): + return self.sync_object_array_full.try_wait(state.index, state.phase) + + def consumer_release(self, state: PipelineState): + self.sync_object_array_empty.arrive(state.index, self.consumer_mask) + + def producer_get_barrier(self, state: PipelineState) -> cute.Pointer: + return self.sync_object_array_full.get_barrier(state.index) + + def producer_tail(self, state: PipelineState): + """ + Make sure the last used buffer empty signal is visible to producer. + Producer tail is usually executed by producer before exit, to avoid dangling + mbarrier arrive signals after kernel exit. + + :param state: The pipeline state that points to next useful buffer + :type state: PipelineState + """ + # Assume state contains that next useful buffer + # So we only need to advance to num_stages - 1 times to last used buffer + for i in range(self.num_stages - 1): + state.advance() + self.producer_acquire(state) + + +@dataclass(frozen=True) +class PipelineTmaAsync(PipelineAsync): + """ + PipelineTmaAsync is used for TMA producers and AsyncThread consumers (e.g. Hopper mainloops). 
+ """ + is_signalling_thread: bool + + @staticmethod + def init_empty_barrier_arrive_signal(cta_layout_vmnk: cute.Layout): + """ + Initialize the empty barrier arrive signal + This function returns the destination cta rank and a boolean indicating if the signalling thread is the same as the current thread + """ + # Logic to optimally schedule Empty Arrives + cluster_shape_mnk = cta_layout_vmnk.shape + tidx, _, _ = cute.arch.thread_idx() + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + + is_signalling_thread = tidx < cute.size(cluster_shape_mnk) + dst_rank = tidx % cute.size(cluster_shape_mnk) + m = cluster_shape_mnk[0] + + # Check if same row + is_same_row_l = dst_rank % m + is_same_row_r = cta_rank_in_cluster % m + is_same_row = is_same_row_l == is_same_row_r + + # Check if same column + is_same_col_l = dst_rank // m + is_same_col_r = cta_rank_in_cluster // m + + is_same_col = is_same_col_l == is_same_col_r + + is_same_row_or_col = or_(is_same_row, is_same_col) + is_signalling_thread_final = and_(is_signalling_thread, is_same_row_or_col) + + return dst_rank, is_signalling_thread_final + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + tx_count: int, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage + :type tx_count: int + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TmaLoad + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer, tx_count + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + dst_rank, is_signalling_thread = ( + PipelineTmaAsync.init_empty_barrier_arrive_signal(cta_layout_vmnk) + ) + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + dst_rank = None + else: + dst_rank = dst_rank + + is_signalling_thread = is_signalling_thread + producer_mask = None + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineTmaAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + dst_rank, + is_signalling_thread, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + """ + TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks. 
+ """ + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + self.sync_object_array_full.arrive(state.index, self.producer_mask) + + + def producer_commit(self, state: PipelineState): + """ + TMA producer commit is a NOP. The transaction barrier signals the commit upon completion of the TMA. + """ + pass + + def consumer_release(self, state: PipelineState): + """ + TMA consumer release conditionally signals the empty buffer to the producer. + """ + if_generate( + self.is_signalling_thread, + lambda: self.sync_object_array_empty.arrive( + state.index, self.consumer_mask + ), + ) + +@dataclass(frozen=True) +class PipelineTmaUmma(PipelineAsync): + """ + PipelineTmaUmma is used for TMA producers and UMMA consumers (e.g. Blackwell mainloops). + """ + is_leader_cta: bool + + @staticmethod + def _compute_mcast_arrival_mask(cta_layout_vmnk: cute.Layout): + """ + Computes a mask for signaling arrivals to multicasting threadblocks. 
+ """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + cta_in_cluster_coord_vmnk = cta_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + + tma_mcast_mask_a = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mcast_mode=2 + ) + tma_mcast_mask_b = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mcast_mode=1 + ) + + block_in_cluster_coord_vmnk_peer = ( + cta_in_cluster_coord_vmnk[0] ^ 1, + *cta_in_cluster_coord_vmnk[1:], + ) + tma_mcast_mask_a_peer = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=2 + ) + tma_mcast_mask_b_peer = cute.nvgpu.cpasync.create_tma_multicast_mask( + cta_layout_vmnk, block_in_cluster_coord_vmnk_peer, mcast_mode=1 + ) + return ( + tma_mcast_mask_a + | tma_mcast_mask_b + | tma_mcast_mask_a_peer + | tma_mcast_mask_b_peer + ) + + @staticmethod + def _compute_is_leader_cta(cta_layout_vmnk: cute.Layout): + """ + Computes leader threadblocks for 2CTA kernels. For 1CTA, all threadblocks are leaders. + """ + bidx, bidy, _ = cute.arch.block_idx() + + mma_coord_vmnk = ( + bidx % cute.size(cta_layout_vmnk, mode=[0]), + bidx // cute.size(cta_layout_vmnk, mode=[0]), + bidy, + None, + ) + return mma_coord_vmnk[0] == 0 + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + tx_count: int, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaUmma. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param tx_count: Number of bytes expected to be written to the transaction barrier for one stage + :type tx_count: int + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TmaLoad + consumer_type = _PipelineOp.TCGen05Mma + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer, tx_count + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # No mcast mask if not using clusters + producer_mask = None + # All threadblocks are leaders if not using clusters + is_leader_cta = True + else: + producer_mask = PipelineTmaUmma._compute_mcast_arrival_mask(cta_layout_vmnk) + is_leader_cta = PipelineTmaUmma._compute_is_leader_cta(cta_layout_vmnk) + + consumer_mask = producer_mask + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineTmaUmma( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + is_leader_cta, + ) + + def producer_acquire( + self, state: PipelineState, try_acquire_token: Optional[Boolean] = None + ): + """ + TMA producer commit conditionally waits on buffer empty and sets the transaction barrier for leader threadblocks. 
+ """ + if_generate( + try_acquire_token is None or try_acquire_token == 0, + lambda: self.sync_object_array_empty.wait(state.index, state.phase), + ) + if_generate( + self.is_leader_cta, + lambda: self.sync_object_array_full.arrive(state.index, self.producer_mask), + ) + + def producer_commit(self, state: PipelineState): + """ + TMA producer commit is a NOP. The transaction barrier signals the commit upon completion of the TMA. + """ + pass + + +@dataclass(frozen=True) +class PipelineUmmaAsync(PipelineAsync): + """ + PipelineTmaUmma is used for UMMA producers and AsyncThread consumers (e.g. Blackwell accumulator pipelines). + """ + + @staticmethod + def _compute_tmem_sync_mask(cta_layout_vmnk: cute.Layout): + """ + Computes a mask to signal completion of tmem buffers for 2CTA kernels. + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + cta_in_cluster_coord_vmnk = cta_layout_vmnk.get_flat_coord(cta_rank_in_cluster) + return cute.make_layout_image_mask( + cta_layout_vmnk, cta_in_cluster_coord_vmnk, mode=0 + ) + + @staticmethod + def _compute_peer_cta_rank(): + """ + Computes a mask to signal release of tmem buffers for 2CTA kernels. + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + return cta_rank_in_cluster // 2 * 2 + + @staticmethod + def create( + barrier_storage: cute.Pointer, + num_stages: Int32, + producer_group: CooperativeGroup, + consumer_group: CooperativeGroup, + cta_layout_vmnk: Optional[cute.Layout] = None, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineUmmaAsync. 
+ :param barrier_storage: Pointer to the smem address for this pipeline's mbarriers + :type barrier_storage: cute.Pointer + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + :param consumer_group: CooperativeGroup for the consumer agent + :type consumer_group: CooperativeGroup + :param cta_layout_vmnk: Layout of the cluster shape + :type cta_layout_vmnk: cute.Layout | None + """ + producer_type = _PipelineOp.TCGen05Mma + consumer_type = _PipelineOp.AsyncThread + + producer = (producer_type, producer_group) + consumer = (consumer_type, consumer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8), num_stages, producer + ) + sync_object_array_empty = PipelineAsync._make_sync_object_array( + barrier_storage.align(min_align=8) + num_stages, num_stages, consumer + ) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # Set mask to None if not using clusters (i.e. 1CTA kernels) + producer_mask = None + else: + producer_mask = PipelineUmmaAsync._compute_tmem_sync_mask(cta_layout_vmnk) + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk, mode=[0]) == 1: + # Set mask to None if not using 2CTA intructions + consumer_mask = None + else: + consumer_mask = PipelineUmmaAsync._compute_peer_cta_rank() + + pipeline_init_wait(cta_layout_vmnk) + + return PipelineUmmaAsync( + sync_object_array_full, + sync_object_array_empty, + num_stages, + producer_mask, + consumer_mask, + ) + + def producer_tail(self, state: PipelineState): + """ + Make sure the last used buffer empty signal is visible to producer. + Producer tail is usually executed by producer before exit, to avoid dangling + mbarrier arrive signals after kernel exit. 
+ + :param state: The pipeline state that points to next useful buffer + :type state: PipelineState + """ + cta_rank_in_cluster = cute.arch.make_warp_uniform( + cute.arch.block_idx_in_cluster() + ) + is_leader_cta = cta_rank_in_cluster % 2 == 0 + + def then_body(): + # Assume state contains that next useful buffer + # So we only need to advance to num_stages - 1 times to last used buffer + for i in range(self.num_stages - 1): + state.advance() + self.producer_acquire(state) + + if_generate(is_leader_cta, then_body) + + +@dataclass(frozen=True) +class PipelineTmaStore(PipelineAsync): + """ + PipelineTmaStore is used for synchronizing TMA stores in the epilogue. It does not use mbarriers. + """ + + @staticmethod + def create( + num_stages: Int32, + producer_group: CooperativeGroup, + ): + """ + This helper function computes any necessary attributes and returns an instance of PipelineTmaStore. + :param num_stages: Number of buffer stages for this pipeline + :type num_stages: Int32 + :param producer_group: CooperativeGroup for the producer agent + :type producer_group: CooperativeGroup + """ + producer_type = _PipelineOp.TmaStore + + producer = (producer_type, producer_group) + + sync_object_array_full = PipelineAsync._make_sync_object_array( + None, num_stages, producer + ) + + return PipelineTmaStore(sync_object_array_full, None, num_stages, None, None) + + def producer_acquire(self): + self.sync_object_array_full.wait() + + def producer_commit(self): + self.sync_object_array_full.arrive() + + def consumer_wait(self): + assert False, "Error: PipelineTmaStore does not have a consumer agent." + + def consumer_release(self): + assert False, "Error: PipelineTmaStore does not have a consumer agent." 
+ + def producer_tail(self): + self.sync_object_array_full.tail() + + +############################################################################## +# Helper functions +############################################################################## + + +def pipeline_init_wait(cta_layout_vmnk: Optional[cute.Layout] = None): + """ + Fences the mbarrier init and syncs the threadblock or cluster + """ + cute.arch.mbarrier_init_fence() + + if cta_layout_vmnk is None or cute.size(cta_layout_vmnk) == 1: + # If not using clusters, sync the threadblock + _sync(Agent.ThreadBlock) + else: + # If using clusters, sync the cluster + _sync(Agent.ThreadBlockCluster) + + +def _sync(group: Agent): + """ + Syncs all threads within an agent. + """ + if group is Agent.Thread: + assert False, "Error: Not supported." + elif group is Agent.ThreadBlock: + cute.arch.sync_threads() + elif group is Agent.ThreadBlockCluster: + cute.arch.cluster_arrive() + cute.arch.cluster_wait() + else: + assert ( + False + ), "Error: No explicit sync instruction exists. Please use barriers (named / mbarrier) instead." + + +def _mbarrier_i64_to_ptr(val: Int64) -> cute.Pointer: + """ + Converts a smem pointer of type Int64 to cute.Pointer with 8B alignment + """ + return cute.make_ptr( + Int64, + val.ir_value(), + mem_space=_cute_ir.AddressSpace.smem, + assumed_align=8, + ) diff --git a/python/CuTeDSL/cutlass/utils/smem_allocator.py b/python/CuTeDSL/cutlass/utils/smem_allocator.py new file mode 100644 index 00000000..3e3a4020 --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/smem_allocator.py @@ -0,0 +1,217 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import Type, Union, overload + +from cutlass.cutlass_dsl import Int8, Numeric, NumericMeta + +import cutlass.cute as cute +from cutlass.cute.arch import get_dyn_smem + + +class SmemAllocator: + """ + A class for managing shared memory allocation on GPU. + + This class manages a chunk of shared memory and provide APIs for sub-allocation + inside the chunk. + + Attributes + ---------- + _base : cute.Pointer as i8 typed dynamic value + The current base address of the shared memory. + + _allocated_bytes: + The bytes allocated in shared memory. + + Methods + ------- + allocate(num_bytes, alignment) + Allocates num_bytes in the shared memory with the given byte alignment. + + allocate_value(value_ty, num_elems) + Allocates num_elems of value_ty values in the shared memory. + + allocate_tensor(value_ty, layout, alignment) + Allocates a tensor in the shared memory with given layout and byte alignment. + + Notes + ----- + This class is responsible for managing the allocation of tensors in shared memory. + """ + + def __init__(self): + """ + Initializes the SmemAllocator instance with dynamic smem base ptr, + which is i8 type and aligned to 1024. + + """ + self._base = get_dyn_smem(Int8, alignment=1024) + self._allocated_bytes = 0 + + @overload + def allocate(self, size_or_type: int, byte_alignment: int): ... + + @overload + def allocate(self, size_or_type: cute.struct, byte_alignment: int): ... 
+ + def allocate(self, size_or_type, byte_alignment: int = 1) -> int: + """ + Allocates a block of memory with the specified size and byte alignment. + + This method adjusts the base cute.Pointer to ensure that the allocated memory + is aligned according to the specified byte alignment. It updates the internal + state to reflect the new base cute.Pointer and the total allocated bytes. + + Parameters + ---------- + size_or_type : int or struct + The number of bytes to allocate or struct class. + byte_alignment : int + The byte alignment requirement for the allocation. Defaults to 1 (no alignment). + + Returns + ---------- + A cute.Pointer to the start of the allocated memory block or struct instance. + + Raises + ---------- + ValueError + If num_bytes is negative or if byte_alignmemt is less than 1. + """ + + if isinstance(size_or_type, cute.struct): + alignment = max(byte_alignment, size_or_type.__alignof__()) + base_ptr = self.allocate(size_or_type.__sizeof__(), alignment) + return size_or_type(base_ptr) + + num_bytes = size_or_type + if num_bytes < 0: + raise ValueError("num_bytes must be non-negative") + if byte_alignment < 1: + raise ValueError("byte_alignment must be at least 1") + + self._base = self._base.align(byte_alignment) + ptr = self._base + self._base += num_bytes + if self._allocated_bytes % byte_alignment != 0: + self._allocated_bytes += ( + byte_alignment - self._allocated_bytes % byte_alignment + ) + self._allocated_bytes += num_bytes + return ptr + + def allocate_array(self, element_type: Type[Numeric], num_elems: int = 1): + """ + Allocates num_elems values of element_type in shared memory. + + This method calls allocate() to return a byte ptr, pointing to start of shared + memory. Then calls cute.recast_ptr() to recast this byte cute.Pointer to element_type. + + Parameters + ---------- + element_type : Type[Numeric] + The type of the values in the tensor. + num_elems : int, optional + The number of elements for each allocation. Defaults to 1. 
+ + Returns + ---------- + A value_type cute.Pointer to the start of the allocated memory block. + + Raises + ---------- + ValueError + If num_elems is less than 1. + """ + if num_elems < 1: + raise ValueError("num_elems must be at least 1") + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"value_ty must be a type of Numeric, but got {element_type}" + ) + + ptr = self.allocate( + element_type.width // 8 * num_elems, element_type.width // 8 + ) + + return cute.recast_ptr(ptr, dtype=element_type) + + def allocate_tensor( + self, + element_type: Type[Numeric], + layout: Union[int, cute.Layout, cute.ComposedLayout], + byte_alignment: int = 1, + swizzle: cute.Swizzle = None, + ): + """ + Allocates a tensor in the shared memory with value type, layout and byte alignment. + + Parameters + ---------- + element_type : Type[Numeric] + The type of the values in the tensor. + layout : int | DynamicInt | cute.Layout | cute.ComposedLayout + The layout of the tensor. + byte_alignment : int, optional + The byte alignment requirement for the allocation. Defaults to 1 (no alignment). + swizzle : cute.Swizzle + A swizzle for the iterator (for position-dependent swizzling). + + Returns + ------- + tensor : cute.Tensor + The allocated tensor with specified value type, layout and byte alignment. + + Notes + ----- + The base address is updated to point to the next available memory location. 
+ """ + if not isinstance(element_type, NumericMeta): + raise TypeError( + f"value_ty must be a type of Numeric, but got {element_type}" + ) + + if ( + isinstance(layout, cute.ComposedLayout) + and isinstance(layout.inner, cute.Swizzle) + ) and (swizzle is not None): + raise TypeError( + f"iterator swizzle with swizzle layout is currently not supported" + ) + + if isinstance(layout, int): + layout = cute.make_layout(layout) + + profile = layout(0) + if isinstance(profile, tuple): + raise TypeError( + f"cannot allocate a shared memory tensor with a non-integer iterator" + ) + + if not cute.is_static(layout.type): + raise NotImplementedError(f"dynamic layout is not supported: {layout.type}") + + # At least align the allocation to the natural alignment given by the element type + if element_type.width // 8 > byte_alignment: + byte_alignment = element_type.width // 8 + + # Relevant only for sub-byte data types: verify that the entire allocation is byte-aligned + cosize_in_bits = cute.cosize(layout) * element_type.width + assert isinstance(cosize_in_bits, int) + if cosize_in_bits % 8 != 0: + raise ValueError("invalid allocation that is not byte-aligned") + + num_bytes = cosize_in_bits // 8 + ptr = self.allocate(num_bytes, byte_alignment) + ptr = cute.recast_ptr(ptr, swizzle, dtype=element_type) + res = cute.make_tensor(ptr, layout) + return res diff --git a/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py b/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py new file mode 100644 index 00000000..1a4d13de --- /dev/null +++ b/python/CuTeDSL/cutlass/utils/static_persistent_tile_scheduler.py @@ -0,0 +1,384 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from typing import Tuple

from cutlass.cutlass_dsl import (
    Boolean,
    Integer,
    Int32,
    min,
    extract_mlir_values,
    new_from_mlir_values,
    dsl_user_op,
)
from cutlass._mlir import ir
import cutlass.cute as cute

##############################################################################
# Static persistent tile scheduler
##############################################################################


class WorkTileInfo:
    """A class to represent information about a work tile.

    :ivar tile_idx: The index of the tile.
    :type tile_idx: cute.Coord
    :ivar is_valid_tile: Whether the tile is valid.
    :type is_valid_tile: Boolean
    """

    def __init__(self, tile_idx: cute.Coord, is_valid_tile: Boolean):
        self._tile_idx = tile_idx
        self._is_valid_tile = Boolean(is_valid_tile)

    def __extract_mlir_values__(self) -> list[ir.Value]:
        # Flattened as: 3 coordinate values followed by the validity flag.
        values = extract_mlir_values(self.tile_idx)
        values.extend(extract_mlir_values(self.is_valid_tile))
        return values

    def __new_from_mlir_values__(self, values: list[ir.Value]) -> "WorkTileInfo":
        # Mirror of __extract_mlir_values__: expects 3 coord values + 1 flag.
        assert len(values) == 4
        new_tile_idx = new_from_mlir_values(self._tile_idx, values[:-1])
        new_is_valid_tile = new_from_mlir_values(self._is_valid_tile, [values[-1]])
        return WorkTileInfo(new_tile_idx, new_is_valid_tile)

    @property
    def is_valid_tile(self) -> Boolean:
        """Check whether the latest tile returned by the scheduler is valid. Any
        scheduling requests after all tasks have completed will return an invalid tile.

        :return: The validity of the tile.
        :rtype: Boolean
        """
        return self._is_valid_tile

    @property
    def tile_idx(self) -> cute.Coord:
        """
        Get the index of the tile.

        :return: The index of the tile.
        :rtype: cute.Coord
        """
        return self._tile_idx


class PersistentTileSchedulerParams:
    """A class to represent parameters for a persistent tile scheduler.

    This class is designed to manage and compute the layout of clusters and tiles
    in a batched gemm problem.

    :ivar cluster_shape_mn: Shape of the cluster in (m, n) dimensions (K dimension cta count must be 1).
    :type cluster_shape_mn: tuple
    :ivar problem_layout_ncluster_mnl: Layout of the problem in terms of
        number of clusters in (m, n, l) dimensions.
    :type problem_layout_ncluster_mnl: cute.Layout
    """

    def __init__(
        self,
        problem_shape_ntile_mnl: cute.Shape,
        cluster_shape_mnk: cute.Shape,
        *,
        loc=None,
        ip=None,
    ):
        """
        Initializes the PersistentTileSchedulerParams with the given parameters.

        :param problem_shape_ntile_mnl: The shape of the problem in terms of
            number of CTA (Cooperative Thread Array) in (m, n, l) dimensions.
        :type problem_shape_ntile_mnl: cute.Shape
        :param cluster_shape_mnk: The shape of the cluster in (m, n, k) dimensions.
        :type cluster_shape_mnk: cute.Shape

        :raises ValueError: If cluster_shape_k is not 1.
        """

        if cluster_shape_mnk[2] != 1:
            raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}")

        self.problem_shape_ntile_mnl = problem_shape_ntile_mnl
        # cluster_shape_mnk is kept for reconstruction
        self._cluster_shape_mnk = cluster_shape_mnk
        self.cluster_shape_mn = cluster_shape_mnk[:2]
        self._loc = loc

        # By default, we follow m major (col-major) raster order, so make a col-major layout
        self.problem_layout_ncluster_mnl = cute.make_layout(
            cute.ceil_div(
                self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip
            ),
            loc=loc,
            ip=ip,
        )

    def __extract_mlir_values__(self):
        # Record how many values each member contributed so that
        # __new_from_mlir_values__ can slice them back out.
        values, self._values_pos = [], []
        for obj in [self.problem_shape_ntile_mnl, self._cluster_shape_mnk]:
            obj_values = extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        obj_list = []
        for obj, n_items in zip(
            [self.problem_shape_ntile_mnl, self._cluster_shape_mnk], self._values_pos
        ):
            obj_list.append(new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return PersistentTileSchedulerParams(*(tuple(obj_list)), loc=self._loc)

    @dsl_user_op
    def get_grid_shape(
        self, max_active_clusters: Int32, *, loc=None, ip=None
    ) -> Tuple[Integer, Integer, Integer]:
        """
        Computes the grid shape based on the maximum active clusters allowed.

        :param max_active_clusters: The maximum number of active clusters that
            can run in one wave.
        :type max_active_clusters: Int32

        :return: A tuple containing the grid shape in (m, n, persistent_clusters).
            - m: self.cluster_shape_m.
            - n: self.cluster_shape_n.
            - persistent_clusters: Number of persistent clusters that can run.
        """

        # Total ctas in problem size
        num_ctas_mnl = tuple(
            x * y
            for x, y in zip(
                self.problem_layout_ncluster_mnl.shape, self.cluster_shape_mn
            )
        ) + (self.problem_layout_ncluster_mnl.shape[2],)

        num_ctas_in_problem = cute.size(num_ctas_mnl, loc=loc, ip=ip)

        num_ctas_per_cluster = cute.size(self.cluster_shape_mn, loc=loc, ip=ip)
        # Total ctas that can run in one wave
        num_ctas_per_wave = max_active_clusters * num_ctas_per_cluster

        # Cap the launch at one full wave so CTAs persist across tiles.
        num_persistent_ctas = min(num_ctas_in_problem, num_ctas_per_wave)
        num_persistent_clusters = num_persistent_ctas // num_ctas_per_cluster

        return (*self.cluster_shape_mn, num_persistent_clusters)


class StaticPersistentTileScheduler:
    """A scheduler for static persistent tile execution in CUTLASS/CuTe kernels.

    :ivar params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl
    :type params: PersistentTileSchedulerParams
    :ivar num_persistent_clusters: Number of persistent clusters that can be launched
    :type num_persistent_clusters: Int32
    :ivar cta_id_in_cluster: ID of the CTA within its cluster
    :type cta_id_in_cluster: cute.Coord
    :ivar _num_tiles_executed: Counter for executed tiles
    :type _num_tiles_executed: Int32
    :ivar _current_work_linear_idx: Current cluster index
    :type _current_work_linear_idx: Int32
    """

    def __init__(
        self,
        params: PersistentTileSchedulerParams,
        num_persistent_clusters: Int32,
        current_work_linear_idx: Int32,
        cta_id_in_cluster: cute.Coord,
        num_tiles_executed: Int32,
    ):
        """
        Initializes the StaticPersistentTileScheduler with the given parameters.

        :param params: Tile schedule related params, including cluster shape and problem_layout_ncluster_mnl.
        :type params: PersistentTileSchedulerParams
        :param num_persistent_clusters: Number of persistent clusters that can be launched.
        :type num_persistent_clusters: Int32
        :param current_work_linear_idx: Current cluster index.
        :type current_work_linear_idx: Int32
        :param cta_id_in_cluster: ID of the CTA within its cluster.
        :type cta_id_in_cluster: cute.Coord
        :param num_tiles_executed: Counter for executed tiles.
        :type num_tiles_executed: Int32
        """
        self.params = params
        self.num_persistent_clusters = num_persistent_clusters
        self._current_work_linear_idx = current_work_linear_idx
        self.cta_id_in_cluster = cta_id_in_cluster
        self._num_tiles_executed = num_tiles_executed

    def __extract_mlir_values__(self) -> list[ir.Value]:
        # Flattened as: 1 cluster count, 1 linear idx, 3 CTA coords, 1 tile counter.
        values = extract_mlir_values(self.num_persistent_clusters)
        values.extend(extract_mlir_values(self._current_work_linear_idx))
        values.extend(extract_mlir_values(self.cta_id_in_cluster))
        values.extend(extract_mlir_values(self._num_tiles_executed))
        return values

    def __new_from_mlir_values__(
        self, values: list[ir.Value]
    ) -> "StaticPersistentTileScheduler":
        assert len(values) == 6
        new_num_persistent_clusters = new_from_mlir_values(
            self.num_persistent_clusters, [values[0]]
        )
        new_current_work_linear_idx = new_from_mlir_values(
            self._current_work_linear_idx, [values[1]]
        )
        new_cta_id_in_cluster = new_from_mlir_values(
            self.cta_id_in_cluster, values[2:5]
        )
        new_num_tiles_executed = new_from_mlir_values(
            self._num_tiles_executed, [values[5]]
        )
        return StaticPersistentTileScheduler(
            self.params,
            new_num_persistent_clusters,
            new_current_work_linear_idx,
            new_cta_id_in_cluster,
            new_num_tiles_executed,
        )

    # called by host
    @dsl_user_op
    @staticmethod
    def create(
        params: PersistentTileSchedulerParams,
        block_idx: Tuple[Integer, Integer, Integer],
        grid_dim: Tuple[Integer, Integer, Integer],
        *,
        loc=None,
        ip=None,
    ):
        """Initialize the static persistent tile scheduler.

        :param params: Parameters for the persistent
            tile scheduler.
        :type params: PersistentTileSchedulerParams
        :param block_idx: The 3d block index in the format (bidx, bidy, bidz).
        :type block_idx: Tuple[Integer, Integer, Integer]
        :param grid_dim: The 3d grid dimensions for kernel launch.
        :type grid_dim: Tuple[Integer, Integer, Integer]

        :return: A StaticPersistentTileScheduler object.
        :rtype: StaticPersistentTileScheduler
        """
        params = params

        # Calculate the number of persistent clusters by dividing the total grid size
        # by the number of CTAs per cluster
        num_persistent_clusters = cute.size(grid_dim, loc=loc, ip=ip) // cute.size(
            params.cluster_shape_mn, loc=loc, ip=ip
        )

        bidx, bidy, bidz = block_idx

        # Initialize the workload index to the cluster index in the grid
        current_work_linear_idx = Int32(bidz)

        # CTA id in the cluster
        cta_id_in_cluster = (
            Int32(bidx % params.cluster_shape_mn[0]),
            Int32(bidy % params.cluster_shape_mn[1]),
            Int32(0),
        )
        # Initialize number of tiles executed to zero
        num_tiles_executed = Int32(0)
        return StaticPersistentTileScheduler(
            params,
            num_persistent_clusters,
            current_work_linear_idx,
            cta_id_in_cluster,
            num_tiles_executed,
        )

    # called by host
    @staticmethod
    def get_grid_shape(
        params: PersistentTileSchedulerParams,
        max_active_clusters: Int32,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Integer, Integer, Integer]:
        """Calculates the grid shape to be launched on GPU using problem shape,
        threadblock shape, and active cluster size.

        :param params: Parameters for grid shape calculation.
        :type params: PersistentTileSchedulerParams
        :param max_active_clusters: Maximum active clusters allowed.
        :type max_active_clusters: Int32

        :return: The calculated 3d grid shape.
        :rtype: Tuple[Integer, Integer, Integer]
        """

        return params.get_grid_shape(max_active_clusters, loc=loc, ip=ip)

    # private method
    def _get_current_work_for_linear_idx(
        self, current_work_linear_idx: Int32, *, loc=None, ip=None
    ) -> WorkTileInfo:
        """Compute current tile coord given current_work_linear_idx and cta_id_in_cluster.

        :param current_work_linear_idx: The linear index of the current work.
        :type current_work_linear_idx: Int32

        :return: An object containing information about the current tile coordinates
            and validity status.
        :rtype: WorkTileInfo
        """

        # The tile is valid only while the linear index stays within the problem.
        is_valid = current_work_linear_idx < cute.size(
            self.params.problem_layout_ncluster_mnl, loc=loc, ip=ip
        )

        cur_cluster_coord = self.params.problem_layout_ncluster_mnl.get_hier_coord(
            current_work_linear_idx, loc=loc, ip=ip
        )

        # cur_tile_coord is a tuple of i32 values:
        # cluster coord scaled by cluster shape, plus the CTA offset in the cluster.
        cur_tile_coord = tuple(
            Int32(x) * Int32(z) + Int32(y)
            for x, y, z in zip(
                cur_cluster_coord,
                self.cta_id_in_cluster,
                (*self.params.cluster_shape_mn, Int32(1)),
            )
        )

        return WorkTileInfo(cur_tile_coord, is_valid)

    @dsl_user_op
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self._get_current_work_for_linear_idx(
            self._current_work_linear_idx, loc=loc, ip=ip
        )

    @dsl_user_op
    def initial_work_tile_info(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self.get_current_work(loc=loc, ip=ip)

    @dsl_user_op
    def advance_to_next_work(self, *, advance_count: int = 1, loc=None, ip=None):
        # Stride by the number of persistent clusters: each cluster sweeps the
        # problem with that stride so the full cluster grid covers all tiles.
        self._current_work_linear_idx += Int32(advance_count) * Int32(
            self.num_persistent_clusters
        )
        self._num_tiles_executed += Int32(1)

    @property
    def num_tiles_executed(self) -> Int32:
        return self._num_tiles_executed
diff --git a/python/CuTeDSL/cutlass/utils/tensormap_manager.py b/python/CuTeDSL/cutlass/utils/tensormap_manager.py
new file mode 100644
index 00000000..c6369c20
--- /dev/null
+++ b/python/CuTeDSL/cutlass/utils/tensormap_manager.py
@@ -0,0 +1,140 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
#
# Use of this software is governed by the terms and conditions of the
# NVIDIA End User License Agreement (EULA), available at:
# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html
#
# Any use, reproduction, disclosure, or distribution of this software
# and related documentation outside the scope permitted by the EULA
# is strictly prohibited.

from dataclasses import dataclass
from enum import Enum, auto
from typing import Tuple

from cutlass.cutlass_dsl import const_expr

import cutlass._mlir.dialects.cute as _cute_ir
import cutlass._mlir.dialects.cute_nvgpu as _cute_nvgpu_ir

import cutlass.cute as cute


class TensorMapUpdateMode(Enum):
    """
    Enum class defining tensor map update modes.

    Modes:
        GMEM: Update tensormap in global memory
        SMEM: Load tensormap from global memory to shared memory,
            update it in shared memory, then store back to global memory
    """

    GMEM = auto()  # Update tensormap in global memory
    SMEM = auto()  # Update tensormap in shared memory


@dataclass(frozen=True)
class TensorMapManager:
    """
    Manages TensorMap operations including initialization and updates.
    Provides utilities to convert a tensormap pointer across different memory spaces.
    """

    tensormap_update_mode: TensorMapUpdateMode
    bytes_per_tensormap: int

    # convert given cute.Pointer or cutlass.Int64 to a cute.Pointer to tensormap.
    # address_space: the address space of the resulting tensormap pointer.
    # It could be generic or gmem
    def get_tensormap_ptr(
        self,
        ptr: cute.Pointer,
        address_space=_cute_ir.AddressSpace.gmem,
    ) -> cute.Pointer:
        if address_space not in [
            _cute_ir.AddressSpace.gmem,
            _cute_ir.AddressSpace.generic,
        ]:
            raise ValueError(f"Invalid address space: {address_space} for tensormap")

        # Round-trip through an integer so we can attach the alignment assumption
        # (bytes_per_tensormap) before casting to a TMA-descriptor pointer type.
        gmem_ptr_i64 = ptr.toint().ir_value()
        gmem_ptr_i64_align_ty = _cute_ir.ConstrainedIntType.get(
            self.bytes_per_tensormap, gmem_ptr_i64.type.width
        )
        gmem_ptr_i64_align = _cute_ir.assume(gmem_ptr_i64_align_ty, gmem_ptr_i64)
        gmem_ptr_ty = _cute_ir.PtrType.get(
            _cute_nvgpu_ir.TmaDescriptorTiledType.get(),
            address_space,
            self.bytes_per_tensormap,
        )
        return _cute_ir.inttoptr(gmem_ptr_ty, gmem_ptr_i64_align)

    # init tensormap pointed by dst_ptr with the one inside copy_atom.
    # dst_ptr should be pointing to a global memory location or a smem location
    # warp_id specifies which warp to perform the initialization
    @cute.jit
    def init_tensormap_from_atom(
        self, copy_atom: cute.CopyAtom, dst_ptr: cute.Pointer, warp_id: int
    ) -> None:
        warp_idx = cute.arch.warp_idx()
        warp_idx = cute.arch.make_warp_uniform(warp_idx)
        if warp_idx == warp_id:
            # Only one thread of the chosen warp performs the copy.
            with cute.arch.elect_one():
                cute.nvgpu.cpasync.copy_tensormap(copy_atom, dst_ptr)
        cute.arch.sync_warp()
        return

    # Perform a fence operation to ensure previous `init_tensormap_from_atom` calls have been completed
    def fence_tensormap_initialization(
        self,
    ) -> None:
        if self.tensormap_update_mode == TensorMapUpdateMode.GMEM:
            cute.arch.fence_acq_rel_cta()
        return

    # Perform a fence operation to ensure previous `update_tensormap` calls have been completed
    def fence_tensormap_update(
        self,
        tensormap_ptr: cute.Pointer,
    ) -> None:
        cute.nvgpu.cpasync.fence_tma_desc_acquire(tensormap_ptr)
        return

    @cute.jit
    def update_tensormap(
        self,
        tensor_gmem: Tuple[cute.Tensor, ...],
        tma_copy_atom: Tuple[cute.CopyAtom, ...],
        tensormap_gmem_ptr: Tuple[cute.Pointer, ...],
        warp_id: int,
        tensormap_smem_ptr: Tuple[cute.Pointer, ...],
    ) -> None:
        warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
        # updates before touching tensormap in global memory
        if warp_idx == warp_id:
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                for copy_atom, tensor, smem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_smem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(
                        copy_atom, tensor, smem_ptr
                    )
            # wait until it's safe to update tensormap in global memory
            with cute.arch.elect_one():
                cute.arch.cp_async_bulk_commit_group()
                cute.arch.cp_async_bulk_wait_group(0, read=True)
            cute.arch.sync_warp()
            # updates to tensormap in global memory
            if const_expr(self.tensormap_update_mode == TensorMapUpdateMode.SMEM):
                # Copy the updated descriptor from shared memory back to global
                # memory with a release fence.
                for gmem_ptr, smem_ptr in zip(tensormap_gmem_ptr, tensormap_smem_ptr):
                    cute.nvgpu.cpasync.cp_fence_tma_desc_release(gmem_ptr, smem_ptr)
            else:
                # GMEM mode: update the descriptor in place in global memory.
                for copy_atom, tensor, gmem_ptr in zip(
                    tma_copy_atom, tensor_gmem, tensormap_gmem_ptr
                ):
                    cute.nvgpu.cpasync.update_tma_descriptor(
                        copy_atom, tensor, gmem_ptr
                    )
                cute.arch.sync_warp()
                cute.nvgpu.cpasync.fence_tma_desc_release()
diff --git a/python/CuTeDSL/cutlass_dsl/__init__.py b/python/CuTeDSL/cutlass_dsl/__init__.py
new file mode 100644
index 00000000..9c6861c3
--- /dev/null
+++ b/python/CuTeDSL/cutlass_dsl/__init__.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ +from .cutlass import * + +from ..base_dsl.ast_helpers import ( + loop_selector, + if_selector, + if_executor, + while_selector, + while_executor, + range_constexpr, + range_dynamic, + const_expr, + dynamic_expr, + assert_executor, + bool_cast, +) + +from ..base_dsl import * +from ..base_dsl.dsl import extract_mlir_values, new_from_mlir_values +from ..base_dsl.typing import _binary_op_type_promote +from ..base_dsl._mlir_helpers.gpu import * +from ..base_dsl._mlir_helpers.op import dsl_user_op +from ..base_dsl.runtime import * +from ..base_dsl.runtime import cuda as cuda_helpers +from ..base_dsl.compiler import compile +from ..base_dsl.runtime.dlpack_runtime import * +from ..base_dsl.runtime.jit_arg_adapters import * diff --git a/python/CuTeDSL/cutlass_dsl/cutlass.py b/python/CuTeDSL/cutlass_dsl/cutlass.py new file mode 100644 index 00000000..1e2f4d1c --- /dev/null +++ b/python/CuTeDSL/cutlass_dsl/cutlass.py @@ -0,0 +1,1322 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +""" +This module provides a DSL for Cutlass Dialects. It also includes utils with +regarding to that dialect. 
+""" + +# Local module imports +from typing import Callable, Union, Type, List, Union, Sequence, ForwardRef +from inspect import isclass +import functools +import pkgutil +from dataclasses import is_dataclass + +from ..base_dsl import * +from ..base_dsl import compiler +from ..base_dsl.dsl import is_dynamic_expression, extract_mlir_values +from ..base_dsl.typing import * +from ..base_dsl.typing import DynamicExpression, get_mlir_types +from ..base_dsl.runtime.jit_arg_adapters import is_arg_spec_constexpr + +from ..base_dsl.ast_helpers import const_expr + +# MLIR Imports +from cutlass._mlir import ir, execution_engine, passmanager +from cutlass._mlir.dialects import arith, func, gpu, scf, cute, gpu as cutlass_gpu +from cutlass._mlir.dialects._ods_common import ( + get_op_result_or_op_results as _get_op_result_or_op_results, +) +from cutlass._mlir.extras import types as T + +# Helpers +from ..base_dsl._mlir_helpers import arith as cutlass_arith +from ..base_dsl._mlir_helpers import lru_cache_ir + +from ..base_dsl.ast_helpers import ( + loop_selector, + executor, + if_selector, + if_executor, + while_selector, + while_executor, + assert_executor, + bool_cast, +) +from ..base_dsl.runtime.dlpack_runtime import ( + get_cute_tensor_c_pointer, + get_tensor_desc_shape_all, + get_tensor_desc_stride_all, + get_tensor_desc_element_type, + get_tensor_desc_is_in_device, + get_tensor_desc_assumed_align, +) + +from .cutlass_ast_decorators import ( + _loop_execute_range_dynamic, + _if_execute_dynamic, + _while_execute_dynamic, +) + +# ============================================================================= +# Set the AST decorator +# ============================================================================= + +# Set the DSL specific functions +executor.set_functions( + is_dynamic_expression, + _loop_execute_range_dynamic, + _if_execute_dynamic, + _while_execute_dynamic, +) + + +# ============================================================================= +# Cutlass DSL 
Base Abstract Class +# ============================================================================= + + +# Return a ctype class that represents the in-memory layout expected +# for a CuTe hierarchical tuple type. +def get_sparse_tuple_ctype(dyn): + # When there is a single dynamic value, the sparse CuTe + # representation is a single integer. + if isinstance(dyn, int): + return ctypes.c_int32 + + # For zero or greater than 1 dynamic values, the tuple + # representation will be a struct with a field for each dynamic + # value. The representation is flattened, even for hierarchical CuTe + # profiles (although we are only dealing with depth 1 inputs here). + class TupleDescriptor(ctypes.Structure): + _fields_ = [(f"x{idx}", ctypes.c_int32) for idx in range(len(dyn))] + + def __str__(self): + return f"struct<{str(self._fields_)}>" + + return TupleDescriptor + + +def is_cute_algebra_type(arg_spec): + # Walk through the arg_spec to check if it's a cute algebra type + _cute_algebra_type_aliases = ( + "Shape", + "Stride", + "Coord", + "Tile", + "IntTuple", + ) + + origin = get_origin(arg_spec) + if origin is Union: + for sub_ty in get_args(arg_spec): + sub_origin = get_origin(sub_ty) + if sub_origin is Tuple or ( + type(sub_origin) is type and issubclass(sub_origin, tuple) + ): + tuple_arg0 = get_args(sub_ty)[0] + if isinstance( + tuple_arg0, ForwardRef + ) and tuple_arg0.__forward_arg__ in (_cute_algebra_type_aliases): + return True + return False + + +class CutlassBaseDSL(BaseDSL): + """This abstract class provides a DSL for Cutlass.""" + + def __init__( + self, + name: str, + compiler_provider: Any, + pass_sm_arch_name: str, + device_compilation_only: bool = False, + preprocess: bool = False, + ): + super().__init__( + name, + compiler_provider, + pass_sm_arch_name, + device_compilation_only, + preprocess, + ) + + def _is_tensor_descriptor(self, maybe_tensor_descriptor) -> bool: + return False + + def _build_gpu_module(self, attrs): + self.gpu_module = 
gpu.GPUModuleOp(ir.StringAttr.get("kernels")) + with ir.InsertionPoint(self.gpu_module.bodyRegion.blocks.append(*[])): + pass + + for attr_name in attrs: + self.gpu_module.attributes[attr_name] = ir.Attribute.parse(attrs[attr_name]) + + def _get_pipeline(self, pipeline): + pipeline = super()._get_pipeline(pipeline) + if pipeline == None: + # cubin format is required to be cubin as we launch cuda module at python level. + return "builtin.module(cute-to-nvvm{cubin-format=bin opt-level=3})" + + return pipeline + + def preprocess_pipeline(self, pipeline, arch) -> str: + pipeline = super().preprocess_pipeline(pipeline, arch) + pipeline = pipeline.rstrip(")") + ",external-kernel-for-gpu-launch)" + return pipeline + + def _enter_gpu_module(self): + return ir.InsertionPoint(self.gpu_module.bodyRegion.blocks[0]) + + def _generate_kernel_attrs(self, config: BaseDSL.LaunchConfig) -> dict: + assert isinstance( + config, BaseDSL.LaunchConfig + ), f"Expect LaunchConfig for @kernel, but got {type(config)}" + + ret = {} + # generate launch bound attr from LaunchConfig + max_threads = ", ".join(map(str, config.block)) + ret["nvvm.reqntid"] = ir.Attribute.parse(f"array") + # min_blocks_per_mp is optional for kernel + min_blocks = config.min_blocks_per_mp + if min_blocks > 0: + ret["nvvm.minctasm"] = ir.Attribute.parse(f"{min_blocks} : i32") + return ret + + @lru_cache(maxsize=1) + def get_version(self): + """ + Get the version of cutlass dsl, used for computing the hash key of the cache. + Including source python files and the shared library. 
+ """ + dsl_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + # get the version hash of the cutlass shared library + version_hash = hashlib.sha256() + # update the version hash of the source python files + for lib in pkgutil.walk_packages([dsl_path], prefix="cutlass."): + try: + with open(lib.module_finder.find_spec(lib.name).origin, "rb") as f: + version_hash.update(f.read()) + except Exception: + raise DSLRuntimeError( + f"Failed to read module file {lib.name}. The file may not exist or may not be readable." + "Please re-install the package." + ) + try: + # update the version hash of the cutlass shared library + with open( + os.path.join(dsl_path, "_mlir/_mlir_libs/libCutlassIRPythonCAPI.so"), + "rb", + ) as f: + while True: + chunk = f.read(1024**2) + if not chunk: + break + version_hash.update(chunk) + except Exception: + raise DSLRuntimeError( + f"Failed to read the shared library file libCutlassIRPythonCAPI.so." + "The file may not exist or may not be readable." + "Please re-install the package." + ) + + return version_hash + + def _kernel_helper(self, funcBody, *args, **kwargs): + class _CutlassIrKernelGenHelper(BaseDSL._KernelGenHelper): + def generate_func_op(self, arg_types, arg_attrs, kernel_name, loc=None): + super().generate_func_op(arg_types, arg_attrs, kernel_name) + self.func_op = func.FuncOp( + kernel_name, ir.FunctionType.get(arg_types, []), loc=loc + ) + if arg_attrs is not None: + log().debug(arg_attrs) + self.func_op.arg_attrs = arg_attrs + return self.func_op + + def generate_func_ret_op(self): + return func.ReturnOp([]) + + def get_func_body_start(self): + assert self.func_op is not None, "Invalid func_op is not expected!" 
# NOTE(review): reconstructed from a whitespace-mangled diff. This span is the
# interior of the CutlassBaseDSL class; the enclosing method and the nested
# _CutlassIrKernelGenHelper class open above this excerpt — confirm exact
# nesting levels against the upstream file.
            # Tail of the kernel-body generator helper: create and return the
            # entry block of the func op built above (outside this excerpt).
            return self.func_op.add_entry_block()

        def generate_launch_op(self, *args, **kwargs):
            """Emit the gpu.launch_func op for a @cute.kernel.

            Expects `kernelSym`, `kernelOperands` and `requiredArgs` kwargs;
            `requiredArgs.config` must be a BaseDSL.LaunchConfig. Returns the
            gpu async token when async dependencies are present, else None.
            """
            # Extract args and do validation
            kernelSym = kwargs.get("kernelSym", None)
            kernelOperands = kwargs.get("kernelOperands", None)
            requiredArgs = kwargs.get("requiredArgs", None)
            assert kernelSym is not None, "kernelSym being None is not expected!"
            assert (
                requiredArgs is not None
            ), "requiredArgs being None is not expected!"
            assert (
                kernelOperands is not None
            ), "kernelOperands being None is not expected!"
            assert isinstance(
                requiredArgs.config, BaseDSL.LaunchConfig
            ), f"Expect LaunchConfig for @kernel, but got {type(requiredArgs.config)}"

            cfg = requiredArgs.config

            # Normalize grid/block (and cluster, if present) sizes to MLIR
            # index values.
            cfg.grid = [to_index(size) for size in cfg.grid]
            cfg.block = [to_index(size) for size in cfg.block]
            if cfg.has_cluster:
                cfg.cluster = [to_index(size) for size in cfg.cluster]

            cfg.smem = const(cfg.smem)

            # A non-empty async_deps list makes the launch asynchronous and
            # produces a gpu async token result.
            if not isinstance(cfg.async_deps, (list, tuple)):
                cfg.async_deps = [cfg.async_deps]
            is_async = len(cfg.async_deps) > 0
            token = gpu.launch_func(
                gpu.AsyncTokenType.get() if is_async else None,
                cfg.async_deps,
                kernelSym,
                *cfg.grid,
                *cfg.block,
                kernelOperands,
                **dict(
                    zip(
                        ("cluster_size_x", "cluster_size_y", "cluster_size_z"),
                        tuple(cfg.cluster),
                    )
                ),
                dynamic_shared_memory_size=cfg.smem,
            )
            return token if is_async else None

        # The decorator returns a launcher that defers kernel generation until
        # .launch()/__call__ is invoked with a launch configuration.
        return KernelLauncher(
            self, _CutlassIrKernelGenHelper, funcBody, *args, **kwargs
        )

    def _get_globals(self):
        # Merge this module's globals with the caller's globals and locals so
        # names visible at the call site resolve during JIT tracing.
        caller_globals = self.frame.f_globals
        caller_locals = self.frame.f_locals
        all_globals = globals().copy()
        all_globals.update(caller_globals)
        all_globals.update(caller_locals)
        return all_globals

    def _preprocess_launch_config_args(self, args, kwargs):
        """Helper to preprocess args and kwargs for LaunchConfig"""
        # `stream=` is user-facing sugar for LaunchConfig's `async_deps`.
        if "stream" in kwargs:
            kwargs["async_deps"] = kwargs.pop("stream")

    def mangle_name(self, function_name, args, args_spec: inspect.FullArgSpec):
        """Mangle the name of the function to avoid conflicts with other functions"""
        # Prefix every JIT symbol so generated names cannot collide with
        # user-defined or base-DSL symbols.
        function_name = "cutlass_" + function_name
        return super().mangle_name(function_name, args, args_spec)

    def _validate_arg(self, arg, arg_index, arg_name, arg_annotation):
        """
        Validates if the arg is really of the annotated type.

        Returns a DSLRuntimeError describing the mismatch (it is returned, not
        raised, so the caller decides how to report it), or None on success.
        """

        if is_arg_spec_constexpr(arg_annotation, arg_name, arg_index, None):
            # Constexpr arguments are not type-checked here.
            pass
        else:
            origin = get_origin(arg_annotation)
            # Handle special case where annotation is Type[X] but arg is an actual type
            if origin is type and isinstance(arg, type):
                # Get the expected base type from Type[X]
                expected_base = get_args(arg_annotation)[0]
                if not issubclass(arg, expected_base):
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be Type[{expected_base}], but got {arg}"
                    )
            # Handle Union types and generic types
            elif origin is Union:
                # For Union types, check if arg matches any of the allowed types
                allowed_types = get_args(arg_annotation)
                if not any(
                    (isinstance(ty, type) and isinstance(arg, ty))
                    or (get_origin(ty) is tuple and isinstance(arg, tuple))
                    for ty in allowed_types
                ):
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be one of {allowed_types}, but got {type(arg)}"
                    )
            elif isinstance(arg_annotation, type):
                # Handle simple type annotations; None is tolerated.
                if not isinstance(arg, arg_annotation) and arg is not None:
                    return DSLRuntimeError(
                        f"expects argument #{arg_index+1} ({arg_name}) to be {arg_annotation}, but got {type(arg)}"
                    )
        # Everything looks good if we are here
        return None

    def _generate_jit_func_args_for_known_types(
        self,
        func,
        arg,
        arg_name,
        arg_spec,
        arg_index,
        *,
        is_host=True,
    ):
        """Extend the base implementation with CuTe-algebra argument lowering.

        Delegates to the base class first; when it yields no types, handles
        cute-algebra values by appending one JIT argument per extracted MLIR
        value. Returns (exec_args, arg_types, arg_attrs), all None when the
        argument kind is unknown.
        """
        jit_arg_type, jit_arg_attr, jit_exec_arg = [], [], []
        default_attr = ir.DictAttr.get({})

        (
            jit_exec_arg,
            jit_arg_type,
            jit_arg_attr,
        ) = super()._generate_jit_func_args_for_known_types(
            func, arg, arg_name, arg_spec, arg_index, is_host=is_host
        )

        if jit_arg_type is not None and len(jit_arg_type) == 0:
            # Handle DSL specific types
            if is_cute_algebra_type(arg_spec):
                dyn_vals = extract_mlir_values(arg)
                if dyn_vals:
                    # Handle dynamic types
                    jit_arg_type.extend([v.type for v in dyn_vals])
                    jit_arg_attr.extend([default_attr] * len(dyn_vals))
                    jit_exec_arg.extend(get_c_pointers(arg) if is_host else dyn_vals)
            # NOTE(review): indentation of this `else` reconstructed as
            # pairing with `is_cute_algebra_type` (unknown kinds return
            # all-None) — confirm against the upstream file.
            else:
                jit_exec_arg = jit_arg_type = jit_arg_attr = None
        return jit_exec_arg, jit_arg_type, jit_arg_attr

    def _generate_execution_arguments_for_known_types(
        self, arg, arg_spec, arg_name, i, fop_args, iv_block_args
    ):
        """Map block arguments back onto DSL values for execution.

        Consumes as many entries of `fop_args` (starting at `iv_block_args`)
        as the argument's MLIR types require, advancing the cursor.
        """
        ir_arg, iv_block_args = super()._generate_execution_arguments_for_known_types(
            arg, arg_spec, arg_name, i, fop_args, iv_block_args
        )
        if not ir_arg:
            # Handling DSL specific types
            if is_cute_algebra_type(arg_spec):
                n_args = len(get_mlir_types(arg))
                blk_args = fop_args[iv_block_args : iv_block_args + n_args]
                ir_arg.append(new_from_mlir_values(arg, blk_args))
                iv_block_args += n_args

        return ir_arg, iv_block_args


# =============================================================================
# Cute DSL Class
# =============================================================================


class CuTeDSL(CutlassBaseDSL):
    """
    This is a concrete DSL subclass for the CuTe dialect.
    """

    def __init__(self):
        name = "CUTE_DSL"
        compiler_provider = compiler.Compiler(passmanager, execution_engine)
        # Pass-manager option used to select the target SM architecture.
        pass_sm_arch_name = "cubin-chip"

        super().__init__(name, compiler_provider, pass_sm_arch_name, preprocess=True)


# =============================================================================
# KernelLauncher
# =============================================================================


class KernelLauncher:
    """
    This class is used to launch a kernel function.
    Usage:
    ```python
    @cute.kernel
    def kernel(arg1, arg2, ...):
        ...

    @cute.jit
    def launch_kernel():
        kernel(arg1, arg2, ...).launch(grid=[1, 1, 1], block=[1, 1, 1], ...)
        # or
        kernel(arg1, arg2, ...)(grid=[1, 1, 1], block=[1, 1, 1], ...)
    ```
    """

    def __init__(
        self,
        dsl: "CutlassBaseDSL",
        kernelGenHelper: BaseDSL._KernelGenHelper,
        funcBody,
        *func_args,
        **func_kwargs,
    ):
        self.dsl = dsl
        self.kernelGenHelper = kernelGenHelper
        self.funcBody = funcBody
        self.func_args = func_args
        self.func_kwargs = func_kwargs

        # Fail fast: validate the call arguments against the kernel signature
        # at construction time rather than at launch time.
        self._check_func_args(funcBody, *func_args, **func_kwargs)

    def _check_func_args(self, funcBody, *func_args, **func_kwargs):
        # Get function signature
        sig = inspect.signature(funcBody)

        # func_args and func_kwargs should match funcBody's signature,
        # no extra or missing arguments.
        try:
            sig.bind(*func_args, **func_kwargs)
        except TypeError as e:
            raise DSLRuntimeError(
                f"Failed to bind arguments to function `{funcBody.__name__}` with signature `{sig}`",
                cause=e,
            )

    def launch(self, *args, **kwargs):
        # Capture the caller's frame so its globals/locals are visible while
        # tracing the kernel body.
        self.dsl.frame = inspect.currentframe().f_back
        self.dsl._preprocess_launch_config_args(args, kwargs)
        config = self.dsl.LaunchConfig(*args, **kwargs)

        kernel_generator = self.dsl.kernel_launcher(
            requiredArgs=["config"],
            unitAttrNames=["gpu.kernel", "cute.kernel"],
            valueAttrDict=self.dsl._generate_kernel_attrs(config),
            kernelGenHelper=self.kernelGenHelper,
        )(self.funcBody)

        ret, name = kernel_generator(*self.func_args, **self.func_kwargs, config=config)
        # Record the generated kernel symbol for later lookup/compilation.
        self.dsl.kernel_symbols.append(name)
        return ret.launch_op_ret

    def __call__(self, *args, **kwargs):
        # Calling the launcher is equivalent to .launch(...).
        return self.launch(*args, **kwargs)


# =============================================================================
# Utils
# =============================================================================
+ """ + if not isinstance(obj_or_cls, type): + # If it's an instance, get its class + obj_or_cls = obj_or_cls.__class__ + + # Must be a dataclass, and __dataclass_params__.frozen must be True + return ( + is_dataclass(obj_or_cls) + and getattr(obj_or_cls, "__dataclass_params__", None) is not None + and obj_or_cls.__dataclass_params__.frozen + ) + + +def pack_from_irvalue( + ir_values: List["ir.Value"], + indices: Dict[int, Tuple[int, int]], + class_types: List[Any], +) -> List[Any]: + """ + Packs MLIR values into a list of mixed values. + """ + log().info("===--- Values Pack (%d)", len(ir_values)) + for idx, packed in enumerate(ir_values): + log().info("[%d]: will-packed: %s", idx, ir_values) + for idx, unpacked in indices.items(): + log().info("[%d]: indices: %s", idx, unpacked) + for idx, c in enumerate(class_types): + log().info("[%d]: obj-types: %s", idx, type(c)) + + mixed_values = [None] * len(indices) + for idx, (start, length) in sorted(indices.items()): + chunk = ir_values[start : start + length] + obj = class_types[idx] + if is_frozen_dataclass(obj): + mixed_values[idx] = obj + elif not isinstance(obj, type) and hasattr(obj, "__new_from_mlir_values__"): + mixed_values[idx] = obj.__new_from_mlir_values__(chunk) + else: + try: + if isinstance(chunk, list) and chunk[0] is None: + mixed_values[idx] = class_types[idx] + else: + mixed_values[idx] = t.as_numeric(chunk[0]) + except DSLRuntimeError as e: + mixed_values[idx] = chunk[0] + + log().info("------------------ ") + for idx, packed in enumerate(mixed_values): + log().info("[%d]: packed: %s", idx, packed) + log().info("------------------ ") + return mixed_values + + +def unpack_to_irvalue( + mixed_values: List[Any], body_name: str +) -> Tuple[List[ir.Value], List[Any], Dict[int, Tuple[int, int]], List[Any]]: + """ + Unpacks mixed values into ir.Value values. 
+ """ + unpacked_values = [] + ir_values = [] + indices = {} + class_types = [] + current_offset = 0 + + log().info("===--- Values UNPack (%d)", len(mixed_values)) + for idx, packed in enumerate(mixed_values): + log().info("[%d]: will-unpacked: [type:%s] %s", idx, type(packed), packed) + for idx, item in enumerate(mixed_values): + class_types.append(item) + try: + if is_frozen_dataclass(item): + extracted_vals = [None] + else: + extracted_vals = extract_mlir_values(item) + # it's consexpr (python value), so we create mlir value for it + if extracted_vals == []: + if item is None: + extracted_vals = [None] + else: + dyn_expr = t.as_numeric(item) + extracted_vals = extract_mlir_values(dyn_expr) + ir_values.extend(extracted_vals) + else: + ir_values.extend(extracted_vals) + + unpacked_values.extend(extracted_vals) + length = len(extracted_vals) + indices[idx] = (current_offset, length) + current_offset += length + except Exception as e: + raise DSLRuntimeError( + f"The '{body_name}' statement encountered a user-defined Python object, which cannot be automatically converted into an dynamic expression (aka MLIR value).", + context={ + item: ( + f"All expressions within '{body_name}' must be dynamic expressions, " + "mixing Python objects and dynamic expressions (aka MLIR values) is not supported. " + "The DSL failed to convert the Python object into MLIR values." + ) + }, + suggestion=( + f"Please ensure '{item}' implements the '{DynamicExpression.__name__}', " + f"so it can be treated as a valid dynamic expression or mark '{body_name}' as a constant expression if conditions are Python objects." 
+ ), + ) from e + + log().info("------------------ ") + for idx, unpacked in enumerate(unpacked_values): + log().info("[%d]: unpacked values: %s", idx, unpacked) + for idx, unpacked in enumerate(ir_values): + log().info("[%d]: unpacked ir_values: %s", idx, unpacked) + for idx, unpacked in indices.items(): + log().info("[%d]: indices: %s", idx, unpacked) + for idx, unpacked in enumerate(class_types): + log().info("[%d]: initial-class-types: %s", idx, unpacked) + log().info("------------------ ") + + return ir_values, unpacked_values, indices, class_types + + +def to_index(value): + """Converts a value to an index, either by casting or coercing to int.""" + if is_dynamic_expression(value): + if isinstance(value, Numeric): + value = value.ir_value() + assert ir.IntegerType.isinstance( + value.type + ), f"expects integer type, but got {value.type}" + res = arith.index_cast(T.index(), value) + else: + res = const(int(value), ty=T.index()) + + return res + + +def _validate_iter_args_structure(iter_args, ir_values): + """ + Validates that iter_args structure contains the same number of atomic values + as there are IR values. 
+ + Args: + iter_args: Original iteration arguments, possibly nested sequences + ir_values: Flattened MLIR values extracted from iter_args + + Returns: + bool: True if the number of atomic values in iter_args matches + the number of values in ir_values + """ + # Handle non-sequence case + if not isinstance(iter_args, (tuple, list, set)): + return not isinstance(ir_values, (tuple, list, set)) or len(ir_values) == 1 + + # If we have a sequence but ir_values isn't one, there's a mismatch + if not isinstance(ir_values, (tuple, list, set)): + return False + + # Count all non-sequence values recursively + def count_values(args): + if not isinstance(args, (tuple, list, set)): + return 1 + else: + return sum(count_values(arg) for arg in args) + + return count_values(iter_args) == len(ir_values) + + + +# ============================================================================= +# DSL implementation of Python Build-in Operators +# ============================================================================= + + +def _minmax(op, *args, loc=None, ip=None): + """Computes the minimum or maximum value from the provided arguments.""" + from ..base_dsl.typing import _binary_op, _binary_op_type_promote + + # AST Traversal doesn't support early exit in if executor + x = None + res = None + if len(args) == 1: + # Handle case for min([a, b, c, d, ..]) + if hasattr(args[0], "__iter__"): + x = op(*tuple(args[0])) + # Handle case for min(a) + else: + x = args[0] + # Handle case for min(a, b, c, ...) 
and min([x, y], [b]) and min(a, (x, y, z)) + elif len(args) > 1: + res, *xs = tuple(args) + for x in xs: + lhs = as_numeric(op(res, loc=loc, ip=ip)) + rhs = as_numeric(op(x, loc=loc, ip=ip)) + emitter = getattr(cutlass_arith, f"_{op.__name__}") + + lhs, rhs, res_type = _binary_op_type_promote(lhs, rhs, promote_bool=True) + + if isinstance(lhs.value, cutlass_arith.ArithValue) and isinstance( + lhs, Integer + ): + lhs_val = lhs.value.with_signedness(lhs.signed) + else: + lhs_val = lhs.value + + if isinstance(rhs.value, cutlass_arith.ArithValue) and isinstance( + rhs, Integer + ): + rhs_val = rhs.value.with_signedness(rhs.signed) + else: + rhs_val = rhs.value + + res = res_type(emitter(lhs_val, rhs_val), loc=loc, ip=ip) + x = res + else: + raise DSLNotImplemented(f"{type(args)} is not supported") + return x + + +def min(*args, loc=None, ip=None): + """Computes the minimum value from the provided arguments. + + This function differs from Python's built-in min() in that the return type + is determined by the static types of the inputs, not their dynamic values. + + :param args: One or more values or iterables to find the minimum of + :type args: tuple + :param loc: Source location for MLIR operation tracking + :type loc: object, optional + :param ip: Insertion point for MLIR operation + :type ip: object, optional + :return: The minimum value among all inputs + :rtype: Numeric + :raises DSLNotImplemented: If the input type is not supported + + Supports multiple calling patterns: + + - min(a): Returns a + - min([a, b, c, ...]): Returns minimum of all elements in the iterable + - min(a, b, c, ...): Returns minimum of all arguments + - min([x, y], [b]): Returns minimum across all elements in all iterables + - min(a, (x, y, z)): Returns minimum across all elements + + Examples: + + .. 
def min(*args, loc=None, ip=None):
    """Computes the minimum value from the provided arguments.

    This function differs from Python's built-in min() in that the return type
    is determined by the static types of the inputs, not their dynamic values.

    :param args: One or more values or iterables to find the minimum of
    :type args: tuple
    :param loc: Source location for MLIR operation tracking
    :type loc: object, optional
    :param ip: Insertion point for MLIR operation
    :type ip: object, optional
    :return: The minimum value among all inputs
    :rtype: Numeric
    :raises DSLNotImplemented: If the input type is not supported

    Supports multiple calling patterns:

    - min(a): Returns a
    - min([a, b, c, ...]): Returns minimum of all elements in the iterable
    - min(a, b, c, ...): Returns minimum of all arguments
    - min([x, y], [b]): Returns minimum across all elements in all iterables
    - min(a, (x, y, z)): Returns minimum across all elements

    Examples:

    .. code-block:: python

        result = min(x, y)            # minimum of two values
        result = min(a, b, c, d)      # minimum of multiple values
        result = min([a, b, c, d])    # minimum of values in a list
        result = min(x, [y, z])       # minimum across mixed arguments

    Difference from Python's built-in min():

    .. code-block:: python

        # In Python, the return type depends on the dynamic values:
        a = 5
        b = 3.14
        result = min(a, b)  # Returns 3.14 (float)

        # In this DSL implementation, the return type is determined statically:
        a = Int32(5)
        b = Float32(3.14)
        result = min(a, b)  # Return type is determined by the type of operands, not values
    """
    return _minmax(min, *args, loc=loc, ip=ip)


def max(*args, loc=None, ip=None):
    """Computes the maximum value from the provided arguments.

    This function differs from Python's built-in max() in that the return type
    is determined by the static types of the inputs, not their dynamic values.

    :param args: One or more values or iterables to find the maximum of
    :type args: tuple
    :param loc: Source location for MLIR operation tracking
    :type loc: object, optional
    :param ip: Insertion point for MLIR operation
    :type ip: object, optional
    :return: The maximum value among all inputs
    :rtype: Numeric
    :raises DSLNotImplemented: If the input type is not supported

    Supports multiple calling patterns:

    - max(a): Returns a
    - max([a, b, c, ...]): Returns maximum of all elements in the iterable
    - max(a, b, c, ...): Returns maximum of all arguments
    - max([x, y], [b]): Returns maximum across all elements in all iterables
    - max(a, (x, y, z)): Returns maximum across all elements

    Examples:

    .. code-block:: python

        result = max(x, y)            # maximum of two values
        result = max(a, b, c, d)      # maximum of multiple values
        result = max([a, b, c, d])    # maximum of values in a list
        result = max(x, [y, z])       # maximum across mixed arguments

    Difference from Python's built-in max():

    .. code-block:: python

        # In Python, the return type depends on the dynamic values:
        a = 5
        b = 3.14
        result = max(a, b)  # Returns 5 (int)

        # In this DSL implementation, the return type is determined statically:
        a = Int32(5)
        b = Float32(3.14)
        result = max(a, b)  # Return type is determined by the type of operands, not values
    """
    return _minmax(max, *args, loc=loc, ip=ip)
code-block:: python + + # In Python, 'and' returns the second operand if the first is truthy, + # otherwise it returns the first operand + a = 5 + b = 3 + result = a and b # Returns 3 + + # In this DSL implementation, the behavior is similar but works with DSL types + a = Int32(5) + b = Int32(3) + result = and_(a, b) # Returns b + """ + if len(args) == 0: + raise ValueError("and_() requires at least one argument") + + if len(args) == 1: + return args[0] + + def and_op(lhs, rhs): + if not isinstance(lhs, (Numeric, cutlass_arith.ArithValue, int, float, bool)): + raise DSLNotImplemented(f"{type(lhs)} is not supported") + elif isinstance(lhs, (int, float, bool)) and isinstance( + rhs, (int, float, bool) + ): + return lhs and rhs + else: + return as_numeric(lhs).__dsl_and__(as_numeric(rhs)) + + return functools.reduce(and_op, args[1:], args[0]) + + +def or_(*args, loc=None, ip=None): + """Logical OR operation for DSL numeric types. + + :param *args: One or more numeric values to OR together + :type *args: Numeric + :param loc: Source location for MLIR operation tracking + :type loc: object, optional + :param ip: Insertion point for MLIR operation + :type ip: object, optional + :return: The result of the logical OR operation + :rtype: Numeric + :raises ValueError: If no arguments are provided + + Supports multiple calling patterns: + + - or_(a): Returns a + - or_(a, b, c, ...): if a is truthy, returns a, otherwise returns or_(b, c, ...) + + Examples: + + .. 
code-block:: python + + # In Python, 'or' returns the first operand if it's truthy, + # otherwise it returns the second operand + a = 5 + b = 3 + result = a or b # Returns 5 + + # In this DSL implementation, the behavior is similar but works with DSL types + a = Int32(5) + b = Int32(3) + result = or_(a, b) # Returns a + """ + if len(args) == 0: + raise ValueError("or_() requires at least one argument") + + if len(args) == 1: + return args[0] + + def or_op(lhs, rhs): + if not isinstance(lhs, (Numeric, cutlass_arith.ArithValue, int, float, bool)): + raise DSLNotImplemented(f"{type(lhs)} is not supported") + elif isinstance(lhs, (int, float, bool)) and isinstance( + rhs, (int, float, bool) + ): + return lhs or rhs + else: + return as_numeric(lhs).__dsl_or__(as_numeric(rhs)) + + return functools.reduce(or_op, args[1:], args[0]) + + +def all_(iterable): + """Logical AND operation for all elements in an iterable. + + Returns True if all elements in the iterable are truthy, otherwise False. + This is the DSL equivalent of Python's built-in all() function. + + :param iterable: An iterable containing values to check + :type iterable: Iterable + :return: True if all elements are truthy, False otherwise + :rtype: Boolean + + Examples: + + .. code-block:: python + + # Check if all values are non-zero + values = [Int32(1), Int32(2), Int32(3)] + result = all_(values) # Returns True + + # Check if all conditions are met + conditions = [a > 0, b < 10, c != 0] + result = all_(conditions) # Returns True if all conditions are met + """ + bool_iterable = [Boolean(i) for i in iterable] + return functools.reduce( + lambda lhs, rhs: lhs.__dsl_and__(rhs) if hasattr(lhs, "__dsl_and__") else lhs, + bool_iterable, + Boolean(True), + ) + + +def any_(iterable): + """Logical OR operation for any element in an iterable. + + Returns True if any element in the iterable is truthy, otherwise False. + This is the DSL equivalent of Python's built-in any() function. 
+ + :param iterable: An iterable containing values to check + :type iterable: Iterable + :return: True if any element is truthy, False otherwise + :rtype: Boolean + + Examples: + + .. code-block:: python + + # Check if any value is non-zero + values = [Int32(0), Int32(0), Int32(3)] + result = any_(values) # Returns True + + # Check if any condition is met + conditions = [a > 10, b < 0, c != 0] + result = any_(conditions) # Returns True if any condition is met + """ + bool_iterable = [Boolean(i) for i in iterable] + return functools.reduce( + lambda lhs, rhs: lhs.__dsl_or__(rhs) if hasattr(lhs, "__dsl_or__") else lhs, + bool_iterable, + Boolean(False), + ) + + +# ============================================================================= +# Conditional Expression +# ============================================================================= + + +def select_(cond, if_value, else_value): + def _as_scalar(value): + if const_expr(isinstance(value, list)): + if const_expr(len(value) == 1): + return value[0] + else: + raise DSLRuntimeError( + "Conditional expression must have exactly one value in all expressions" + ) + return value + + # Non-DSL dynamic cond should be handled before this. 
+ if const_expr(not is_dynamic_expression(cond)): + raise DSLRuntimeError("Conditional expression must be dynamic") + + # Extract MLIR values + cond = extract_mlir_values(cond) + if const_expr(is_dynamic_expression(if_value)): + if_value = extract_mlir_values(if_value) + else: + if_value = const(if_value) + if const_expr(is_dynamic_expression(else_value)): + else_value = extract_mlir_values(else_value) + else: + else_value = const(else_value) + + return arith.SelectOp( + _as_scalar(cond), _as_scalar(if_value), _as_scalar(else_value) + ).result + + +# ============================================================================= +# Terminator +# ============================================================================= + + +def yield_out(args=[], loc=None, ip=None): + """ + Generate a yield operation. It it used to return values from a loop, if-else, or while region. + """ + scf.yield_(extract_mlir_values(args), loc=loc, ip=ip) + + +# ============================================================================= +# For Loop +# ============================================================================= + + +class LoopUnroll(ir.Attribute): + def __init__(self, **kwargs): + valid_keys = set(["count", "full"]) + def to_mlir_attr(val): + if isinstance(val, bool): + return "true" if val else "false" + elif isinstance(val, int): + return f"{val} : i32" + else: + raise DSLNotImplemented(f"{type(val)} is not supported") + + cfg = {key: to_mlir_attr(kwargs[key]) for key in valid_keys if key in kwargs} + if kwargs.get("count", None) == 1: + cfg["disable"] = "true" + + unroll = "<" + ", ".join(f"{key} = {value}" for key, value in cfg.items()) + ">" + + super().__init__( + ir.Attribute.parse(f"#llvm.loop_annotation") + ) + + +def for_generate( + start, + stop=None, + step=None, + iter_args: Optional[Sequence[ir.Value]] = None, + *, + unroll: LoopUnroll = None, + loc=None, + ip=None, +): + """ + scf.for with yield support + """ + + if step is None: + step = 1 + if stop is 
None: + stop = start + start = 0 + start = const(start) + params = [start, stop, step] + for i, p in enumerate(params): + if isinstance(p, int): + p = const(p) + elif isinstance(p, float): + raise DSLRuntimeError(f"{p=} must be int.") + elif isinstance(p, Integer): + p = p.ir_value() + params[i] = p + + start, stop, step = params + + def _createI32Attr(value): + if not isinstance(value, int): + raise DSLRuntimeError(f"value must be int.") + return ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value) + + ir_iter_args = extract_mlir_values(iter_args) if iter_args is not None else None + if not _validate_iter_args_structure(iter_args, ir_iter_args): + raise DSLRuntimeError("iter_args: Elements should be extractable as ir.Value.") + for_op = scf.ForOp(start, stop, step, ir_iter_args, loc=loc, ip=ip) + if unroll is not None: + for_op.attributes["loop_annotation"] = unroll + + iv = for_op.induction_variable + new_results = new_from_mlir_values(iter_args, for_op.results) + new_iter_args = new_from_mlir_values(iter_args, for_op.inner_iter_args) + new_iter_args = () if new_iter_args is None else tuple(new_iter_args) + + with ir.InsertionPoint(for_op.body): + if len(new_iter_args) > 1: + yield iv, new_iter_args, new_results + elif len(new_iter_args) == 1: + yield iv, new_iter_args[0], new_results[0] + else: + yield iv + + +# ============================================================================= +# Logical Operators +# ============================================================================= + + +def not_(lhs: Union[ir.Value, bool], *, loc=None, ip=None): + """ + Logical Not + """ + res = None + # Handle Python bool first to prevent infinite recursion + if const_expr(type(lhs) == bool): + res = lhs ^ True + elif const_expr(hasattr(lhs, "__dsl_not__")): + res = lhs.__dsl_not__(loc=loc, ip=ip) + elif const_expr(is_dynamic_expression(lhs)): + # If lhs is MLIR value, compute not using xor + res = arith.XOrIOp(lhs, const(1, lhs.type)).result + else: + res = 
bool(lhs) ^ True + + return res + + +# ============================================================================= +# If/Else +# ============================================================================= + + +def if_generate( + cond: Boolean, + then_body: Callable, + else_body: Optional[Callable] = None, + input_args: List[DslType] = None, + return_types: List[DslType] = None, + *, + loc=None, + ip=None, +) -> List: + """ + Generate an IfOp with optional else branch and return values. + + Args: + cond: The condition expression + then_body: Function to execute in then branch + else_body: Optional function to execute in else branch + input_args: Arguments to pass to branch bodies + return_types: Expected return types for the operation + loc: Optional location information + ip: Optional insertion point + + Returns: + List of DSL typed results + """ + input_args = input_args or [] + mlir_return_types = [] + + # Validate and collect MLIR return types (if provided). + if return_types is not None: + for t in return_types: + if not isinstance(t, DslType): + raise DSLRuntimeError(f"{t=} must be a DslType.") + mlir_return_types.append(t.mlir_type) + + # Determine whether there's an else branch. + has_else = else_body is not None + + # Create the IfOp. + if_op = scf.IfOp( + Boolean(cond).ir_value(), mlir_return_types, hasElse=has_else, loc=loc, ip=ip + ) + + def _execute_and_yield_out(body, input_args): + yield_vals = body(*input_args) + if return_types is not None: + if not isinstance(yield_vals, Iterable): + # body only return single element + yield_vals = [yield_vals] + + yield_vals = [t(r) for t, r in zip(return_types, yield_vals)] + yield_out(yield_vals) + + # Generate the body for 'then'. + with ir.InsertionPoint(if_op.then_block): + _execute_and_yield_out(then_body, input_args) + + # Generate the body for 'else' if provided. + if has_else: + with ir.InsertionPoint(if_op.else_block): + _execute_and_yield_out(else_body, input_args) + + # Collect MLIR results. 
+ mlir_results = _get_op_result_or_op_results(if_op) + + if not isinstance(mlir_results, list): + mlir_results = [mlir_results] + + # Wrap the results with their DSL types. + if return_types is None: + return [] + + vals = [t(r) for t, r in zip(return_types, mlir_results)] + + if len(vals) == 1: + return vals[0] + + return vals + + +# ============================================================================= +# While Loop +# ============================================================================= + + +class WhileLoopContext: + """ + Context manager for a dynamic while loop. + """ + + def __init__( + self, + inputs: Sequence[Union[ir.Value, Numeric]], + condition: Callable[[Sequence[ir.Value]], ir.Value], + *, + loc=None, + ip=None, + ): + # Keep original inputs and allow recover original type information + self.inputs = inputs + + self.input_ir_values = extract_mlir_values(inputs) + + if not _validate_iter_args_structure(inputs, self.input_ir_values): + raise DSLRuntimeError("inputs: Elements should be extractable as ir.Value.") + + self.condition = condition + self.input_ir_types = [i.type for i in self.input_ir_values] + self.while_op = scf.WhileOp( + self.input_ir_types, self.input_ir_values, loc=loc, ip=ip + ) + + self.before_region = self.while_op.before + self.after_region = self.while_op.after + + self.before_region.blocks.append(*self.input_ir_types) + self.before_block = self.before_region.blocks[0] + + self.after_region.blocks.append(*self.input_ir_types) + self.after_block = self.after_region.blocks[0] + + def __enter__(self): + with ir.InsertionPoint(self.before_block): + args = new_from_mlir_values(self.inputs, self.before_block.arguments) + cond = self.condition(*args) + cond_ir_val = extract_mlir_values(cond) + scf.ConditionOp(cond_ir_val[0], [*self.before_block.arguments]) + self.ipoint_op = ir.InsertionPoint(self.after_block) + self.ipoint_op.__enter__() + return new_from_mlir_values(self.inputs, self.after_block.arguments) + + def 
__exit__(self, exc_type, exc_value, traceback): + self.ipoint_op.__exit__(exc_type, exc_value, traceback) + return True + + @property + def results(self): + return new_from_mlir_values(self.inputs, self.while_op.results_) + + +def while_generate( + inputs: Sequence[Union[ir.Value, Numeric]], + condition: Callable[[Sequence[Union[ir.Value, Numeric]]], Union[ir.Value, Numeric]], + *, + loc=None, + ip=None, +) -> WhileLoopContext: + """ + Generate a WhileLoopContext for a dynamic loop. + """ + return WhileLoopContext(inputs, condition, loc=loc, ip=ip) diff --git a/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py new file mode 100644 index 00000000..ba7b9d76 --- /dev/null +++ b/python/CuTeDSL/cutlass_dsl/cutlass_ast_decorators.py @@ -0,0 +1,515 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: LicenseRef-NvidiaProprietary +# +# Use of this software is governed by the terms and conditions of the +# NVIDIA End User License Agreement (EULA), available at: +# https://docs.nvidia.com/cutlass/media/docs/pythonDSL/license.html +# +# Any use, reproduction, disclosure, or distribution of this software +# and related documentation outside the scope permitted by the EULA +# is strictly prohibited. + +from typing import List, Tuple +from cutlass._mlir import ir +from cutlass._mlir.dialects import scf, arith +from cutlass._mlir.extras import types as T + +from ..base_dsl.dsl import extract_mlir_values, new_from_mlir_values +from ..base_dsl.ast_helpers import * +from ..base_dsl.utils.logger import log +from ..base_dsl import typing as t +from ..base_dsl.typing import Int32, Float32, Boolean, Numeric, get_mlir_types +from . 
import cutlass as cutlass_dsl + +# ============================================================================= +# AST Helpers +# ============================================================================= + + +class LoopUnroll(ir.Attribute): + def __init__(self, **kwargs): + valid_keys = set(["count", "full"]) + def to_mlir_attr(val): + if isinstance(val, bool): + return "true" if val else "false" + elif isinstance(val, int): + return f"{val} : i32" + else: + raise DSLNotImplemented(f"{type(val)} is not supported") + + cfg = {key: to_mlir_attr(kwargs[key]) for key in valid_keys if key in kwargs} + if kwargs.get("count", None) == 1: + cfg["disable"] = "true" + + unroll = "<" + ", ".join(f"{key} = {value}" for key, value in cfg.items()) + ">" + + super().__init__( + ir.Attribute.parse(f"#llvm.loop_annotation") + ) + + +class ScfGenerator: + """ + Encapsulates common scf dialect functionality: pack, unpack, and SCF execution. + """ + + def __init__(self): + pass + + @staticmethod + def fill_none(ir_values, unpacked_values): + i = 0 + for idx, item in enumerate(unpacked_values): + if item is not None: + unpacked_values[idx] = ir_values[i] + i += 1 + + @staticmethod + def _normalize_region_result_to_list(region_result: Any) -> List[Any]: + """ + Convert region_result to a list if it is not already a list + If region_result is a list, return it as is. + If region_result is None, return an empty list. + If region_result is not a list, return a list containing region_result as the only element. 
+ """ + if region_result is None: + region_result_list = [] + elif not isinstance(region_result, list): + region_result_list = [region_result] + else: + region_result_list = region_result + return region_result_list + + @staticmethod + def check_region_result(region_values, ir_values): + for i, (expected_value, actual_value) in enumerate( + zip(ir_values, region_values) + ): + expected_value_type = get_mlir_types(expected_value) + actual_value_type = get_mlir_types(actual_value) + if expected_value_type != actual_value_type: + return False, i, expected_value_type, actual_value_type + return True, -1, None, None + + def scf_execute_dynamic( + self, + op_type_name: str, + used_args: List[Any], + mix_iter_args: List[Any], + mix_iter_arg_names: List[str], + create_op_func: Callable[ + [List[ir.Value], Dict[int, Tuple[int, int]], List[Any]], ir.Operation + ], + region_builders: List[ + Callable[ + [ + "ir.Operation", + List["ir.Value"], # block_args + List[Any], # used_args + List["ir.Value"], # dyn_yield_ops + Dict[int, Tuple[int, int]], + List[Any], + ], + Any, + ] + ], + # block_term_op_builder[region_builder] = scf_op_builder + # e.g. 
scf.ConditionOp for while loop + block_term_op_builder: Dict[Callable, Callable] = {}, + ) -> Any: + # 1) Unpack + ir_values, dyn_unpacked_values, dyn_indices, dyn_class_types = ( + cutlass_dsl.unpack_to_irvalue(mix_iter_args, op_type_name) + ) + # 2) Create the SCF op + op = create_op_func(ir_values, dyn_indices, dyn_class_types) + log().debug("Generated scf.%s \n[%s]", op_type_name, op) + + # 3) Build the regions + for i, builder in enumerate(region_builders): + region = op.regions[i] + block = region.blocks[0] + with ir.InsertionPoint(block): + block_args = list(block.arguments) + region_result = builder( + op, + block_args, + used_args, + dyn_unpacked_values, + dyn_indices, + dyn_class_types, + ) + + # Use custom terminator if provided for this builder, otherwise use default YieldOp + if builder in block_term_op_builder: + # Use the provided terminator generator + block_term_op_builder[builder](region_result) + else: + # Normalize region_result + region_result_list = ScfGenerator._normalize_region_result_to_list( + region_result + ) + # Default behavior - generate YieldOp + region_values, unpacked_values, _, _ = ( + cutlass_dsl.unpack_to_irvalue(region_result_list, op_type_name) + ) + + is_match, mismatch_idx, expected_type, actual_type = ( + ScfGenerator.check_region_result(region_values, ir_values) + ) + + if not is_match: + # From unpacked index, we need to find the original index + original_idx = -1 + for unpacked_idx, (original_idx, length) in dyn_indices.items(): + if ( + mismatch_idx >= original_idx + and mismatch_idx < original_idx + length + ): + original_idx = unpacked_idx + break + raise DSLRuntimeError( + f"`{op_type_name}` expects {expected_type} type for varible `{mix_iter_arg_names[original_idx]}`, but got {actual_type}.", + suggestion=f"Please make sure `{mix_iter_arg_names[original_idx]}` type is not changed inside of `{op_type_name}`.", + ) + scf.YieldOp(region_values) + + log().debug("Completed scf.%s \n[%s]", op_type_name, op) + 
ScfGenerator.fill_none(op.results, unpacked_values) + + # 4) Pack final results + final_results = cutlass_dsl.pack_from_irvalue( + unpacked_values, dyn_indices, dyn_class_types + ) + + # 5) Return in a nice pattern + if not final_results: + return + if len(final_results) == 1: + return final_results[0] + return final_results + + +def _loop_execute_range_dynamic( + func: Callable, + start: Any, + stop: Any, + step: Any, + used_args: List[Any] = [], + mix_iter_args: List[Any] = [], + mix_iter_arg_names: List[str] = [], + unroll: int = -1, + unroll_full: bool = False, +): + """ + Example: build an scf.for with optional unroll, using our universal helper. + """ + scf_gen = ScfGenerator() + + def create_for_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + for d in dyn_yield_ops: + if not isinstance(d, ir.Value): + raise DSLRuntimeError( + f"Invalid dyn_yield_ops: {dyn_yield_ops} \n\tExpected ir.Value, got {type(d)}" + ) + + # Convert Python ints or values to IR constants if needed + start_ = t.as_numeric(start) + stop_ = t.as_numeric(stop) + step_ = t.as_numeric(step) + assert start_ is not t.Int32, "Start is required for scf.for" + assert stop_ is not t.Int32, "Stop is required for scf.for" + assert step_ is not t.Int32, "Step is required for scf.for" + start_ = start_.ir_value() + stop_ = stop_.ir_value() + step_ = step_.ir_value() + + # Possibly attach unroll attributes + unroll_attr = None + if unroll_full: + unroll_attr = LoopUnroll(full=True) + elif unroll != -1: + unroll_attr = LoopUnroll(count=unroll) + log().debug("Unroll attribute: %s", unroll_attr) + + log().debug( + "Creating scf.ForOp \n\t\tstart=%s: type : %s\n\t\tstop=%s: type : %s\n\t\tstep=%s: type : %s", + start_, + type(start_), + stop_, + type(stop_), + step_, + type(step_), + ) + # Create scf.ForOp, passing iteration args if any + try: + if not dyn_yield_ops: + for_op = scf.ForOp(start_, stop_, step_) + else: + for_op = 
scf.ForOp(start_, stop_, step_, list(dyn_yield_ops)) + except Exception as e: + yield_ops = "\n".join( + f"\t\t{i} => {d} : type : {type(d)}" + for i, d in enumerate(dyn_yield_ops) + ) + raise DSLRuntimeError( + f"Failed to create scf.ForOp \n\t\tstart={start_}: type : {type(start_)}" + f"\n\t\tstop={stop_}: type : {type(stop_)}\n\t\tstep={step_}: type : {type(step_)}" + f", \n\tdyn_yield_ops:\n{yield_ops}" + ) from e + + if unroll_attr is not None: + for_op.attributes["loop_annotation"] = unroll_attr + + return for_op + + def for_body_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Insert induction variable at the beginning + dyn_yield_ops.insert(0, block_args[0]) + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + # scf.ForOp block_args are typically [induction_var, iter_args...] + # But MLIR also gives you op.induction_variable + iv = t.as_numeric(op.induction_variable) + log().debug( + "For body builder: %s block_args: %s used_args: %s", + iv, + block_args, + used_args, + ) + if len(block_args) <= 1: + # No iteration arguments, or only the induction var + func(iv, *used_args) + return [] # yield nothing + else: + # block_args[1:] are iteration variables + func_args = [*used_args] + func_args.extend( + cutlass_dsl.pack_from_irvalue( + block_args[1:], dyn_indices, dyn_class_types + ) + ) + updated_func_args = func(iv, *func_args) + return updated_func_args + + # Now call the universal SCF executor with a single region builder + return scf_gen.scf_execute_dynamic( + op_type_name="for", + used_args=used_args, + mix_iter_args=mix_iter_args, + mix_iter_arg_names=mix_iter_arg_names, + create_op_func=create_for_op, + region_builders=[for_body_builder], + ) + + +def _if_execute_dynamic( + pred: "ir.Value", + then_block: Callable, + else_block: Callable = None, + used_args: List[Any] = [], + mix_yield_args: List[Any] = [], + mix_yield_arg_names: List[str] = [], + if_constexpr=None, # ignoring for 
brevity +): + """ + Build an scf.if with optional else, using our universal helper. + """ + scf_gen = ScfGenerator() + + def create_if_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + # Assume final result types match the dynamic yields + result_types = [arg.type for arg in dyn_yield_ops] + + pred_ = t.as_numeric(pred) + + if not isinstance(pred_, Boolean): + # Convert to Boolean through comparison + pred_ = pred_ == True + + try: + if_op = scf.IfOp( + pred_.ir_value(), + hasElse=(else_block is not None), + results_=result_types, + ) + except Exception as e: + raise DSLRuntimeError( + f"Failed to create scf.IfOp \n\t\tpred={pred_}: type : {type(pred_)}" + ) from e + return if_op + + def then_builder( + if_op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(dyn_yield_ops, dyn_indices, dyn_class_types) + ) + return then_block(*flat_args) + + region_builders = [then_builder] + + if else_block is not None: + + def else_builder( + if_op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue( + dyn_yield_ops, dyn_indices, dyn_class_types + ) + ) + return else_block(*flat_args) + + region_builders.append(else_builder) + + return scf_gen.scf_execute_dynamic( + op_type_name="if", + used_args=used_args, + mix_iter_args=mix_yield_args, + mix_iter_arg_names=mix_yield_arg_names, + create_op_func=create_if_op, + region_builders=region_builders, + ) + + +def _while_execute_dynamic( + while_before_block: Callable, + while_after_block: Callable = None, + used_args=[], + yield_args=[], + yield_arg_names=[], +): + """ + Create and return an SCF WhileOp for dynamic loops. + Generate the dynamic loop body using SCF WhileOp. 
+ + Args: + while_before_block: Function that returns (condition, updated_values) + while_after_block: Function that returns updated values + used_args: Additional arguments used in the loop body + yield_args: Values that are updated in the loop + + See create_while_function in ast_preprocessor.py for details on the input structure. + """ + log().debug("_while_execute_dynamic") + while_op_type_name = "while" + scf_gen = ScfGenerator() + + def create_while_op( + dyn_yield_ops: List[ir.Value], + dyn_indices: Dict[int, Tuple[int, int]], + dyn_class_types: List[Any], + ): + # Create the while operation with the types from yield_args + result_types = [arg.type for arg in dyn_yield_ops] + try: + while_op = scf.WhileOp(result_types, dyn_yield_ops) + while_op.before.blocks.append(*result_types) + while_op.after.blocks.append(*result_types) + log().debug("[%s]", while_op) + return while_op + except Exception as e: + yield_ops = "\n".join( + f"\t\t{i} => {d} : type : {type(d)}" + for i, d in enumerate(dyn_yield_ops) + ) + raise DSLRuntimeError( + f"Failed to create scf.WhileOp with yield_ops:\n{yield_ops}" + ) from e + + def before_block_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Build the before (condition) block + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(block_args, dyn_indices, dyn_class_types) + ) + + log().debug("before block args: %s", flat_args) + + cond, before_results = while_before_block(*flat_args) + + if not isinstance(before_results, (list, ir.OpResultList)): + before_results = [before_results] + + log().debug("cond [%s]", cond) + log().debug( + "before_results [%s]", + before_results, + ) + + return cond, before_results + + def before_block_terminator(cond_and_results): + # Generate a condition op instead of yield op + cond = cond_and_results[0] + before_result_list = 
ScfGenerator._normalize_region_result_to_list( + cond_and_results[1] + ) + ir_cond_list, _, _, _ = cutlass_dsl.unpack_to_irvalue( + [cond], while_op_type_name + ) + ir_cond = ir_cond_list[0] + ir_results_list, _, _, _ = cutlass_dsl.unpack_to_irvalue( + before_result_list, while_op_type_name + ) + log().debug( + "creating scf.ConditionOp with [%s], [%s]", + ir_cond, + ir_results_list, + ) + scf.ConditionOp(ir_cond, ir_results_list) + + def after_block_builder( + op, block_args, used_args, dyn_yield_ops, dyn_indices, dyn_class_types + ): + # Build the after (body) block + ScfGenerator.fill_none(block_args, dyn_yield_ops) + block_args = dyn_yield_ops + flat_args = [*used_args] + flat_args.extend( + cutlass_dsl.pack_from_irvalue(block_args, dyn_indices, dyn_class_types) + ) + + log().debug("after block args: %s", flat_args) + + after_results = while_after_block(*flat_args) + + if not isinstance(after_results, (list, ir.OpResultList)): + after_results = [after_results] + + log().debug( + "after_results [%s]", + after_results, + ) + + return after_results + + # Call the universal SCF executor with two region builders + return scf_gen.scf_execute_dynamic( + op_type_name=while_op_type_name, + used_args=used_args, + mix_iter_args=yield_args, + mix_iter_arg_names=yield_arg_names, + create_op_func=create_while_op, + region_builders=[before_block_builder, after_block_builder], + block_term_op_builder={ + before_block_builder: before_block_terminator + }, # Only customize the before block + ) diff --git a/python/CuTeDSL/requirements.txt b/python/CuTeDSL/requirements.txt new file mode 100644 index 00000000..78ff7a28 --- /dev/null +++ b/python/CuTeDSL/requirements.txt @@ -0,0 +1,3 @@ +# Use `pip install -r requirements.txt` with the present file to install a +# wheel consistent with the present state of the github repository +nvidia-cutlass-dsl=4.0.0.dev1 diff --git a/python/cutlass/__init__.py b/python/cutlass/__init__.py index 6cbc9eef..bc2a98d9 100644 --- 
a/python/cutlass/__init__.py +++ b/python/cutlass/__init__.py @@ -133,7 +133,7 @@ def get_option_registry(): this._option_registry = OptionRegistry(device_cc()) return this._option_registry -this.__version__ = '3.9.2' +this.__version__ = '4.0.0' from cutlass.backend import create_memory_pool from cutlass.emit.pytorch import pytorch diff --git a/python/cutlass/op/conv.py b/python/cutlass/op/conv.py index 0e8366ab..3639d477 100644 --- a/python/cutlass/op/conv.py +++ b/python/cutlass/op/conv.py @@ -111,6 +111,7 @@ args.sync() """ + from __future__ import annotations from typing import Optional from cutlass.utils.lazy_import import lazy_import diff --git a/python/cutlass/utils/lazy_import.py b/python/cutlass/utils/lazy_import.py index 28ba6546..16f6a185 100644 --- a/python/cutlass/utils/lazy_import.py +++ b/python/cutlass/utils/lazy_import.py @@ -1,3 +1,34 @@ +################################################################################################# +# +# Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +################################################################################################# import importlib from typing import Any @@ -8,4 +39,3 @@ def lazy_import(mod_name: str) -> Any: return getattr(module, name) return Lazy() - diff --git a/python/cutlass/utils/profiler.py b/python/cutlass/utils/profiler.py index 155c1d35..5733f3ba 100644 --- a/python/cutlass/utils/profiler.py +++ b/python/cutlass/utils/profiler.py @@ -193,3 +193,4 @@ class CUDAEventProfiler: flops_ += m * n * batch_count * 2 return flops_ + diff --git a/python/cutlass_library/emit_kernel_listing.py b/python/cutlass_library/emit_kernel_listing.py index a6eca001..70ba077e 100755 --- a/python/cutlass_library/emit_kernel_listing.py +++ b/python/cutlass_library/emit_kernel_listing.py @@ -75,15 +75,10 @@ audit_csv_runtime_fields = [ ] def hash_cutlass_string(input_string): - # Regex pattern to match instruction shape - instruction_shape_pattern = r"[a-zA-Z]\d+x\d+x\d+" # Matches '_s128x128x64', '_h64x128x16', etc. 
mma_cluster_shape_pattern = r"_\d+x\d+x\d+" # Matches MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1') - # Remove instruction shape (e.g., '_s128x128x64', '_h64x128x16') - output = re.sub(instruction_shape_pattern, "", input_string) - # Remove MMA and Cluster shapes (e.g., '_128x128x256', '_0x0x1') - output = re.sub(mma_cluster_shape_pattern, "", output) + output = re.sub(mma_cluster_shape_pattern, "", input_string) return output @@ -288,7 +283,7 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode # TODO: randomize beta values for wider coverage beta_values = [0.5] - is_supported_arch = (arch in ["100a", "101a", "120a"]) + is_supported_arch = (arch in ["100a", "100f", "101a", "101f", "120a", "120f"]) is_runtime_datatype_enabled = mode == "functional_L0" and is_supported_arch @@ -300,23 +295,23 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode # sm100_mma_data_type_general = [ - 'x16gemm_f16_f16_f16_f16_f16', - 'x16gemm_f16_f16_f16_void_f16', - 'x16gemm_f16_f16_f32_f16_f16', - 'x8tf32gemm_f32_f32_f32_f32_f32', - 'x16bf16gemm_f32_f32_f32_f32_f32', + 'gemm_f16_f16_f16_f16_f16', + 'gemm_f16_f16_f16_void_f16', + 'gemm_f16_f16_f32_f16_f16', + 'tf32gemm_f32_f32_f32_f32_f32', + 'bf16gemm_f32_f32_f32_f32_f32', ] sm100_mma_data_type_runtime_dtype = [ - 'x32gemm_f4_f4_f32_f32_f32', - 'x32gemm_f6_f6_f32_f32_f32', - 'x32gemm_f8_f8_f32_f32_f32', + 'gemm_f4_f4_f32_f32_f32', + 'gemm_f6_f6_f32_f32_f32', + 'gemm_f8_f8_f32_f32_f32', ] sm100_mma_data_type_mergeable = [ - 'x32gemm_e4m3_e4m3_f32_f32_f32',# mask out one instance for verification - 'x32gemm_e2m1_e2m1_f32_f32_f32', - 'x32gemm_e3m2_e3m2_f32_f32_f32', + 'gemm_e4m3_e4m3_f32_f32_f32',# mask out one instance for verification + 'gemm_e2m1_e2m1_f32_f32_f32', + 'gemm_e3m2_e3m2_f32_f32_f32', ] sm100_mma_cluster_size = [ @@ -331,22 +326,15 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode 'ntn' ] - sm100_mma_instruction_shape = [ - # [0] .1CTA, General - ['64x128', 
'128x128', '128x256'], - # [1] .2CTA, General - ['128x128', '256x128', '256x256'], - ] - # regex list must be in kernel procedural name order - mergeable_sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - mergeable_sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + mergeable_sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + mergeable_sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_mergeable, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" - sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_general, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" - sm100_mma_filter_regex_1sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_data_type_runtime_dtype, 
sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm_runtime = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_data_type_runtime_dtype, sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" # # Block Scale Gemm @@ -354,19 +342,19 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode block_scaled_data_type_base = [ # runtime datatypes - 'x32gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', - 'x64gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', - 'x32gemm.*ue8m0xf4_ue8m0xf6_f32_f16_e5m2', - 'x64gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1', - 'x32gemm.*ue8m0xf6_ue8m0xf6_f32_f16_ue8m0xe3m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf6_f32_f16_e5m2', + 'gemm.*ue8m0xf4_ue8m0xf4_f32_f16_ue8m0xe2m1', + 'gemm.*ue8m0xf6_ue8m0xf6_f32_f16_ue8m0xe3m2', ] block_scaled_data_type_mergeable = [ - 'x32gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', - 'x64gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', - 'x32gemm.*ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', - 'x64gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1', - 'x32gemm.*ue8m0xe2m3_ue8m0xe2m3_f32_f16_ue8m0xe3m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', + 'gemm.*ue8m0xe2m1_ue8m0xe2m1_f32_f16_ue8m0xe2m1', + 'gemm.*ue8m0xe2m3_ue8m0xe2m3_f32_f16_ue8m0xe3m2', ] block_scaled_data_type = block_scaled_data_type_base + block_scaled_data_type_mergeable @@ -377,56 +365,43 @@ def 
emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode ] block_scaled_layouts = ['tnt'] - block_scaled_instruction_shape = [ - # .1CTA - ['128x128', '128x192', '128x256'], - # .2CTA - ['256x128', '256x192', '256x256'], - ] # regex list must be in kernel procedural name order - mergeable_block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - mergeable_block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + mergeable_block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + mergeable_block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type_mergeable, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" - block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, 
block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" - if arch == "100a": + if arch == "100a" or arch == "100f": kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({sm100_mma_filter_regex_1sm_runtime})|" \ f"({sm100_mma_filter_regex_2sm_runtime})|" \ f"({block_scaled_filter_regex_1sm})|" \ f"({block_scaled_filter_regex_2sm})" - elif arch == "101a": + elif arch == "101a" or arch == "101f": kernel_filter = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({sm100_mma_filter_regex_1sm_runtime})|" \ f"({sm100_mma_filter_regex_2sm_runtime})|" \ f"({block_scaled_filter_regex_1sm})|" \ f"({block_scaled_filter_regex_2sm})" - elif arch == "120a": + elif arch == "120a" or arch == "120f": # blockscaled sm120_mma kernels blockscaled_sm120_mma_kernel_cta_tiles = [ [ '128x128' ] ] - # sm120 MMA instruction shapes - blockscaled_sm120_mma_instruction_shapes = [ - [ 's16x8x64gemm', - 's16x8x32gemm' - ] - ] - # Restrict to two layouts to reduce L0 build and test time. 
blockscaled_sm120_mma_layouts = [ 'tn' ] - filter_regex_blockscaled_sm120_mma = "cutlass3x_sm120_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [blockscaled_sm120_mma_instruction_shapes[0], blockscaled_sm120_mma_kernel_cta_tiles[0], blockscaled_sm120_mma_layouts]]) + ").*" + filter_regex_blockscaled_sm120_mma = "cutlass3x_sm120_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [blockscaled_sm120_mma_kernel_cta_tiles[0], blockscaled_sm120_mma_layouts]]) + ").*" problem_waves = [0.5, 1.25, 2.5] kernel_filter = f"({filter_regex_blockscaled_sm120_mma})" else: - error_message = "unsupported arch, only support sm100a, sm101a, sm120a" + error_message = "unsupported arch, only support sm100a, sm100f, sm101a, sm101f, sm120a, sm120f" raise Exception(error_message) # Statically encoded kernels are still added to generated_kernels @@ -445,14 +420,8 @@ def emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode ] # Restrict to two layouts to reduce L1 build and test time. sm100_mma_layouts = ['tnt', 'ntn'] - sm100_mma_instruction_shape = [ - # .1CTA - ['64x128', '128x128', '128x256'], - # .2CTA - ['128x128', '256x128', '256x256'] - ] - sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[0], sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" - sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_instruction_shape[1], sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" + sm100_mma_filter_regex_1sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*1sm.*" + sm100_mma_filter_regex_2sm = "cutlass3x_sm100_tensorop.*(" + ").*(".join([ "|".join(x) for x in [sm100_mma_cluster_size, sm100_mma_layouts]]) + ").*2sm.*" block_scaled_data_type = [ 'ue8m0xe2m1_ue8m0xe2m1_f32_f16_e5m2', 'ue8m0xe2m1_ue8m0xe2m3_f32_f16_e5m2', @@ -463,15 +432,10 @@ def 
emit_gemm_kernel_testlist(manifest, curr_build_dir, arch, mode block_scaled_cluster_size = ['4x4x1', '2x1x1', '0x0x1'] block_scaled_layouts = ['tnt'] - block_scaled_instruction_shape = [ - # .1CTA - ['128x128', '128x192', '128x256'], - # .2CTA - ['256x128', '256x192', '256x256'], - ] + # regex list must be in kernel procedural name order - block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[0], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" - block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_instruction_shape[1], block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" + block_scaled_filter_regex_1sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*1sm.*" + block_scaled_filter_regex_2sm = "cutlass3x_sm100_bstensorop.*(" + ").*(".join([ "|".join(x) for x in [block_scaled_data_type, block_scaled_cluster_size, block_scaled_layouts]]) + ").*2sm.*" filter_regex_sm100_mma = f"({sm100_mma_filter_regex_1sm})|" \ f"({sm100_mma_filter_regex_2sm})|" \ f"({block_scaled_filter_regex_1sm})|" \ diff --git a/python/cutlass_library/gemm_operation.py b/python/cutlass_library/gemm_operation.py index 54acee63..f85e160f 100644 --- a/python/cutlass_library/gemm_operation.py +++ b/python/cutlass_library/gemm_operation.py @@ -183,10 +183,7 @@ class GemmOperation: math_op = self.tile_description.math_instruction.math_operation math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else '' - if self.is_3x: - inst_shape = "{0}x{1}x{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) - else: - inst_shape = "{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) + inst_shape = 
"{0}{1}{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) if not self.is_3x else "" inst_shape += math_op_string @@ -194,7 +191,9 @@ class GemmOperation: self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] - return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, GemmKindNames[self.gemm_kind]) + short_math_name = self.short_math_name() if not self.is_3x else "" + + return "%s%s%s%s" % (short_math_name, inst_shape, intermediate_type, GemmKindNames[self.gemm_kind]) # Generates a string representing the MMA instruction. def extended_name(self): @@ -337,18 +336,36 @@ class GemmOperation: def opcode_class_name(self): return OpcodeClassNames[self.tile_description.math_instruction.opcode_class] + def get_collective_tile_shape(self): + """ + Get the tile shape passed to the collective builder. + On Blackwell, this is different than the operation.tile_description.tile_shape. + """ + is_sm100_kernel = (self.arch == 100) + if not is_sm100_kernel: + return self.tile_description.tile_shape + + opcode_class_main = self.tile_description.math_instruction.opcode_class + instruction_shape = self.tile_description.math_instruction.instruction_shape + tile_shape_m, tile_shape_n, tile_shape_k = self.tile_description.tile_shape + if opcode_class_main in [OpcodeClass.TensorOp, OpcodeClass.BlockScaledTensorOp, OpcodeClass.SparseTensorOp]: + tile_shape_m = instruction_shape[0] + tile_shape_n = instruction_shape[1] + return (tile_shape_m, tile_shape_n, tile_shape_k) + # Generates the full kernel function name def procedural_name(self): ''' The full procedural name indicates architecture, extended name, tile size, and layout. 
''' opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] if self.arch >= 90: kernel_name_template = "cutlass{p}_sm{ar}_{op}_{ex}{ct}{cs}_{l}_{s}_align{al}{t}{k}{e}" + tile_shape = self.get_collective_tile_shape() return kernel_name_template.format( p = self.prefix, ar = self.arch, op = opcode_class_name, ex = self.extended_name_3x(), - ct = '_' + 'x'.join([str(i) for i in self.tile_description.tile_shape]) if self.tile_description.tile_shape[0] > 0 else "", + ct = '_' + 'x'.join([str(i) for i in tile_shape]) if tile_shape[0] > 0 else "", cs = '_' + 'x'.join([str(i) for i in self.tile_description.cluster_shape]), l = self.tile_description.stages, s = self.layout_name_3x(), @@ -920,28 +937,8 @@ ${compile_guard_end} instruction_shape = operation.tile_description.math_instruction.instruction_shape cluster_m = operation.tile_description.cluster_shape[0] cluster_n = operation.tile_description.cluster_shape[1] - - tile_shape_m, tile_shape_n, tile_shape_k = tile_shape - - # account for static/dynamic cluster shapes - cta_m = tile_shape[0] // cluster_m if cluster_m > 0 else tile_shape[0] cta_n = tile_shape[1] // cluster_n if cluster_n > 0 else tile_shape[1] - - - # Shape passed to epilogue builder - is_sm100_kernel = (operation.arch == 100) - if is_sm100_kernel: - cta_m_per_mma_instruction = 2 if "2sm" in operation.procedural_name() else 1 - if cluster_m <= 0: - cta_m = cta_m // cta_m_per_mma_instruction - - if opcode_class_main in [OpcodeClass.TensorOp - , OpcodeClass.BlockScaledTensorOp - , OpcodeClass.SparseTensorOp - ]: - tile_shape_m = instruction_shape[0] - tile_shape_n = instruction_shape[1] - + tile_shape_m, tile_shape_n, tile_shape_k = operation.get_collective_tile_shape() # stage count set to zero indicates builder automatic stage selection if operation.tile_description.stages > 0: diff --git a/python/cutlass_library/generator.py b/python/cutlass_library/generator.py index 20f8e828..58b605ad 100644 --- 
a/python/cutlass_library/generator.py +++ b/python/cutlass_library/generator.py @@ -1003,14 +1003,11 @@ class ConvOperation3x: math_op = self.tile_description.math_instruction.math_operation math_op_string = math_operations_map[math_op] if math_op in math_operations_map.keys() else '' - inst_shape = "{0}x{1}x{2}".format(*tuple(self.tile_description.math_instruction.instruction_shape)) - inst_shape += math_op_string - if self.tile_description.math_instruction.element_a != self.A.element and \ self.tile_description.math_instruction.element_a != self.tile_description.math_instruction.element_accumulator: intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] - return "%s%s%s%s" % (self.short_math_name(), inst_shape, intermediate_type, ConvKindNames[self.conv_kind]) + return "%s%s%s" % (math_op_string, intermediate_type, ConvKindNames[self.conv_kind]) def extended_name(self): '''Generates a string representing the MMA atom. Assumes accumulator type is C type.''' @@ -5997,8 +5994,8 @@ def GenerateSM90_TensorOp_mixed_dtype_WGMMA_gemm(manifest, cuda_version): math_instructions = generate_mixed_dtype_math_instructions_sm90(instantiation_level, valid_types_for_a_b_acc) - valid_types_for_d = [DataType.f32] - valid_types_for_c = [DataType.f32] + valid_types_for_d = [DataType.f32, DataType.bf16, DataType.f16, DataType.e4m3, DataType.e5m2] + valid_types_for_c = copy.deepcopy(valid_types_for_d) tile_descriptions = generate_tile_descriptions_sm90( math_instructions=math_instructions, @@ -6009,6 +6006,12 @@ def GenerateSM90_TensorOp_mixed_dtype_WGMMA_gemm(manifest, cuda_version): math_inst = tile_desc.math_instruction data_types = [] + # Limit C/D types to avoid a giant number of instantiations. + # A typical use case for mixed dtype in DL is weight quantization (tensor A), + # therefore we can limit the output type to that of activation (tensor B). 
+ valid_types_for_c = [math_inst.element_b] + valid_types_for_d = [math_inst.element_b] + for c_type, d_type in product(valid_types_for_c, valid_types_for_d): data_types.append( generate_data_types_from_math_instruction( @@ -6791,6 +6794,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default ] @@ -6838,6 +6846,11 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version): cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1] , DynamicClusterShape ] + + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] for math_inst in math_instructions_2sm: tile_descriptions = [] @@ -6937,6 +6950,11 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default ] @@ -7090,6 +7108,11 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -7247,6 +7270,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [2,1,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default, ] @@ -7456,6 +7484,11 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version, gemm_kind=GemmK , 
DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -7916,6 +7949,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [2,1,1], + [1,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -7985,6 +8025,12 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8138,6 +8184,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,1,1], + [2,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -8211,6 +8264,13 @@ def GenerateSM100_TensorOp_mixed_8bits_UMMA_gemm_with_block_scaled(manifest, cud , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], + [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8417,6 +8477,13 @@ def GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,1,1], + [2,1,1] + , DynamicClusterShape + ] + # 1xSM MMA kernels for math_inst in math_instructions_1sm: tile_descriptions = [] @@ -8537,6 +8604,13 @@ def 
GenerateSM100_TensorOp_fp4_UMMA_gemm_with_block_scaled(manifest, cuda_versio , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], + [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8689,6 +8763,11 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,2,1], [2,1,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.Default, ] @@ -8788,6 +8867,11 @@ def GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -8925,6 +9009,9 @@ def GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -8953,6 +9040,9 @@ def GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9044,6 +9134,9 @@ def GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version): for math_inst in 
math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9072,6 +9165,9 @@ def GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9163,6 +9259,9 @@ def GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9191,6 +9290,9 @@ def GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9287,6 +9389,9 @@ def GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == 
DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9319,6 +9424,9 @@ def GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9417,6 +9525,9 @@ def GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_1sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_1sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_1sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else cluster_shape tile_descriptions.append( TileDescription([ @@ -9476,6 +9587,9 @@ def GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version): for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in sm100_cluster_shape_2sm: + if 101 in manifest.compute_capabilities : + if cluster_shape == [4,4,1] : + continue multiplier_2sm = (1, 1, 1) if cluster_shape == DynamicClusterShape else (cluster_shape[0] // 2, cluster_shape[1], cluster_shape[2]) tile_descriptions.append( TileDescription([ @@ -9578,6 +9692,12 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [1,1,1], [1,4,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK, ] @@ -9612,6 +9732,12 @@ def GenerateSM100_TensorOp_32b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , 
DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9658,6 +9784,12 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [1,1,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK ] @@ -9726,6 +9858,12 @@ def GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9809,6 +9947,12 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [ + [1,2,1], [2,1,1], [1,1,1] + , DynamicClusterShape + ] + tile_schedulers = [ TileSchedulerType.StreamK, ] @@ -9861,6 +10005,12 @@ def GenerateSM100_TensorOp_fp8_UMMA_gemm_stream_k(manifest, cuda_version): , DynamicClusterShape ] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [ + [2,1,1], [2,2,1], [2,4,1], [4,1,1] + , DynamicClusterShape + ] + for math_inst in math_instructions_2sm: tile_descriptions = [] for cluster_shape in cluster_shapes_2sm: @@ -9960,6 +10110,9 @@ def GenerateSM100_TensorOp_16b_UMMA_conv3x(manifest, cuda_version, cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1],[4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1]] + # tile_descriptions is a 2-level list. # Each inner list is for each cluster shape. 
for math_inst, output_type in math_instructions_w_output_1sm: @@ -10023,6 +10176,8 @@ def GenerateSM100_TensorOp_16b_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_2sm) cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1]] for math_inst, output_type in math_instructions_w_output_2sm: tile_descriptions = [] @@ -10103,6 +10258,8 @@ def GenerateSM100_TensorOp_fp8_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_1sm) cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1],[4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_1sm = [[1,1,1], [1,2,1], [1,4,1]] for math_inst, output_type in math_instructions_w_output_1sm: tile_descriptions = [] @@ -10166,6 +10323,8 @@ def GenerateSM100_TensorOp_fp8_UMMA_conv3x(manifest, cuda_version, data_types_and_instruction_shapes_2sm) cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1], [4,4,1]] + if 101 in manifest.compute_capabilities : + cluster_shapes_2sm = [[2,1,1], [2,2,1], [2,4,1], [4,1,1], [4,2,1]] for math_inst, output_type in math_instructions_w_output_2sm: tile_descriptions = [] @@ -10629,6 +10788,8 @@ def GenerateSM100(manifest, cuda_version): # # Dense Gemm # + architectures = manifest.args.architectures.split(';') if len(args.architectures) else ['50',] + GenerateSM100_TensorOp_16b_UMMA_gemm(manifest, cuda_version) GenerateSM100_TensorOp_32b_UMMA_gemm(manifest, cuda_version) @@ -10636,7 +10797,8 @@ def GenerateSM100(manifest, cuda_version): GenerateSM100_TensorOp_16b_UMMA_gemm_stream_k(manifest, cuda_version) - GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version) + if '100f' not in architectures and '101f' not in architectures: + GenerateSM100_TensorOp_int8_UMMA_gemm(manifest, cuda_version) GenerateSM100_TensorOp_fp8_UMMA_gemm(manifest, cuda_version) # grouped GEMM @@ -10657,7 +10819,8 @@ def 
GenerateSM100(manifest, cuda_version): # GenerateSM100_SparseTensorOp_32b_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_16b_UMMA_gemm(manifest, cuda_version) - GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version) + if '100f' not in architectures and '101f' not in architectures: + GenerateSM100_SparseTensorOp_int8_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_fp8_UMMA_gemm(manifest, cuda_version) GenerateSM100_SparseTensorOp_mixed_8bits_UMMA_gemm(manifest, cuda_version) @@ -11166,7 +11329,7 @@ if __name__ == "__main__": GenerateSM89(manifest, args.cuda_version) GenerateSM90(manifest, args.cuda_version) - blackwell_enabled_arch = any(arch in ["100a", "101a", "120a"] for arch in archs) + blackwell_enabled_arch = any(arch in ["100a", "100f", "101a", "101f", "120a", "120f"] for arch in archs) if blackwell_enabled_arch: GenerateSM100(manifest, args.cuda_version) GenerateSM120(manifest, args.cuda_version) diff --git a/python/cutlass_library/manifest.py b/python/cutlass_library/manifest.py index 38d0f764..d10ec125 100644 --- a/python/cutlass_library/manifest.py +++ b/python/cutlass_library/manifest.py @@ -523,10 +523,14 @@ class Manifest: arch_conditional_cc = [ '90a', '100a', + '100f', '101a', - '120a' + '101f', + '120a', + '120f' ] architectures = [x if x not in arch_conditional_cc else x.split('a')[0] for x in architectures] + architectures = [x if x not in arch_conditional_cc else x.split('f')[0] for x in architectures] self.compute_capabilities = [int(x) for x in architectures] diff --git a/python/cutlass_library/sm90_utils.py b/python/cutlass_library/sm90_utils.py index 63ff6f1f..8ea870ec 100644 --- a/python/cutlass_library/sm90_utils.py +++ b/python/cutlass_library/sm90_utils.py @@ -375,6 +375,13 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: mma_multipliers, cluster_sizes = get_mma_multipliers(level), get_cluster_sizes(level, is_aligned) for math_inst, mma_mul, cluster_size in 
product(math_instructions, mma_multipliers, cluster_sizes): + # generator can stamp out duplicate kernels, because it doesn't explicitly set instruction + # shape for SM90 kernels, and the 3.X collective API doesn't directly expose them when using + # the auto kernel schedule. + + math_inst_stub = copy.deepcopy(math_inst) + math_inst_stub.instruction_shape = [0, 0, 0] + tile_desc = TileDescription( threadblock_shape=[ math_inst.instruction_shape[0] * mma_mul[0], @@ -383,7 +390,7 @@ def generate_tile_descriptions_sm90(math_instructions, is_aligned: bool, level: ], stages=0, warp_count=[4, 1, 1], - math_instruction=math_inst, + math_instruction=math_inst_stub, min_compute=90, max_compute=90, cluster_shape=cluster_size) @@ -551,6 +558,7 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, b_type_size = DataTypeSize[data_types["b_type"]] if a_type_size != b_type_size and CudaToolkitVersionSatisfies(cuda_version, 12, 1): schedules = [] + stream_k_schedules = [] epilogue_schedule = EpilogueScheduleType.TmaWarpSpecialized if a_type_size > b_type_size: epilogue_schedule = EpilogueScheduleType.EpilogueTransposed @@ -579,7 +587,11 @@ def get_valid_schedules(tile_description, cuda_version, is_aligned, data_types, KernelScheduleType.TmaWarpSpecializedCooperative, epilogue_schedule ]) - return schedules, [] + stream_k_schedules.append([ + KernelScheduleType.TmaWarpSpecializedCooperative, + epilogue_schedule + ]) + return schedules, stream_k_schedules if not is_aligned and not is_blockwise(gemm_kind): schedules = [[KernelScheduleType.CpAsyncWarpSpecialized, diff --git a/python/setup_library.py b/python/setup_library.py index 8262e5a7..3738e24d 100644 --- a/python/setup_library.py +++ b/python/setup_library.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='cutlass_library', - version='3.9.2', + version='4.0.0', description='CUTLASS library generation scripts', packages=['cutlass_library'] ) diff --git 
a/python/setup_pycute.py b/python/setup_pycute.py index cb945049..b84a228a 100644 --- a/python/setup_pycute.py +++ b/python/setup_pycute.py @@ -36,7 +36,7 @@ from setuptools import setup def perform_setup(): setup( name='pycute', - version='3.9.2', + version='4.0.0', description='Python implementation of CuTe', packages=['pycute'], ) diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 8d30e790..00a1fef7 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -658,7 +658,6 @@ cutlass_test_unit_gemm_device_add_executable( # Syrk SM80 complex f64 tests syrk_cf64n_cf64t_tensor_op_f64_sm80.cu syrk_cf64n_cf64n_tensor_op_f64_sm80.cu - syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu # Syrk SM80 complex f32 tests syrk_cf32n_cf32t_tensor_op_f32_sm80.cu @@ -703,7 +702,6 @@ cutlass_test_unit_gemm_device_add_executable( # Trmm SM80 complex f64 tests trmm_cf64n_cf64n_cf64t_tensor_op_f64_sm80.cu - trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu # Trmm SM80 complex f32 tests trmm_cf32n_cf32n_cf32t_tensor_op_f32_sm80.cu @@ -776,7 +774,6 @@ cutlass_test_unit_gemm_device_add_executable( # Symm SM80 complex f64 tests symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_sm80.cu symm_cf64n_cf64n_cf64n_tensor_op_rs_f64_sm80.cu - symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu # Symm SM80 complex f32 tests symm_cf32n_cf32n_tensor_op_f32_ls_sm80.cu @@ -793,7 +790,6 @@ cutlass_test_unit_gemm_device_add_executable( # Hemm SM80 complex f64 tests hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_sm80.cu hemm_cf64h_cf64n_cf64n_tensor_op_rs_f64_sm80.cu - hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu # Hemm SM80 complex f32 tests hemm_cf32h_cf32n_tensor_op_f32_ls_sm80.cu @@ -805,6 +801,20 @@ cutlass_test_unit_gemm_device_add_executable( hemm_cf64_cf64_cf64_tensor_op_f64_sm90.cu ) +if (NOT CUTLASS_NVCC_ARCHS MATCHES 101|101a|101f|103|103a|103f) +cutlass_test_unit_gemm_device_add_executable( + 
cutlass_test_unit_gemm_device_blas3_gaussian + + BATCH_SOURCES ON + BATCH_SIZE 4 + + syrk_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + trmm_cf64n_cf64n_cf64t_tensor_op_f64_gaussian_sm80.cu + symm_cf64n_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu + hemm_cf64h_cf64n_cf64n_tensor_op_ls_f64_gaussian_sm80.cu +) +endif() + cutlass_test_unit_gemm_device_add_executable( cutlass_test_unit_gemm_device_grouped_blas3 @@ -930,6 +940,13 @@ cutlass_test_unit_gemm_device_add_executable( # 8 unit tests sm100_gemm_f6_f6_f32_tensor_op_f32_ptr_array.cu ) + +cutlass_test_unit_gemm_device_add_executable( + cutlass_test_unit_blockwise_gemm_sm100 + + sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu +) + endif() diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu index ae3dd8da..829ab7ef 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu index 4468cc08..57cf69b5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu index 7a2b8b4a..32e576e8 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu index 59ea632d..9a0dda14 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f16_mxf8_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu index e5162951..c8ef6c76 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1285,7 +1285,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1361,8 +1361,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1372,8 +1372,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1383,8 +1383,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1394,8 +1394,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1405,8 +1405,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1416,8 +1416,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1427,8 +1427,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1438,8 +1438,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu index 49669982..ce88316b 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -753,7 +753,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -829,7 +829,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -981,7 +981,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1057,7 +1057,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1133,7 +1133,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1209,7 +1209,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1286,7 +1286,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1363,8 +1363,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1374,8 +1374,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1385,8 +1385,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1396,8 +1396,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x512_0_vs64_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1407,8 +1407,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1418,8 +1418,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1429,8 +1429,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1440,8 +1440,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x512_0_vs64_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu index 636139cd..5e2b22c5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf4_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu index c9660746..aa7314a3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu index 2d1f7fe2..742b437d 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf4_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m1_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu index 3a2b8fb0..3d703dd9 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu index 3645ef36..99575646 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe3m2_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu index d3850ca0..c0eda537 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf6_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe2m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu index 78173b29..fe98bbde 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu index 875385b1..53ae2ca6 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f16_mxf8_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu index 07b8f150..6130a8b3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf4_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m1_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu index e3040f86..01d7066f 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf6_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -578,7 +578,7 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -654,7 +654,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -730,7 +730,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -806,7 +806,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -882,7 +882,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -958,7 +958,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -1034,8 +1034,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3 } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1045,8 +1045,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1056,8 +1056,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1067,8 +1067,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1078,8 +1078,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -1089,8 +1089,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe2m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu index d6909000..614e69d8 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu index c9661953..4690024d 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu index 024246b1..9f209c05 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu index 2c751282..2d0bc431 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_nnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::ColumnMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_nnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_nnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu index 43eb3bea..fa8791ad 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu index 14e48873..dde24b65 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu index 6a8608e4..9df684fa 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_tnt_streamk.cu @@ -56,7 +56,7 @@ using namespace cute; // 6. 256x256_tnt_vs64in // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -125,7 +125,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -263,7 +263,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -332,7 +332,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -401,7 +401,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -470,8 +470,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -481,8 +481,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -492,8 +492,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -503,8 +503,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -514,8 +514,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -525,8 +525,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_e4m3_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu index 67130534..4e4a7579 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu index 9572cd45..95f428a5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f16_mxf8_q_ttt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::RowMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x128x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x192x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_128x256x256_0_vs64_ttt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x128x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x192x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f16_ue8m0xe4m3_256x256x256_0_vs64_ttt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu index 4793d9c6..ffa652c5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu index 61294715..3e51cd57 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_f32_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_f32_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu index b6e73cb7..1a0a3359 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu index 52e3a45e..7213e721 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f16_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f16_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu index a4c596cf..08aebed5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnn_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnn_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu index 1a00d07f..41512aec 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_f32_q_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x128x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x192x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_128x256x256_0_vs64_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x128x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x192x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_f32_256x256x256_0_vs64_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu index 1c48ee8f..552ffbf7 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnn_align32_q_1sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnn_align32_q_2sm_epiVs64n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu index 6527b6a5..a92a16d3 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_mxf8_mxf8_f32_void_mxf8_q_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,8 +546,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3 } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x128x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -557,8 +557,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x192x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -568,8 +568,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_128x256x256_0_vs64_tnt_align32_q_1sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -579,8 +579,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x128x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -590,8 +590,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x192x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -601,8 +601,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x192x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_ } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x64bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue8m0xe4m3_ue8m0xe4m3_f32_void_ue8m0xe4m3_256x256x256_0_vs64_tnt_align32_q_2sm_epiVs64t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu index 4112130f..e363feee 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu index b667e416..81c13779 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_f16_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu index fe87a252..c4881802 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu index f3916d89..437dd2b5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu +++ 
b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu index c6e16ca2..04dc3fa5 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f16_nvf4_o_tnt_streamk.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,8 +512,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -523,8 +523,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -534,8 +534,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -545,8 +545,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -556,8 +556,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -567,8 +567,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm, streamk) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm, streamk) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f16_e2m1_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu index f89539f2..cd81e2bf 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu index 21d3b3d8..5c195462 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_f32_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -666,8 +666,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -677,8 +677,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -688,8 +688,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -699,8 +699,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -710,8 +710,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -721,8 +721,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -732,8 +732,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -743,8 +743,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_f32_f32_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu index 4787b0d4..91dc0d64 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu index 1a40c960..a5594a86 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f16_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f16_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu index f8300b56..3f82337e 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnn.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -588,7 +588,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -664,8 +664,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -675,8 +675,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -686,8 +686,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -697,8 +697,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnn_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -708,8 +708,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnn_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu index 4c07b52c..0a197a48 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_f32_o_tnt.cu @@ -56,7 +56,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -208,7 +208,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -284,7 +284,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -360,7 +360,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -436,7 +436,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -512,7 +512,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -666,8 +666,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x128x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -677,8 +677,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x192x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -688,8 +688,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x256_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -699,8 +699,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_128x256x512_0_vs32_tnt_align64_o_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -710,8 +710,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x128x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -721,8 +721,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x192x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -732,8 +732,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x256_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -743,8 +743,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm, functional) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_f32_256x256x512_0_vs32_tnt_align64_o_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu index 7a2a9414..a5145287 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu +++ b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnn_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. 
-namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. 
-TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnn_align64_o_1sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. 
-TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnn_align64_o_2sm_epiVs32n; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu index 2f1316e0..deb1837a 100644 --- a/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu +++ 
b/test/unit/gemm/device/sm100_blockscaled_sparse_tensorop_gemm/sm100_bssp_gemm_nvf4_nvf4_f32_void_nvf4_o_tnt_sfd.cu @@ -60,7 +60,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -141,7 +141,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 2. -namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -222,7 +222,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3. 
-namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -303,7 +303,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 3.2 -namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 4. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -465,7 +465,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 5. -namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -546,7 +546,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6. 
-namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -627,7 +627,7 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 6.2 -namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { +namespace cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -708,8 +708,8 @@ namespace cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m } // 1. -TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x128x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -719,8 +719,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 2. 
-TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x192x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -730,8 +730,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3. -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x256_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -741,8 +741,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 3.2 -TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_128x256x512_0_vs32_tnt_align64_o_1sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -752,8 +752,8 @@ TEST(cutlass3x_sm100_bssptensorop_s128x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 4. -TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x128x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -763,8 +763,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 5. 
-TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x192x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -774,8 +774,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x128x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6. -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; +TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x256_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -785,8 +785,8 @@ TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32 } // 6.2 -TEST(cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { - namespace gemm = cutlass3x_sm100_bssptensorop_s256x256x128bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; 
+TEST(cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t, sfd_fusion) { + namespace gemm = cutlass3x_sm100_bssptensorop_bsspgemm_ue4m3xe2m1_ue4m3xe2m1_f32_void_ue4m3xe2m1_256x256x512_0_vs32_tnt_align64_o_2sm_epiVs32t; EXPECT_TRUE(test::gemm::device::TestSmallFusion( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu b/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu new file mode 100644 index 00000000..9ce8817e --- /dev/null +++ b/test/unit/gemm/device/sm100_gemm_f8_f8_f8_tensor_op_f32_blockwise.cu @@ -0,0 +1,320 @@ +/*************************************************************************************************** + * Copyright (c) 2025 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief Tests for device-wide GEMM interface +*/ + +#include + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" + +#include "cutlass/numeric_types.h" + +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" + +#include "cutlass/epilogue/dispatch_policy.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" + +#include "cutlass/epilogue/thread/activation.h" +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/host/gett.hpp" +#include "cutlass/util/device_memory.h" + +using namespace cute; + +#if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) + +template +bool groupwise_test( + Int, Int, Int, C, + LayoutA, LayoutB, LayoutCD, + MmaTileShape, ClusterShape) { + + using ScaleConfig = cutlass::detail::Sm100BlockwiseScaleConfig; + using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA()); // Layout type for SFA matrix operand + using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB()); // Layout type for SFB matrix operand + + using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm100, 
cutlass::arch::OpClassTensorOp, + MmaTileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + cutlass::float_e4m3_t, LayoutCD, 16, + cutlass::float_e4m3_t, LayoutCD, 16, + conditional_t + >::CollectiveOp; + + using CollectiveMainloop = + typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm100, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, cute::tuple, 16, + cutlass::float_e4m3_t, cute::tuple, 16, + float, + MmaTileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout, + conditional_t + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + cute::Shape, + CollectiveMainloop, + CollectiveEpilogue>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + // Strides just iterate over scalars and have no zeros + LayoutSFA layout_SFA; + LayoutSFB layout_SFB; + + int alignment_M = max(max((is_same_v ? 16 : 1) , + (SFAMajor == UMMA::Major::MN ? CollectiveMainloop::AlignmentSFA : 1)), + (is_same_v ? 16 : 1)); + + int alignment_N = max(max((is_same_v ? 16 : 1) , + (SFBMajor == UMMA::Major::MN ? CollectiveMainloop::AlignmentSFB : 1)), + (is_same_v ? 16 : 1)); + + int alignment_K = max(max((is_same_v ? 16 : 1) , + (SFAMajor == UMMA::Major::K ? CollectiveMainloop::AlignmentSFA : 1)), + max((is_same_v ? 16 : 1) , + (SFBMajor == UMMA::Major::K ? 
CollectiveMainloop::AlignmentSFB : 1))); + + int M = 1024 + alignment_M; + int N = 1024 + alignment_N; + int K = 512 + alignment_K; + EXPECT_TRUE(M % alignment_M == 0); + EXPECT_TRUE(N % alignment_N == 0); + EXPECT_TRUE(K % alignment_K == 0); + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, 1)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, 1)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, 1)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, 1)); + + layout_SFA = ScaleConfig::tile_atom_to_shape_SFA(make_shape(M, N, K, 1)); + layout_SFB = ScaleConfig::tile_atom_to_shape_SFB(make_shape(M, N, K, 1)); + + thrust::universal_vector tensor_A(M * K); + thrust::universal_vector tensor_SFA(cute::size(cute::filter_zeros(layout_SFA))); + thrust::universal_vector tensor_B(N * K); + thrust::universal_vector tensor_SFB(cute::size(cute::filter_zeros(layout_SFB))); + thrust::universal_vector tensor_C(M * N); + thrust::universal_vector tensor_D(M * N); + thrust::universal_vector tensor_ref_D(M * N); + + thrust::random::default_random_engine engine(2025); + thrust::random::uniform_int_distribution dist(-2, 2); + + std::generate(tensor_A.begin(), tensor_A.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_SFA.begin(), tensor_SFA.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_B.begin(), tensor_B.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_SFB.begin(), tensor_SFB.end(), [&] () { + return static_cast(dist(engine)); + }); + std::generate(tensor_C.begin(), tensor_C.end(), [&] () { + return static_cast(dist(engine)); + }); + + typename Gemm::Arguments arguments { + cutlass::gemm::GemmUniversalMode::kGemm, + {M, N, K, 1}, + {thrust::raw_pointer_cast(tensor_A.data()), stride_A, + thrust::raw_pointer_cast(tensor_B.data()), stride_B, + 
thrust::raw_pointer_cast(tensor_SFA.data()), layout_SFA, + thrust::raw_pointer_cast(tensor_SFB.data()), layout_SFB}, + { + {}, // epilogue.thread + thrust::raw_pointer_cast(tensor_C.data()), stride_C, + thrust::raw_pointer_cast(tensor_D.data()), stride_D + } + }; + + auto &fusion_args = arguments.epilogue.thread; + fusion_args.alpha = 1.0f; + fusion_args.beta = 1.0f; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + Gemm gemm; + + EXPECT_TRUE(gemm.can_implement(arguments) == cutlass::Status::kSuccess); + EXPECT_TRUE(gemm.initialize(arguments, workspace.get()) == cutlass::Status::kSuccess); + EXPECT_TRUE(gemm.run() == cutlass::Status::kSuccess); + EXPECT_TRUE(cudaDeviceSynchronize() == cudaSuccess); + + auto A = cute::make_tensor(thrust::raw_pointer_cast(tensor_A.data()), + cute::make_layout(cute::make_shape(M, K, 1), stride_A)); + auto B = cute::make_tensor(thrust::raw_pointer_cast(tensor_B.data()), + cute::make_layout(cute::make_shape(N, K, 1), stride_B)); + auto C = cute::make_tensor(thrust::raw_pointer_cast(tensor_C.data()), + cute::make_layout(cute::make_shape(M, N, 1), stride_C)); + auto D = cute::make_tensor(thrust::raw_pointer_cast(tensor_ref_D.data()), + cute::make_layout(cute::make_shape(M, N, 1), stride_D)); + auto SFA = cute::make_tensor(thrust::raw_pointer_cast(tensor_SFA.data()), layout_SFA); + auto SFB = cute::make_tensor(thrust::raw_pointer_cast(tensor_SFB.data()), layout_SFB); + + cutlass::reference::host::GettBlockScalingMainloopParams< + float, + decltype(A), + decltype(SFA), + decltype(B), + decltype(SFB) + > mainloop_params{A, SFA, B, SFB}; + + cutlass::reference::host::GettEpilogueParams< + float, + float, + float, + float, + decltype(C), + decltype(D) + > epilogue_params; + + epilogue_params.C = C; + epilogue_params.D = D; + epilogue_params.alpha = 1.0f; + epilogue_params.beta = 1.0f; + + // get reference result + 
cutlass::reference::host::Gemm3x(mainloop_params, epilogue_params); + + // compare_reference + bool equal = true; + for (size_t i = 0; i < tensor_ref_D.size(); ++i) { + equal &= (tensor_ref_D[i] == tensor_D[i]); + } + return equal; +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_align16_blockwise, 128x128x128_1x1x1_2x2x32_scale) { + + bool passed = groupwise_test( + Int<2>{}, Int<2>{}, Int<32>{}, false_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_128,_128,_128>{}, + Shape<_1,_1,_1>{}); + + EXPECT_TRUE(passed); +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_64x4x32_scale) { + + bool passed = groupwise_test( + Int<64>{}, Int<4>{}, Int<32>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_1sm_f32_align16_blockwise, 128x128x128_1x1x1_1x128x128_scale) { + + bool passed = groupwise_test( + Int<1>{}, Int<128>{}, Int<128>{}, false_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_128,_128,_128>{}, + Shape<_1,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_1x128x128_scale) { + + bool passed = groupwise_test( + Int<1>{}, Int<128>{}, Int<128>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + +TEST(SM100_Device_Gemm_e4m3t_e4m3n_e4m3t_tensorop_2sm_f32_align16_blockwise, 256x128x128_2x1x1_64x64x64_scale) { + + bool passed = groupwise_test( + Int<64>{}, Int<64>{}, Int<64>{}, true_type{}, + cutlass::layout::RowMajor{}, cutlass::layout::ColumnMajor{}, + 
cutlass::layout::RowMajor{}, + Shape<_256,_128,_128>{}, + Shape<_2,_1,_1>{}); + + EXPECT_TRUE(passed); + +} + + +#endif // #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu index e46934c8..584097b1 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_f16_128x256x } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_f16_256x128x } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_12 } // 2. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f16_128x128 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f16_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f16_256x128 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu index dbce7f5d..994b9a30 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f16_f8_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_12 } // 3. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x128 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f16_e4m3_256x256 // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_2 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_2 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_e4m3_128x12 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_e4m3_128x25 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_e4m3_256x12 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu index 94f76ca0..62069c68 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f4_f32_f32_f32_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_f32_f32_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_f32_f32_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_f32_f32_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_f32_f32_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e2m1_f32_void_f32_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e2m1_f32_void_f32_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e2m1_f32_void_f32_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e2m1_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu index 1896e27d..95f7cff8 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu index 4960e6a7..0f3f40ff 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f4_f8_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e2m1_e4m3_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e2m1_e4m3_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e2m1_e4m3_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e2m1_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu index 9643370a..3aa37e55 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e2m1_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e2m1_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e2m1_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu index 94b52d60..22b84502 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu index c8ab01da..190cb81a 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f16_f8_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_25 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_128x256 } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x128 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f16_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f16_e4m3_256x256 // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_2 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_e4m3_128x12 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_e4m3_128x25 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_e4m3_256x12 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_e4m3_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu index f97911be..470850f2 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f6_f32_f32_f32_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_f32_f32_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_f32_f32_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_f32_f32_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_f32_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_f32_f32_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e3m2_f32_void_f32_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e3m2_f32_void_f32_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e3m2_f32_void_f32_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e3m2_f32_void_f32_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu index ad81c5f8..904ae41e 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f6_f8_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e3m2_e4m3_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e3m2_e4m3_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e3m2_e4m3_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e3m2_e4m3_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu index 4b0ab7f8..1f72248d 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f4_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e2m1_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e2m1_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e2m1_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e2m1_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu index adf4188e..acf64f12 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/narrow_precision/sm100_sp_gemm_f8_f6_f32_f16_f16_tn.cu @@ -54,7 +54,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -124,7 +124,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -194,7 +194,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -264,7 +264,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256 } // 4. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -334,8 +334,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -345,8 +345,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_f16_f16_128x128x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -356,8 +356,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_f16_f16_128x256x } // 3. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -367,8 +367,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_f16_f16_256x128x } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_f16_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, test::gemm::device::CheckEquality::RELATIVE, @@ -379,7 +379,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_f16_f16_256x256x // 1. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -449,7 +449,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -519,7 +519,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -589,7 +589,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_25 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm { using LayoutA = cutlass::layout::RowMajor; using LayoutB = cutlass::layout::ColumnMajor; @@ -659,8 +659,8 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x128x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -670,8 +670,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e3m2_f32_void_f16_128x128 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_128x256x256_0_tnt_align32_q_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -681,8 +681,8 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e3m2_f32_void_f16_128x256 } // 3. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x128x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, @@ -692,8 +692,8 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e3m2_f32_void_f16_256x128 } // 4. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm, functional) { + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e3m2_f32_void_f16_256x256x256_0_tnt_align32_q_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, test::gemm::device::CheckEquality::RELATIVE, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu index e5e4f3ff..b92d17ae 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f16_f16_f32_f16_f16_hmma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< cutlass::arch::Sm100, cutlass::arch::OpClassSparseTensorOp, @@ -88,7 +88,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -130,7 +130,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -172,7 +172,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -214,7 +214,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x2 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -256,7 +256,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -298,7 +298,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -340,7 +340,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -382,7 +382,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x1 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -424,7 +424,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -466,7 +466,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -508,9 +508,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x64x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -521,9 +521,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_f16_f16_128x64x64_1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x128x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -534,9 +534,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_f16_f16_128x128x64 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x192x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -547,9 +547,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_f16_f16_128x192x64 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_128x256x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -560,9 +560,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_f16_f16_128x256x64 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -573,9 +573,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x64_2 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x64x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -586,9 +586,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_f16_f16_256x64x128_ } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -599,9 +599,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x64 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x128x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -612,9 +612,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_f16_f16_256x128x12 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x192x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -625,9 +625,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_f16_f16_256x192x64 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -638,9 +638,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x64 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_f16_f16_256x256x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -651,7 +651,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_f16_f16_256x256x12 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -693,7 +693,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x6 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -735,7 +735,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -777,7 +777,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -819,7 +819,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -861,7 +861,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x6 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -903,7 +903,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x6 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -945,7 +945,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -987,7 +987,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1029,7 +1029,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1071,7 +1071,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1113,9 +1113,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x64x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1126,9 +1126,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x32spgemm_f16_f16_f32_void_f16_128x64x64_ } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x128x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1139,9 +1139,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x32spgemm_f16_f16_f32_void_f16_128x128x6 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x192x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1152,9 +1152,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x32spgemm_f16_f16_f32_void_f16_128x192x6 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_128x256x64_1x1x1_0_tnn_align16_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1165,9 +1165,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x32spgemm_f16_f16_f32_void_f16_128x256x6 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1178,9 +1178,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x64_ } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x64x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1191,9 +1191,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x32spgemm_f16_f16_f32_void_f16_256x64x128 } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1204,9 +1204,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x6 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x128x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1217,9 +1217,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x32spgemm_f16_f16_f32_void_f16_256x128x1 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x192x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1230,9 +1230,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x32spgemm_f16_f16_f32_void_f16_256x192x6 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x64_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1243,9 +1243,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x6 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x32spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f16_f16_f32_void_f16_256x256x128_2x1x1_0_tnn_align16_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu index 92671e67..a6837005 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f32_f32_f32_f32_f32_tfmma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -89,7 +89,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -131,7 +131,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -173,7 +173,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -215,7 +215,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x2 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -257,7 +257,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -299,7 +299,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -341,7 +341,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -383,7 +383,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x1 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -425,7 +425,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -467,7 +467,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -509,9 +509,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x64x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -522,9 +522,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_f32_f32_128x64x32_1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x128x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -535,9 +535,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_f32_f32_128x128x32 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x192x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -548,9 +548,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_f32_f32_128x192x32 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_128x256x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_f32_f32_128x256x32 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x32_2 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x64x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_f32_f32_256x64x64_2 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x32 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x128x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -613,9 +613,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_f32_f32_256x128x64 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x192x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -626,9 +626,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_f32_f32_256x192x32 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -639,9 +639,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x32 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_f32_f32_256x256x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -652,7 +652,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_f32_f32_256x256x64 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -694,7 +694,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x6 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -736,7 +736,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -778,7 +778,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -820,7 +820,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -862,7 +862,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x6 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -904,7 +904,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x6 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -946,7 +946,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -988,7 +988,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1030,7 +1030,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1072,7 +1072,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1114,9 +1114,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x64x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1127,9 +1127,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x16spgemm_f32_f32_f32_void_f32_128x64x32_ } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x128x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1140,9 +1140,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x16spgemm_f32_f32_f32_void_f32_128x128x3 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x192x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1153,9 +1153,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x16spgemm_f32_f32_f32_void_f32_128x192x3 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_128x256x32_1x1x1_0_tnn_align8_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1166,9 +1166,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x16spgemm_f32_f32_f32_void_f32_128x256x3 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1179,9 +1179,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x32_ } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x64x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1192,9 +1192,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x16spgemm_f32_f32_f32_void_f32_256x64x64_ } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1205,9 +1205,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x3 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x128x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1218,9 +1218,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x16spgemm_f32_f32_f32_void_f32_256x128x6 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x192x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1231,9 +1231,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x16spgemm_f32_f32_f32_void_f32_256x192x3 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x32_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1244,9 +1244,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x3 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x16spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_f32_f32_f32_void_f32_256x256x64_2x1x1_0_tnn_align8_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu index 12b22760..99644205 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f16_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_f16_128x64x12 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_f16_128x128x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_f16_128x192x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -562,9 +562,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_f16_128x256x } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -575,9 +575,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x12 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -588,9 +588,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_f16_256x64x25 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -601,9 +601,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -614,9 +614,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_f16_256x128x } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -627,9 +627,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_f16_256x192x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -640,9 +640,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_f16_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -653,7 +653,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_f16_256x256x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -695,7 +695,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -737,7 +737,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -779,7 +779,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -821,7 +821,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_12 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -863,7 +863,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -947,7 +947,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -989,7 +989,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1031,7 +1031,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1073,7 +1073,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1115,9 +1115,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1128,9 +1128,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f16_128x64x1 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1141,9 +1141,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f16_128x128 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1154,9 +1154,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f16_128x192 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1167,9 +1167,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f16_128x256 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1180,9 +1180,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x1 } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1193,9 +1193,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f16_256x64x2 } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1206,9 +1206,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1219,9 +1219,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f16_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1232,9 +1232,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f16_256x192 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1245,9 +1245,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f16_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu index 43a5222a..d103086c 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f16_f8_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_12 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_25 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x64x1 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x128 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x192 } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -562,9 +562,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_128x256 } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -575,9 +575,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x1 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -588,9 +588,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x64x2 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -601,9 +601,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128 } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -614,9 +614,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -627,9 +627,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x192 } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -640,9 +640,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256 } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f16_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -653,7 +653,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f16_e4m3_256x256 } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -695,7 +695,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_12 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -737,7 +737,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -779,7 +779,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -821,7 +821,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_1 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -863,7 +863,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_25 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -905,7 +905,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_25 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -947,7 +947,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -989,7 +989,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1031,7 +1031,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1073,7 +1073,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1115,9 +1115,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_2 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1128,9 +1128,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_e4m3_128x64x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1141,9 +1141,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_e4m3_128x12 } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1154,9 +1154,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_e4m3_128x19 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1167,9 +1167,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_e4m3_128x25 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1180,9 +1180,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1193,9 +1193,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_e4m3_256x64x } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1206,9 +1206,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x12 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1219,9 +1219,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_e4m3_256x12 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1232,9 +1232,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_e4m3_256x19 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1245,9 +1245,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x25 } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_e4m3_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu index 71a54d0b..fb1160ab 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_f8_f8_f32_f32_f32_qmma.cu @@ -48,7 +48,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -90,7 +90,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -132,7 +132,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -174,7 +174,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -216,7 +216,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -258,7 +258,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -300,7 +300,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -342,7 +342,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -384,7 +384,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -426,7 +426,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -468,7 +468,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -510,9 +510,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -523,9 +523,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_f32_f32_128x64x12 } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -536,9 +536,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_f32_f32_128x128x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -549,9 +549,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_f32_f32_128x192x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_f32_f32_128x256x {512})); } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x12 } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_f32_f32_256x64x25 } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -612,9 +612,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_f32_f32_256x128x {512})); } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -625,9 +625,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_f32_f32_256x192x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -638,9 +638,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_f32_f32_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -651,7 +651,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_f32_f32_256x256x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -693,7 +693,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -735,7 +735,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -777,7 +777,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -819,7 +819,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_12 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -861,7 +861,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -903,7 +903,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -945,7 +945,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -987,7 +987,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1029,7 +1029,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1071,7 +1071,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1113,9 +1113,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_25 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1126,9 +1126,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_e4m3_e4m3_f32_void_f32_128x64x1 } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1138,9 +1138,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_e4m3_e4m3_f32_void_f32_128x128 {512})); } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1151,9 +1151,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_e4m3_e4m3_f32_void_f32_128x192 } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1164,9 +1164,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_e4m3_e4m3_f32_void_f32_128x256 } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1176,9 +1176,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x1 {512})); } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1188,9 +1188,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_e4m3_e4m3_f32_void_f32_256x64x2 {512})); } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1201,9 +1201,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128 } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1214,9 +1214,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_e4m3_e4m3_f32_void_f32_256x128 } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1227,9 +1227,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_e4m3_e4m3_f32_void_f32_256x192 } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1239,9 +1239,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256 {512})); } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_e4m3_e4m3_f32_void_f32_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu index 0ec92dd9..54af8b29 100644 --- a/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu +++ b/test/unit/gemm/device/sm100_sparse_tensorop_gemm/sm100_sp_gemm_s8_s8_s32_s8_s8_imma.cu @@ -47,7 +47,7 @@ using namespace cute; #if defined(CUTLASS_ARCH_MMA_SM100_SUPPORTED) // 1. 
-namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -89,7 +89,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -131,7 +131,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x1 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -173,7 +173,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x1 } // 4. -namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -215,7 +215,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x1 } // 5. 
-namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -257,7 +257,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -299,7 +299,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -341,7 +341,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x1 } // 8. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -383,7 +383,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x2 } // 9. 
-namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -425,7 +425,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x1 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -467,7 +467,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x1 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -509,9 +509,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x2 } // 1. -TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -522,9 +522,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_s8_s8_128x64x128_1x1x } // 2. 
-TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -535,9 +535,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_s8_s8_128x128x128_1x } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -548,9 +548,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_s8_s8_128x192x128_1x } // 4. -TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -561,9 +561,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_s8_s8_128x256x128_1x } // 5. 
-TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -574,9 +574,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x128_2x1x } //6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -587,9 +587,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_s8_s8_256x64x256_2x1x } // 7. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -600,9 +600,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x128_2x } // 8. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -613,9 +613,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_s8_s8_256x128x256_2x } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -626,9 +626,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_s8_s8_256x192x128_2x } // 10. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -639,9 +639,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x128_2x } // 11. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_s8_s8_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 1, @@ -652,7 +652,7 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_s8_s8_256x256x256_2x } // 1. -namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -694,7 +694,7 @@ namespace cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x1 } // 2. -namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -736,7 +736,7 @@ namespace cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128 } // 3. -namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -778,7 +778,7 @@ namespace cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192 } // 4. 
-namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -820,7 +820,7 @@ namespace cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256 } // 5. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -862,7 +862,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x1 } // 6. -namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -904,7 +904,7 @@ namespace cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x2 } // 7. -namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -946,7 +946,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128 } // 8. 
-namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -988,7 +988,7 @@ namespace cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128 } // 9. -namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1030,7 +1030,7 @@ namespace cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192 } // 10. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1072,7 +1072,7 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256 } // 11. -namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm { +namespace cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm { using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder< @@ -1114,9 +1114,9 @@ namespace cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256 } // 1. 
-TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x64x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1127,9 +1127,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x64x64spgemm_s8_s8_s32_void_s8_128x64x128_1x } // 2. -TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x128x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1140,9 +1140,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x128x64spgemm_s8_s8_s32_void_s8_128x128x128_ } // 3. -TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x192x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1153,9 +1153,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x192x64spgemm_s8_s8_s32_void_s8_128x192x128_ } // 4. 
-TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_128x256x128_1x1x1_0_tnn_align32_1sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1166,9 +1166,9 @@ TEST(cutlass3x_sm100_sptensorop_s128x256x64spgemm_s8_s8_s32_void_s8_128x256x128_ } // 5. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1179,9 +1179,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x128_2x } // 6. -TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x64x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1192,9 +1192,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x64x64spgemm_s8_s8_s32_void_s8_256x64x256_2x } // 7. 
-TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1205,9 +1205,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x128_ } // 8. -TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x128x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1218,9 +1218,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x128x64spgemm_s8_s8_s32_void_s8_256x128x256_ } // 9. -TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x192x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1231,9 +1231,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x192x64spgemm_s8_s8_s32_void_s8_256x192x128_ } // 10. 
-TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x128_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, @@ -1244,9 +1244,9 @@ TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x128_ } // 11. -TEST(cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) +TEST(cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm, func_check) { - namespace gemm = cutlass3x_sm100_sptensorop_s256x256x64spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm; + namespace gemm = cutlass3x_sm100_sptensorop_spgemm_s8_s8_s32_void_s8_256x256x256_2x1x1_0_tnn_align32_2sm; EXPECT_TRUE(test::gemm::device::TestSmall( 1, 0, diff --git a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu index 63956c90..f686f713 100644 --- a/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu +++ b/test/unit/gemm/device/sm90_gemm_f8_f8_f32_tensor_op_fp32.cu @@ -550,5 +550,77 @@ TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 64x128x128_tma_epilog EXPECT_TRUE(test::gemm::device::TestAll()); } +#if defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) +TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 128x56x128_tma_epilogue_fp8_fast_accum) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using TileMNK = Shape<_128,_56,_128>; + using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder< + 
cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileMNK, Shape<_1,_1,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + void, LayoutC, 4, + cutlass::half_t, LayoutC, 8, + cutlass::epilogue::TmaWarpSpecialized + >::CollectiveOp; + + using CollectiveOp = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, LayoutA, 16, + cutlass::float_e4m3_t, LayoutB, 16, + float, + TileMNK, Shape<_1,_1,_1>, + cutlass::gemm::collective::StageCountAutoCarveout, + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveOp, + EpilogueOp + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + EXPECT_TRUE(test::gemm::device::TestAll()); +} + + TEST(SM90_Device_Gemm_e4m3t_e4m3n_f32t_tensor_op_gmma_f32, 128x112x128_tma_epilogue_fp8_fast_accum) { + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using TileMNK = Shape<_128,_112,_128>; + + using EpilogueOp = typename cutlass::epilogue::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + TileMNK, Shape<_1,_1,_1>, + cutlass::epilogue::collective::EpilogueTileAuto, + float, float, + void, LayoutC, 4, + cutlass::half_t, LayoutC, 8, + cutlass::epilogue::TmaWarpSpecialized + >::CollectiveOp; + + using CollectiveOp = typename cutlass::gemm::collective::CollectiveBuilder< + cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp, + cutlass::float_e4m3_t, LayoutA, 16, + cutlass::float_e4m3_t, LayoutB, 16, + float, + TileMNK, Shape<_1,_1,_1>, + cutlass::gemm::collective::StageCountAutoCarveout, + cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum + >::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveOp, + EpilogueOp + >; + + using Gemm = 
cutlass::gemm::device::GemmUniversalAdapter; + EXPECT_TRUE(test::gemm::device::TestAll()); +} +#endif // defined(CUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED) #endif // defined(CUTLASS_ARCH_MMA_SM90_SUPPORTED)
  • 1528ZwgMp`F-ibB#ntdk^9;CEq~-bCHz$h=OXvXGH^uALejtcFJ! z^bQ?bBX4qX%f$qqE1I&~1io%EIEL*d<73ikV@(G7CQNVd<)*9|hnw^emvh&H{xbOh zYiOhnsp1d>0Vd zeldi{^0r(@33`yyA9n7WU_rK$bo5GCKLMfv<6R89@)yi&`c6c#^2mgNCb3Hs9<6%RtE7wkrI1^Ly!YN!l#N7iWwi`*)t{KWlDXn zx#GpMa)(2!d{1!*ili)du>59k*U9M)kr_F11&~tTDVSF&l2rnLln=3IJQT)CPJjk3 z#umS4(BE_>0O|!!O=)O$<{I+Ia8Y78{K+amI+p$BwcD`2jaidWNTcRNb$Sj=fNsgm zm=yO582mDuXQU(`pF<8%`8>{H1Jt&OzgUHd3y~wyd)a!gYByUM#a!=$bQ-0=moz{C=O^E(<;(L-oGf*F!!7lIwK~EzyIGmizmR2rm?+m?XF0jq;fU`=B$Mb^e z#ifZ4=+k@DX227ZuSA9oE>zaX%d3k01#n0%aw?0wVKn18+CrW{GAVS<2|r{Hr392R zJB1a`NbIPQ%?i=?XwP~szyk?b+dfbdO>%XoyG(C{jQN!QCKRZ?2PBY?c-es^r~bJd?hq&c>HTp+*pY>aM0VtvB6H4;UGz!yEoiSFoy_I?~X?$COn9sqg>ei4IwWbI)B`gsMKU&dv+ zsCd(_yHn~AY!}5c$ZncxASzwNQ0jm2+HWTT&_?o+_H`)%0{&Q?sBdSEoYA`VKV`=2U>o_ipZnxX7HOiqHPAB&r0%j^1Gai8^gP^cLFUR zhh*)&Oce>6BPOnH#(g0Eu5x%sBYss&=$*!?5RLbxrCQ1U2as;$7g?Lg(R{2e+5Ye{ z?k7{=psC;l#Im~=3&M=^jRY9oocvhIL6^l}!Weub$;Q==IQHFj-b3d~28G57_=Ccg z5>$lpHS58&P?x!BwtL^7exQrof`Sy33&ai+*GPWRSP{I}P!*clef@zFUzKx=!;M4CO>=s%s#Zg0jv zJ>v8qm>9pk#>a9gfRZ`%L&aknjtpJog2*(OiK#yxvArcUkX_>ZP%Y1H4+r>No{!)` z5#g z^cklc_TQrMS#68Fb%v`Y6xvxPkmTK)bf`C}t0y$v;TKeckxMv-a0iS_jfs{g0ByB) zJWT<7sxTPtyQ70xD;a&_&}#13l8e{BLm(bzhFT;&$t5T?9`UbkCw_)`^!y{0$iiTN z^>sY}Hx%k9D$XYT{saz_!~`laTmJDk%HzsqTPyxQV^z>y?X~GQW66}xztv^RrFUA4 z%>R{(vT?KhqfJPCSq$h{J}K*LW4|Rap#(gam^tjaHp(!O-z*IM)Kco;nVhiI?b|p9 zqNEhfFwz2b(meGK_luvV{hn1IbBGKpA7v6$#xCeLk6u`lUys7f zH@dR%;4cv!@`4G#EJIBFht6!L>-%v5vU#n(thxXw(KL8HKENcbJP z^X_eK9tDFh<(*#N+c?k-wn5_~I+gq(#_Vl@uBL?zUe(6trq5*CLE5N>n)LHCV!OvL zJ9zI)@?>e)WS9|NZ}l}Gia0p?mdMorczD^3w7hOLcf+ZN6v#~SZ#X69<>FkFzTJhy zdS#4R=PSo%@<5UaArXQcAWviv@B^ZU=Icj8t@HHOTE4~;6G?pR4r^5<#-rjNh<}SP zt@NA0Q8>v}l5gRs3HunqI>Jy7z3*9N?u>l3(`0kGG=17|m%?fI`fe?l7z_6kHDbTQ zw;Xame41*l_-Zp)4&iROB6xPppjR9h?`fjAQqB7_LIjifqt;nHWt8b%+GEz$qlM~I z=2h1s3G+@Xb8}qsw)wI&gakvwVO1^IGV z2hHuh%U_sU!FSb924EY4L{N} zZ5Cg67#ySxTu5-k=#APneL;vZn`m7@Gt&)+{)=|;aw%^iGCXKs5%WZO 
zutfqJjNfhaM2wakQ$#7Cp6IT2=^7pVqU~nsY(0cqi=|n#hm24sG9OMpaxat_3iU9-dS6Hvfrx;Y;v{S zQ#jx0@y^eBQ}{+FSbYDk-=G3l(lU@0JY8uwYz}!S8&0$-?eTMnKe#*_GQ2bSV(U$Q zFEvvUz-dj3o>YsLR0~#Fc1GO))B=$!gik9Uor|d`Dc4uontbxXNgdS{sQ-+@#3*?J-{obFp$v|)M$SnRV|+&>LZiSJNneZu2v!DQ>*)p13$tnQM$*ZpM7kF ziAF=;j}eVko^yzrwo%r_Uvi-tN;(yOh_HyBL~K@~F!l>Pe zT=0EmM*lTX$EsssQd?HvI3KN?@deIYB|LS;h;8NR;3)xU&CE+iyG9kNr%k`RcMqr$ zM9!dK)Msk$UcQ#ij1bH8JKyJY8fmBe7~^+>MXYpIhl7KHo^CDOy;@>}qK2OfJAp^3 zFPWej82(>)3LgM&*=fs}>1 zma-F;gfG+>fi>)9Q3RCnX(t6Tmy5qsxtl^6O}orS#Fx2 zr(nGP6rDzJvgK8Ir_kbP(dra)uKjqLe3d=^#roU>E=7!(Q3!PCxxC;pfjtrP$I&ogrV-L={vYS z`4EBCVN`6{qW?})(^kLgrA$-db%Ys2(T_xS*T~_xj{VaQoI)^=8~$QTh=4NJ%dWc| zX$|&)oS=wKhgK7wzw8x@MVL>!vfA6@^N#W)z{!nYZ`t=tBG2APj^h+)k(X??K*N>w zs%_uMdqCxou?srz?9&n)FT0y#J@f^skEC3xcq1%|%o;8o+P=CwVL?C1zPoD8P8WlN zm0`7;-(=6NyPK|sJFM4Bk_7q5Jhs z+y1jj(s9YH^cST9am#lX_AxhP)515f|WONb=EZX*% zgZa*uDdfU6-ba`FbPloJAZ9ia>%Y6OI3 z#g)G(KAo0BSr0X__6S+HS)=tsC)j|y3n#@}`S0ZX;vVuTB+?VHys=#<^m=e`aBI5q z*v~SgA*s?H09+w_D&rNyA_uw{Fd`D#qYsUbFl9hny_zf?(Pq@=)6-%H*+ruI)k3mJ zv-=Dhgk{4NA1;`&c)qbr>@vHMdqDLCGDFLTMTz0OxvY95AxX|@>Mb$DhI!le$%|8N zAGVS#gA((;`zhoZfGFx6Z+>C-KAJyefo~7Kh(8OmgD8T6`Vcv1WrvCP=6`;UdLp}A zxYDLVmLsh>TgLMH$-%rQ2+A%poBa$$_OatRxW46vSpnVNhyp3;M?w-=M?*(W9R~?6 zg@U>qE0FJ%T!>eRDSS9#KDL`>2^RfEsuZ!@B>;ZU>?|a65s6L(uMU zZ&bhe8BuWU#<02McPh&d5E{z*)~AJ1viL3&9wru4dH8coR*)KbJ|?e|zZMuuaI{i_ ziTbA*LYQ3Rf4czcAk0guyBz!v%b;dL1KK7M4jkBnd_fH)Kg5q9-D1LqwIGU$Sd7yC z`#=eW5-M3)+avji1+5v&z{_^dy_e(p{XLF!#5<6E+{6Ei{w2ISIC?|kCyxn{Sp?OJ zX6g>t2RiQ18q~DpLE|p(nXV$R`KZxO6^F3glDs6s&Zhx{y-&R#{a1i3F94^hk=Xg$ zy*J-&zXOC($;ffg?D$R%@-9&X*~L2qrsoMYM>k>3_s=NwUg4{Jt;k~*7X)RANfpJ? 
zMfKn1NSV-5FXg{`=XchDNA!SE6O04yZ`%Uj8L9y4I$#G(b^nO>|1g}H4g}29s;d9s zkw4@RWlr%Ag(l55O#< z1u1k>->bjB|L2=pOh76x864F2ua_a=0A;WUC%Qkw@c&g5WD_0);~akJ)St)FHw6UC zvZ&zxN$?&8P$58%nN<^*#`z!S|3}_kPXrejt^Su3_TEK^zn1G?4*-d%3P@g$ziI#P zq5pfB?@t#L1f(D-6X*W+$35}}L5yl|GJF01r3HZ2k1+XPFN3B7i$F+-KmDJObl(x7 zCV+!cD#iF>{$D8q)_G}(7ystocgc8yK*RZ; zLZVLO;YfthH2vjfxZi1KdJRmo{E(XCej2|23vv#qRZ+IXsec^#2LUTTsu9BfdKs!4 z@D9z!R{ifo<3A8}0dTM>QTOpbbdQ<|@D9@=ujc+Qg1^f~SrI5$L}d2AUj@Fl>)OA$ z_uZj`3s;)O|L)uIAEO5FD4AsZ;=j)PU4j%hLD-~!FzK(wbG)KJ9RX_wHH-1hFA(iK zFBfw!yEKs`)y`{<)f(FBcE_m0vgUV2^&UE;o?(o*$))$f_N^VsOBxMch9LpFN6|r1 zz-PH8j&E<5FxszF$WZ<+Vf+VVgAZs#nEmKX{c)f`Qn113RVwX+#sw41%VEfQB{wL6UV^KlKj{3ULU z{o^qP`-ZEOopko+i1z|wE#&9BZJN*wf2c`h#f`PRN3{Q=0?;JzDIbCcB8t)_bFw$f zn>zFJ_!t(g$ETOLT0MN@M06*=f@DurtWBn`WbyLtHWmB+##t50{H7m~xdfvIW6h-V z=eX0>syt>F573L5F(MmnjOQBFzq!xlj7pS_qGN5ODu%ojC(TS|z^55S6KsMz-;gqgz*M2 zKW72HTa2JSask!2#2F5T5yiixO)(?of#owBbdZ3*BTj|nk8HRvZ|>S0_1>D(+o`@F zwA-z><7EPQxI0g^lU|>`+cd8CnW_oBmH~5x{l0#PCE8@TXL0-!&qob+xarTB3#xw% zNJ4^XJfkIBgV)OM_KZ%e7l8`6MdWuDe^1;S1VHZB&J_=r83g-~u$uH4z znw*S3dyDHuU~v;)b0&ZPcESDsSw41#C=xP7_B>IsX%+u9aA>Luz(H~FONKqlhDn_fi+Zz6H!+T%25qHKzD?{`@+xCD|KqqDTGvdAHTpEvh(|U3QGWl<>t(@ISHrJBfGuTPejt0PG&5 zCD{!U!7WwJ_* zwA*Cy{J=HRX>bV>dxBbq7U{)Ppf3LModXGlDr$!Rp|e;E)&TwBb=*4`pi@DXW7OrU zC0Y7badG4H{bFt8`DS)|ZB41Pu~G}E8fp6PXW>I~K>`%L8C5a%53m>=rw^S|==`L$ zzOKOfHs|}N5_`KGeenH<5MQA1=a=LSQ%o%#~~PEIE3(zSR1$N?A$Sjg?dS zOyYwJ%ylVkRFDx`Q(MmT$?9}Ps+2C)_nbK16!YQjhlF+8m={XvY7w3u?iW;&?tzlA-hN)?~jJ7*7~^1hVC+W&(D^+1^fDZ&Y(V>(cSUM zoEyK*peBDkim;AUhq5PiPAUKN&HTpP3R|?@MFNN{jj1gfU}^g_)Opo1m8LoiLPxXceQtK1*>n3?Cr8B=%KEgKImVY3KdHCT_;>` zi}ptVSB7K)AIbFou~BI8xaG9pW{8(F&GzSGw^t9TzKWDIwr(oOeBQ94!UL|;WH7bk zEU712fDT}f7s$=C-LiNOz)4>YH8XTxeKBd9GG|W9c$fc7QumM|ek>C87K)i~k91>i z+Ctm&mKm*zmpC4|=TioB@*NwiXVTuwuu#Q&hxME9{P31JL8E=KRBs`AvWGzw@` z5SOC%{Q6mlq6A4Bk(nx0yS6og7 zd5Z5syrzR-_+33shs~-rcKD%{()^_htwYLQ(MP%uzrLpa~8Mc>Toj|q|sW#ye;^mAFQcbV7TDa^NwZPk++ zbDUCtmlcy2J5(q_=`mD&RZ589r{@7cBzPW~)O;$~|8^1bs*q|BH5^ 
z+K^1D`5N&YmMv5o@&D0vm0?vbT~v-BttbjAEnU*x-QA6(bayKuN(s_%kZzEcMo~JX zrKP(|K;WC><=*dJ#UDP;5qQr#v1jkK*IqN@F#b?xqXlK3@ICoEY%cXMDM=7Ab7vGS zpF!51xpnI7P6%21@EiKr+>HM8V-q**)!CWbC|9Fc6<<%$FYpdjLY9tOj3=E^9MCof z-G=Qnf=*S&lO&$kR~}S+xLFtmb}cA_CDitMgX(RS_Ijy>Olv09S)5F!8i0)_ZKe9WnveVl-u<#Q+0JeT&AlBj2~cO)($!s%EGd;C(f^L zBm3)5F?Zf1i(04d#M=ve*!Aj+hja00iLsFlwIOsFM^f!yGz%;Y`OU+x@*aH^e#-C` zo$~#AcLHm-Vygph&G=K}PHe-+p`2)@P=AXtGrDRUGXzXRdO^_i7Xs6l`k_y$(Km1R z!GNGiXXGL-$bKYQ zZBbzIU}TbD)pi*ib8P`%i7|%xl?f9OEOfTG)WO%Zs5CA;r`u;ih%->b_j2zyJlxfM z!u5_Y!lyH4GEd4JKl@uo$%mxd{yZmd@K-{JUz$^Aw3xmOO*uj#{V3_Q+ z$H6X2n$1y;;^g2$m7go+OD6wNA{+w?E1o}1jfs{(yUfVsVY8|T{$E1paD-L=u8#Yv z=ovcoqLaR3Lx(jDetIoku0~_OC1ioOkvd+gQ`W!nKPTB;kH!ps}nJl@+g`wz`-D zsyKfQA*>T)XgaJqb|XYRS!-LR^+nfnbPfwUEm-coT`gTCD;jeAmLHs;I*vhTp{eWv zd&*XYkcQCvD4DD9qx9u2epR|OI|wbPpjN!dU-^LI?>mKfu~}>rWUISugacO(c8UMs zp?<;kH8;cX^1XlCB&@NK7*9FM$=}6(qLC+ogBXGX&uSU_tv1rg-?GKnQ2wV5^jF^q z>AC&36<=+azQ7Xlz`_V(vl1NuC%5*YRfMq;VhqomkK3FV$HLWTHM>2bwSLgCrx$RVD<^Y4R!QPkc>-4aXj&d$TU(oRBKJCH9D^;D$(2cM@h#1vPfWkN8^9H;#37w z^~Ouf_JC8^ZZxH%f(D2y0$0&l%8G^9{PC^G2a-neyZ>@Cm)!^P)}Ma>HAwC;WdZh+ ztq5;`)ebDUl?Ukj$VR@^;g;zc=0E(~Y5#kq6EL4XiRH1d?nV!f0W&Et2~$^s{ljxK zoz}66c{*a0tdlQ>ZgEskbts+ysVPoA^RVAf7wudeh50^q5r`RX12ho zdw`kW%%6s0=H z8r*k>^EWrU4#6TBV{;DhMg|gUTvH9_T1ge6w?{+8lLhv9S9Th9bYLl`g*COW^Dt8) z7?v-QO;4Y>I|;gDp=%|0C0vyQ={rJLux`=b53{b!=zAhi&)gyreSomKo`$Xa;`o5C zR;v-|_q}yLN1Pv)WRz{yJCL6)*9S$j)KQc%CS$@-IyL>vx4J`Y9-7SilULRv{1$<- zG=Xhs_7M2lO4F^5N#sEdi__H+YvYvTAKosXYyM(NezjfENAiZ<8;W2TD+#6u8#2bS zvi`O}eI9C%bKjaH{?CovzZnDZ-JDWa_RG^7McXnvd0@Jyl_rXO20vKI=GEufx0cU` z$2wke{2tU`M!W!oLsf;kq(3!?!3R~lhdrQOO`LuLhzEIr`wizCwNvXWrvd-0{(pbK zLwcjby>HTc^TuaVnCv-v*4|d|=QdgJUgyz>DN}D7zSEPGNtv8%$*bnamwyamJTg9f zFx<&F0KJ|^&i|GV@*I6N;BzzV9BShnTHKgEdmviAVznkhF-P9?uzaLK%mC2 zUaX^6$OaA)AF0d5sv?(n4y9E-iJu{%1KAYp&TxMEq0*{ z*!RoJ=-Ud^Guj-N+p1gjvtQ-f;2>5gp8My`>l-$8SWdy0r;lSfy?c&S4s`t^kUX-g zNAjc@N4*Ws)G!0L&eX-ea-OgGbwj}Eg&Hxiy(ESWKi{vbFAv^N#Df~NK>mv9H`9jw 
z&6mukai1TWFjwV3w^9hz3438t2+9&nbYNdW25ese;#H=B#7#|DU@8jy&sz__5AX;6 z`!y-tMZcd~14F5m2nJXf8+jv#6i@l&0AKn;T!&wr?%=laZ|6$<^!co)p9(6hIL99* z{oIAO3lTzDr8TIrA+Ntr`tJ?q|H1}*5C+s2;3yyZquNvA;>e#_E@|t&3OJUtm^l5X zzV%mEfW%DxkG)8uYd-%A2aD3szd*z`a2det$Jvn{`KL_&Z-b4*qCi=b`D+VzoFn=62iYLz(Z7TLiSAc>Pc~dds5#J z{)K%Fa}a0gefHGa*Mo{7Z+9Y-)$}6otKQJ^Se~UI)_Antla~*_Hsu^xDWNDhm`b;6 zoR=22MH=A`BXL6O0hHd>H>)$<{O{OUG$o0Eje!7VeeZ7tyN zn7T1=Iwwh>(-)DJf9s{}!7EON*i2O?r& zu~u|Ue&IYUN;!7b#x}5eXi+@@Z&tR|;E?TjZ+IHOUcYOA8h{;OsUco&%U&8^PR%ETq#Xt9J8tYnza8UlS=}J#g zopy0{wjUWTIusSBE8J1}{pdf~K<6O|7gg|#lTvYje!x2uz@0v{b-OH-yZ|krB>;fmEzl{R#uZ0=kwD&2{Zy)R}B{MgGzpJ6_xp! z<0{43&vR)2p-mKFAa65=!xymc&X9fgDwuxFPZr^PsPSueJ@oTO*&st@Y_MF1Y{^GQ zsfg-}eNL?p<9oNrNA*Om99>_2H@~Wj?-lFmNBA+EeXkg$>^S1UWeCXNL|3!$k z+uhnaSNi%UOusfZFDT5>KZQ?#)w~szv}C}kWhrhWJBnRdVgf&N$KR%Axv@!Xi?3_y zyu~6DHMw_5VnIRDtctgfEaq zL|!^#C~)?_(S?Xdk);vVA3d%|dX!3BPeFl4(*K(5h3_+lU~PGLgcTN>?uG8}ABhsVE_0c85|KDEsO$;K7wPq)uMNog;5&;f={r2`NK_1zo@+D*a zyI0;Yhrv&nisFBS6xEh~>KIFK_wL=>cz8Ozshh8zPNCNV+#8Oh)%$!9LT)yN^acwt zxOrT4=N|Z(UlSI6l#Pe_iwQ0LM!I!y*WfI+I7WcGd%N(C@ z^HNb!K^WlIG2Y7e-V|xE4dNFU7cYpmYp>GidvxVB>fj!^Z8^BIsOwu7Y>9cr^>(mJA8i$y zK2oL#85kJcsAb2Syp4f!;EL?2&}5#9U$t^Pt7Iheho<@-!FSG7D&GWm zhTrb%>svUR^gn~mgbBQ7y?prieiyfh!M~aaYt8@vdw!2Tik;+18>(RtFk5E|#y}m2 z>Z|?1x``~)Sl+pREt3Dt=sBtWp>5ZD9S89&A08iPaNTPU>UT@ntZ-b&uLNIp3nKU# ze5vo~CnDr#es8JuQLX^@Lw3fOJP=jXgZDDM_%5OX|7qF(+xSDJscRBgjt7^?`hbCffmbVKHt}Nt3LN|og(h;J+Ygh!OU!;D&=WcI|3fFglItJ( z%UKVV^>Sl~rODG_q}14Kj#5+|e$T6x>19hH&KLYci#=tUEPG@M6YCO-d3c7wm`-7A z55c;**D~Ie^ryzAAjR>_L}?xTnD_HA!@0Zah72`J!9YSr?&~W2-1<2+u1@$5kHSo( zk~WD%3tN^MlWKlan&CF+l=tIr4gv0>W~DNobo_;RdP(1m4^uU^y zf`Z}^?4;f&gm6gQWc;c4#r||i%XrToOrMBag&hVMm)*G9FoD~Lf27a4lvOE|sta+; zp#ko1f-K@K%7Hj;%?6LY{vRG|qC@C{ZcFrkp(AJ zH1{CIbywT*XED|6gEi)JXS+hMUj0tc`M!Gz_MhHW3o()Vo*~~`+e?|+Bb%x#=w*Li zwiyu-@!=6;+j`hom=+la>KkSpS#XQ*pS%RPX&063!P?(i01mc$IGC7oN={VVg5b4H zQ4roe)GrjT#b>`n6(czO$8zui&?5*w=3Du2wU6TvRg?oNmbFLs5}ghYBjXAF;E4DM{ZPOJDCETQ6VP{x 
zUW3zu2k)1ogLOQ{w43niR{QevqeL*HwY6@(mQl)Cx@|(%2V2z-XdXT9f`pFYTCu)i z|HI?~{{qGM+fX~;5fDU##l&LS&`4m8XIls^g<_CNk_4>pX0~uzB46JD$gR!kD82BF z*o!2N$8vn9b@Nm|YBI#5hgWu%uec-kd#&$i22r-h$!Vhg#W}@b+4P&+-KhS7f!5)n zp|qx1h(QJk3Xg95X*in1-^wk>r(*#806a5jks+KO<>0F!Ml!qQRvQ8>p=)rRkVf25 zS}8H7Hx{vUJpEPbL-e_Oed5TnZ@%uvIsV5X{51v;0pE-#gxtoxTdxIqdJ~!EF5m@0 zoZqtu=Xz~3*Zrr^g7Hp>uud#a7=O}~{Vt->!#lTu6?s#AI}qeaAKw=Tc996Pi>A`1 zz84jxjz(hL-UuBHwCEl(F~Y%=!(|~-h`-Ijwg+69L^E)U#TSzkTLggwRuO_ceC~0` zmeiVC&|y~K9Ax#mjH>@I?bjd;6c}!nDgG|En6w`TxCaWu6Z-n5ptzo)B8mpMliWPp z?>5u|4pidX=7tvIJ5lRq-g{FkBsP`HCilgzSOXYOA>&xGk%g$_53Lh3Qi*QE-Gceb zwQsa==|9Gr*E%%bWt8L9c4;Bf6zlWOd4!u?m`JkEhf=9f z3-5hr-6qLY{A1RotYNbUX$FvPZNTlz%zG{t@`pP+CvtQ(eb>KOF1ulO4cj%*- zmGSScBJ^v`jKMH#clcBfY8>rg{$Rl?Wy|5wZ!#20%Wp>$f+UeD!1;&DD3 zNvfucjIz5rLjFTJ!Rj~X@7st1!x!&azvxgTG4d~iNlU0qvc)BYk*Wd5*w4Rod9JcL9t>60=I zFx6k91wSD&Xf^63q_{24x<14K;7pQ3daMORb4$lFJ<`UZJ7Eey6(0!I*5T)QUVEdp zRWa6BDxDiGx#RtuC>1Tx_3ezZy|0NP5<@-BNL2sT}#p6`|ub@cl{DairSv}S&^)M zcS+E)=Ch0TU!=yOAX&d63$xYKAO9#(>NF!fc|Jdpw3tZ@K_bzIE55h-fb_NmF4?!L zmzju3O)+kB^NF98WHz{zmS07(sOh#t8~u%ZGX0HidbKMz^50AtF3CzdTgjn>IE|S$ zE(FPZzHJfP*ICitOD9#)z8nfOr`P|a5B7y4yOZ4KLqNCFdjav;bCR1OzNA6 zPH=ktvYQhpV21;E01SG_9%3DJ_MwmA`|06y>~>Ub-&}$O>!#iC21qZE${X&%5v^AB6{i0o~BOKG>^V6dY zy$bEht)mqhqR2)^r*@sXIxzcG!FR|@yu$qHzlAL@88sIuWW|now$ZfjjU{HyAp>ju zwTlp$Ch7hN!FqlV>ZED90Qa8pz5zGChzN{s-0|l0O1vs*dn}5TAHw^vXMadHt3Y1Ai) zsA?BRf(KrcAx}yz4hqOsqi94VSH7>fw@0wlh8Hl;(9Q{VJ&6CBm8OMV1XY5LP_=iG zQiFd~;-%S&)lR&654c%bRmsYW^iL`Uop8+Z6;e)>73K=p)D~iiYe|i3(C$QLzk?^2 zT&J!pxc+<3m7Anfb>-#dg&iFokFSv$jAOwA1mr7{V+AomJpjSI(n|IQEXDgFpl@4P zmv+;I#k_dPoQQ?*zWLpjr7KLd3H36i*~g_AMEza`e+?cY*F+uQ4qW|n5u*q!wqT*7 z`WHSIcB+f}W`{O2a(<;YQVT8~-JOb7V3F1r8CNI zQI7ld-6sYim{nVTJ>50vV+`+dJ?kJn-WWW(qP+Lnf_80;OAqqSPW7=G#=M}a%lOT=gUt3WJ?CVUAdd}ivOf!3fIIS< z+i|mb#Z-0cQPxdo3ohq)KFfF>LW~H=Dt&9C&xOpOUp%E^cJ!Azk?Z{*dSs@n-E?jY zby%2-M>`ab)l~f|s$j)ew7*)9T-7!X9tHsIWNlD2qBkj)8olOqwx|>QP zZF|+I987gqUb5?>Bvm$35h1tQ&Yzc4GlX#o6639@v@oiMSNsr 
z-H4Td3H%b76|XK$a1@z?`)y4TtvrkZ*(C-1#*MYE86C#jBd(g$2PZ?6;rBRr8X{nk zK33_ZeTJ6$;*3J222HU+f*gb)*SfXy-W39)-UPQ@k9mpg6eUl*AzG)<6K?~K4=x+K zq!0Suy(qM>GkGVSN+;R=usc&eKoj4mH%X&dfj~u`t5U=)o!*Ssv2vbLqn*BI$hQ-_ zryWYS)YcO{Q*@K`Obyp=VpSZZEWi4+I#PhNvWM@1j5oS5#iP^TEg7VZy&P=r9CUzE z+;iR+ZW=)tgiDqZ^ZqXGH`Q#>bL4(6cgW5^!~5ic-t^1f!maaq55^h^WCb@Frb72| zZ^aq`E~eB^zhRea5IEY)lwp7gnEcym4AyXdeK5SQZouH8=0rNH7~*2oc2m(v_kFTj z)!zQ(T%%AE#r!$jY*mM~6O{)S)*9XLVq+m18ACY%X()NzLouP7B-1`+q7Ix+rxRjK ztnTI<;av^~#i(i4(yut&K2h(rvYBc$(pu#>#<&RkHAnv7f4sjqO1mUYe zt4f^JKYWn8dMMsbh2TK@qr044CD+;S5FIWSL7D^|U?~j_`WwJD=%HnF8~uQ)J!aK& zH!nbdGxQ^HE{OWv%yl2pTsTLFrfj~8+KiEx+KkC4X7B-Hr;=H}M$a)v{WXHE_x9Q@ z9OZ5SD=LTD@3s0(drUZ?SD+ul2!s>m;d=D|fDgPmzH&j}sJ`t8^3=l{{N!rXH`4O1 zd5Kn9p7s>tFy@%xD;ehEZ1J90M!_SJGu|V>Hd4Nejh$^>vc|q)0n7ONzxTjjcoGUO z9$KW1R#-3fDb~3gng%-O_L}P8h!5^;X+Lb%NghIfzWN~E>}YE;ns%?Xv#OJK2uC^HI=|lm=3`ydn&B^FQwc;0?r2D%87i~7=;cdly>#>bJt#R>F zj?$mA2qZq6Bx6c5+Rv@`)nu8^Jt$jvQ&d(}Z7jEX^cg$KeQiqA>>(eNU*+An=?a{z zixa}@hTlT_w^Cn$e)(t5pLdFhh={!BzJ2|B1z_2|&s@Fi9ZXPQZgxI>FHt0!YcDyg-wUl=MM2P)6y|s zZ^8T9mn6V|l!}{TG+pKu!4@eF=6I4Y6#&d+hw+?hxf+s-#_sy2 zC|?Wirg(uNcia4|CSBY*${$UWG?rRpauhZ&;_;)hLTV`u7W;av+c(Z?y2i-j1$F6@ z#O|$l@n$3^ja_tOmK^IAKMJ58idu5Oc1)TutAMTWM~&0OVwGM*J^WxQ>tdBK1mw#1#dF`@OIdq=%jq8C}uXlVuRZF zVV!C2tq;Eqv9E73WQOM>$?h_5H<6)%0r&Zl1J_ZjI`lT^GE-02E*~6IfcyM7_g;&3 zt*6B1bUhyZXE~~+%JsSM*=~Y5g{siHA#Y#PZ)c9+&{3}HjccFT8C85^E<(*CtCWq~ z4R*iC`h4TPBokH&%W)a>?6K)gIj1K5N=iz02VBPS266<~Gul(^kKTlY|L(oi@6fkZ z+oeNEWu5_^nE;;qT;w`f3gdR2B&L3m=igQxJlM<4sta(ZoA+xJGdsV%-|6TmbboHK z5IxRToR~0Icy!WeY_QiD5S2 z?RGa`9PcV8?AJJe(^K_gLRU#z8_qbV*tsxE^K3UhZOYg`?UW2Y-+3$Lp;mRug2$~? 
zTCd!cX$LMt?Xq%TT`F7p$P-DTVXqaN_Hy6Ev50uJ6JgVOb*cEY4lk0*OVmZ~^Y`DU zfxixNcXh|$>)+Kf5nn+bYR%18XC^z!ql`95hR;Gq?K^Osw4`>-L%E?2_R}{B@Q$aF z#~G`=RH|KB8K)>!rqZT&o1vOvg?pcpxF6|BZxVy7-jTaA_GNT5RiQhtBHF>Xa~?6j z+2Yn)Z3GH@!Moyz%93tW-2)&^eodUBIp6Ibd9orcTWn?ds(*Ln_$ znT}3wi^tlkwd6V9ALy*ccQE8Vw@m3iF6F)`-SJC+!hfx_jkMHZ?_848e=7e;4Wh^9?M2HgufxN?6C%+ao)9%lj+R*8${3b25^d5C7RbrZcZMV)8J010?==1_AIZ^Lu&Nhk<0;y%i??KjqR9R^?Mnz$7^>Q zh8*jd{HWR)0D|it|G@2(ujgqo&Aen7tiC!0A&nN6j^#+qkdD$-%#fvSSITlqTFp(U z38vM_D!7wS{GK~;&KMb`^e~@4IU!FLYsg|~Z2vVQ`Z(9#GTP}xt%sQG#yFYBNDoYc zfHpUWN-_OiD0eU|KU0f1IiXE3yJFx*2e1z(j~wCRnhnJ>z1jq-p3`zAyiO5!s@rk2 zE^w_}Z@7)kkL`|3f;rEVGmTmBn3Z@ zI&WG6j(G-H%9(gGiYNLcc*aJjk*XEk-_zC3UiO$8F_oA5e1s{mV{>SVLw|q9>2Bc! z|4TI+npC$}oMm1qYh{<9Y_mrB^+wP@L*(1H`c*|2fHeuL5T-d6X7aIehRdT@z1P-% z1(4E%sVuOe5DP$q>fPD?j*VUp%ALtgJDEYVVHF`(3S)K;Q=((a^d-!=RQc>~vrS#5 z(FJI9LX9Ww3iOLb&h}ea8ua3)!zBPLcEl*jBx_u3;EWiFj+_L3wpMLb4JJ|XjpOjJ zmWV$z%b%2KO{5%DvRGmFN?u_|M&~}=V;HSq)D9CZsNnu&QK2c`eJfCNb1CfR>XOn~ z(nA(%gmO{C+}^FJ22!S4w!t&S4|8*ODNZZjBKw$v$Dg60Kvv>Sx;ghgGrQ^a!I@I76^i0LmeMBdV?ucG@**G_Ov2XtEv4Cka@l2Zy2R zoHDH44-D@g+466FJ{6v9TPR{J)_3n zR3i46qkTsbS!|XS&`_{oE0N3_l`~$}BLfPP^duT^BmX74d=ZP$VcDE3721rz9 zUUMb%K~ZD!DML?WomvO2aU4!(b4xj(TIKlAcMjjN+Hp@J$A>Z4w#&mq+LYcAqLLYkX}Abt#G)+9%=kGcQ!)sxxL+Uh8{>_G%st z(TzfXE8)1*d5m94XRzCaHmEy#zn9mBN|%&Z1u`hD#J10}$<{5MW5wVx@;IU*1?>xN z@#Ob~$hl}WtHj0hd=N1x0l%zwqdwd@NVwpv$#7+Q4Shg^GOezzYwy?3- za%Wf3Ed8wo5O8-Ra%!9Y#zLNiH@4#4=-DV{a$vdc=;Pw7+^fSwP`1IB?GpjA(ic=O z2hB(KVr_B1$MZU{O2luPNyM{0=Z-!!lSpC@o~W^tvDU4pVLLz8^1e8D^C^j)Ci~VF zGgo~sd-Q-6hbL5R&nye;}9S#bAH#X$x zc`A`l%3QpIFLO(r@V|BqGxKj5py#q5`_KWmtJ;+Q6A!G8^$YBng^l5ERJBb_eWep~ z?kv!iNz!atbJFS@-#9>Z+nlVXy*^c`i#u#(?Qi7yI7n%Yakbp(1N`XWu~GNCzfjhaq-4lwe2F-N%)d-f>(~G-%A8gRQG($Bh=&)!F@0+^#a@sjBb=VGM=U};& zH83wTOl=y+;meUmst9Tn=RVU=VYg5_B|b5YA3T&%YR(-H*7B(nhe{^*D0^y0P{mCg z4GA{$QU&uZch2GS94MW6%QupjYmS#lk(z!OK^xf&_3f=;jmVjsHj|Y}g|b0W9DdPM zkSN1Q`Z*m=;P?)L-MuG)+GvXVizzeJ;GDknQ3KK6B@FXBPM8{1)#J32f>>^j#r{W6 
zjm&>=lTu3nPcHMh=3$15ulAH;bmr(cTea+W6F#56THCvJN_L)c_oV{*?auJHwm`P- zhbJJsNmj<9O2#C)(Gq;gwl~I3lk41>#98!ww?*$z=q!>V)JR3B}a$DAY@ z?0p)JP)h;;UXk>H@HAJIbz6tsd8`IhHNX_0N!$*0j=DvWzt?T__*x zra!;r>#CiPBTXzX?ginI%aCOpZj{P7q4p{>sI;zZ%6$I>y7JR6TeW3-7Y+4RRgJZb zri)ZR0BY64yy1fZKcx7P;()$83(s>cVDwd>oyqks?|A-tHLDhy7=V5g@J^=R!)VA> zneKbFPS9!%M$=v^x0|M1mI-?$!ZgW6qm!|6L@YIZ_VZa3pvJoFQ_ppJe>v!v9{Y|U zT-GJ!3Z4t{6!Rs(mt8|SxYL!sn;7u$-l`pamIuKBD4#m+$%XgoR_#5d)SdiMXmc5Y zS?F3Q^jyku@#WLoHtQ!9p5+iW|xK$B*PVx^r(6n>NGRlykhab z;h!(@x--Qioxk<7EH{b?)-eXseYkM%D7`h&ddcn@alrXpzcD$0qr+wxx30<4xnic0 z3Z?owV7@BcB$TM8j^DVpZZjXhIyf^wMO&&bkJJ5b{aC8a1GAubB7>qbph8Y@lVB1+Pn9%=%W%yua=U#$$h?QcgP{fG%+1oJ9 z-jU!$&0#oNuEoYyi|R!~2H_C&y-V-lVF}iDW4?$L*ACs;_ksUQoPM zp!X>I)*8WZ9$OJ-j?!e&5nw7*vyQ|I;wRp7ekz;Z6^gO0W7hmJIbJ0^$$Jc3V5p$g zsrJHdP0y6cYI2#C;!p7POPIpP{Bj!RVQXu9oRgD-^acDDC=<4y0#(Q^;Ej(O>>b?M zoGAO!UN$P#`N4|=;IJFd6~xaZks0TCRs zqiI8<<$=pVF5XmW-IvR_E_1-tVaLNOUiN{B)FziItHqi1>BUG&Cn%SgITMu3V8ck( z(V65;m$_Br2dc$y2hO^kl>@-ps?EULNWL6gAu-#%`{)kIhY!IdBCF-QmSGc->HHF{ zOR8hJXXChR#Vaf>ccXZ?Q>Z?2mzxF5nFto*U>*!--*mTPa10xNB~tiax4~AH*ZL&k zNvyfmiFkbA2KrprjfqAVIjo!Jv{XUnnYpI83B_?gi2*ON{3O7L;!c(JcVO(gPY&-o zpZdj|e7)xhT>kOdNDB1+V~ZkBK=qieC$!McC53ZSUTB}py)GGwHMPIzJ!xw5$`KX* z`7hCnQ2)8FZmO=dbcooRisjdvLa;i@V^r6DY(1cwxAdg8=CY2Tr;+bp=%xqSM8NJb zW;pX^M^f9R*DEg4n)Um**VUPzR?e=ihY9jH328K%pH%H1XucQgH}-xEt~Ah-UL^*4 zl!EF_z}9yFwtl;CrLVRCN`&K?qfxLCt+Tv(KHX9Xk(1nfAUnwZmb!TB^V=eIALbOT?}=% zS^(4_PRr=McLtVhlftGW9ODb|mK+-iJ;QdQ%rt3kQW+tC zX$E-0xDV%bE{7)J0NcRy*lqyt@!V`--ddKPzsz50T&&ov7xkAo@;B5h)2u~l4+Gjj!*oW9L7^{y?M7KW@y7AWwlgcc4@c=YkYL`T$0JpD)c zE6?!|!cDZqp&!%i4fDRRD$I(Z>LefQ44AgNM3M=alnsnyWjiegj7@uBs&SxbdX=m! 
z)O50(@aL5?@;)8HKKh*STkm5)cc&svf#7Xh1*V6@so)q>^Sbk+W>DhAoMw!Z83C0B zgY7|yD<-%>J2w)chndTI&D~SiX?OC?p$bcEdL(2^?z}&K!M(osnP1_;d_!Tv+A=Gf zAR}*Vv*8-9Hg)k6xaBsPMoD@3?M4g5fH#Rtp;2BI*xda`-7%Aymd%HiP}v;WB5nG2 z(LX%5wS=O|AC!rMToG+<9qnt4ZCcJn>tN2PTjEqbR~264t&qrqq4C40e)O>iwB3VC zd4vk})S`uLi3gdWjx745(#zVBs)P=E|1cLfZ{=I4%V=fLNB;k$W+3{z3h-0g-Tzza zhAL&y0S|=Bx)2Y2?|0g;IDkPMBO4{-(4wyEOO>I7;3O)uz;dEirMkYqIxdsxoPAqV zi0kawCOfQcU$%nfbmWI^T%BrJ^}yVCpTfPhWR6m;p0nNboM%kYsGggd5EvU9No93I zBAKX+L@r(v)Y@qqlR2r@OmB`P4=W+g<|d_doIR+p59Jh(R;>^DXnCx?u=8dK03w!K zXdyO_q7#?tokG=6KI7~5rZk>C!qmheFduu{3QxIf!lTtSNC;@ui0HYRMEqjS@dm{j zZ!Tn%{`0(+;?~3VuFWS|tp$U1`O20juUA}aA+41Zg>L%hfGf?=Kh|`XF#$rUL8YxD66rKVnd~P2pR6o*2`ax5J}7l}Tl-6kHgT*hla# zl*~yib?V&d)qH2W7fW{0@dtUgLRHlC%;WhbAcuTY#)mhgJr3{^x7}ZrZsXjPbZdvo)3H*GTfZVFUh1Ef<<>pYrzi)Whp&$(K7#mD zEjD|EnkL`t5$-zV54E?QW9&q2I}!9=VW0f|r4yCG)W{(5WlW2Ui+A(WUY(Csl38D{ zX$waaKjdK)>am;|v_XugV_MF#s=ea*2wp@~pM^NY{LM`t9&mGiNY-LA0=dp;jsCM0 zH&nGeYIF}-fI`Cf(}f-mC8Zv}Z1=!a<+lHlLv5i@trgwOGN8O!4$_HsmG4eWGca+B z=WsZ8jP3fVA+tDh$;w5qy>I0By_nOeQZOp)agQL-QvFxl2odvr z&TxH6qf7}NgPt7BJ4ZG=P6ZGEb380jbvPTf$9~h5O2;94GGMWqA?Gx^)J$uz;y4P} z-E1q~gc1d4cYN#0`HmBFjZ@K4T7BeUd_iT;O74fBe%U>f?WM#1aJn=;pWS}#!K;N= z5?*N4FICW^U=kElXqB^hfn?FS^Qy-vw@raCeuXoa?oxarsl(w!phX!q|2>+AXI;6~ z%Cowp+Z*O4D2wZLALy(nxng>4@5-)Lq($S11Da@kEN_M#b6hnNwJ)1xZjd&OQZbVS zw{-w+q*xo&++IGF&G*jEn=WMjweOc5CQmCz07dYP?Q&Sy{8_Jpn-iMlt~c?C@9moY zT5&jLGfE^8&71!CbI&~{W3TrW<2w@?xK3+he2f{J)Cm@ninq=P%ARVtgh(?|+E9OsV!;6!C(X@`N6Sjv*?I zyzB@c?*rkKbZ(lKeAQPrJqu+8)ZvLV^x91Cp0q82DwkwT$oDVG1TOPa2V@N{M`38L zQnm7I9_M$$(!}*FUyB_{x`vA;NbL=y2Q)nWJt)uR;cn;)}{7oq^ zyp^4O{8bmGLlau7dsJnkJ2GLsa$>b`9E;isbW)C__;Qfk@xGP{8e*U1a68R+bNdG| zQMcWr9+0AWaXirXf9m+r6ePTOP4<>W6@08bLUli;2U{3uvanb|y(_kF2IIa4BtEt-?Kt|y)hyVU2UVdFL{Ika zgE|Les*2mB+a>A5qMi7&h6=0J)=Wx*i)$)4Z&q=Z5A29;*`$0vC6#G0O~#+r!BeqI zJ<;elmor6vjlbtoW8vQN3!l&5v<2&Zt=Q@gx?~55Dcxk6n~0(6w*W#`CB(0J^|`o-@kyQ=vc&U4Rf%Jj zeCd8LdF^ux3vB!b^^EfJhiBD&^t#7CvajD}lv@(*1f@lZYSEMFl@pu1=wy(hv z06w!*+vrLra&Ch-gPV!9A+mQ1 
zfD=#OrY5SS_JKzAZPjH}J35tSbIuGzbyE-}Q|^lZM)hEt^uc7)QhP;fcjg`8?m&O3 zNFz#a*0Ta==tAt)(fuA%7oxRc1Wcpxr6{K*rzMGOQ9XLs{IhFr>DD+x*j$j!i5jBk z=Bv|-xhPmFGik*Ym<@$5Qv&@&q*={_4w-?t?0fsim-g13*7@L{KcSpF|=1)~%N| zx+n(|N6bZf$5Mf86zlu&pYxdCxKFm2A2vEZ9s~PkhE#I|X_41>0I*Zqi5gdno+g$v zZ>hwq1v*b@bvE{?S_iEgb!U;A=N^wIXn|0Z$#`BeqR-06&Tz@2{v|UW^FVa2_GYce zf#tD#i+rQAU28qJ|E zy%ed0Kkm(ur-PMD=k}d0ip)S+I!o4Le~?VMFRSJQtzBdWG(pju7lX*bN!Bz(>&2vm znp5p!j_Buwu?49F50&iZJSBt4p_|s4kMmoOUbwib434f0phr^J-Eon*XnTLbarWGE z_rrLN!>CtYo(>LE6q*|O{rsp9vtN-Pp*{@=VkNh4pTO$$fv%eT`SNo2q;o$~CV0T7 z`EXM()eJhgZ!51biccE{0b$blV8Li&HNKuNr>!g=x2X#gCz|pk2CM!MgTx#5l1Zgn zr#he`zmKoWX+KbW=kDG0VW&^%yoH7`&061-d8P0~SG!H#WQS-avJL0Gq#Fu#P_C$75h!P?!Z`$qH`w!QC6xVjtOexsYtT3 z>~WUl!a3aOYF#XJ4?PFzXrad%&0i;mMHue?YUar5pmTJ8@4$sjJ(L3aPxz!eU?Edh z`uD92gqurswG$`2Qjz%J-Fb_Pm#@w^y5Rn}ii_cDc;9Mx|7C!DAM)jA#VF@sKwvKM z>oSQ_EozGPRNZGdP=JfhB<3|OpX#d@wjQHVpRnw#AmW~<-kU6wTz5JL?M}P% z(r)XSWWS2#um)q$YKp!^YmM?p`ql4n9Phk?pFAl|kUI;Le#4$iA^{b=1FG5-tID7K zS6?AgNfe*+j4I{gA~ZcH^_P5lKD+y)Gl04A>C8I5fuZ37Of!%}&=B<^b3poDU0wyb z%xlGGn55nY8^L!-Uw;!0CUNUvj%3ui(SCRLS)KemI!sVqCY3?(+wiKwY{!ZM>`pQC z;gfskOQQ{70x5tAs;Rn}COWu27Mayz`WH1KyC5xkj9zq9<->%awIj1x1a?(@`~U=T z;>kVf+PmALOMkJLdIL!(J3F#~fgh6wB*{OIA0Y-qj`#pAe861@KS}ZvhW=F**Wg3~j zDYfg>)X>tOHlAxYB=yTV&~Po48h^k-gz?m)+%5$s#ZXiisgtfzny^*P_X?>~bJ<6E zNMHY$IRD~Jh$*PfLYRCt|_QxN6 z=0^-q7eKvSw7L36HC0jJ_=2I>3_grV{WpB};|D}KojkPs^iU!ddTRNA!yCpj?wRB~ zyQ%VZM$)79jNh%^dW}5r``L&R<1x=%BV{X_3ry8M=RKgUp<3tlk87O(n;|%E`bs_h zSrC%rb=7hhg+8VkI~sy$Fgf`?PN&*eM9{lcH7$$x8b}^^>~T`4`cc+6`i7qq4kek~ zd>*2ub(!(ud(aLF6d5KqK8X`5Ic2S1AaIEx9&h+EnVnF!@-4n{VX6!F#mP2NJ=7IR zl+6aKcuIY6Cb!|Mvk!wp2hAU@oW!u1ugRT(T~AK1rcZ98_V9KQs;3*s<{8 zk`)hGTBC;dNu&!ka*|3!X1+6i{6DtdJD%$P{~x~^5<&|ZQA)1N5VBLUH^&~yieqGN zN+D!t&uospw@UUn;@Eq04vuxK+!fh?&I;ay;AmD zXG~BF8{G(GCk^acC{oe(hv^_}K@oj*$An8~(hNOS zyYWP2r>k|>ABz@tDBtk=5FFqV4qTsuNkU(eJz+M?>V9D}9H~ZHSFJ8@X20FsZ3;Q? 
zuhShlwJX5Rwq@A^mo$Ix=1B=l#Ej8SZ0LEZ{~(KZcL0q5;+F@TDzMkG68?Ch+}qO` zw!Nt@lEEwip1$xI0`bM+UQ1Bj3I7z{(d8n=;ri-AzOo#JmB8SnN^Xvg1+A)3vstLp zlHR2P!3TlY>sR*q%;|dekB**x=-C)~#?v4ROHnQQNoW?}Ah7xMn5oQN)pp9x(uwW? z6ZC{L)qS%kjJsU-Lhnk3%`B`xGvmHO#6T7XxDdzeT%kWycm2|)RyM+~Xg}9a7?CxP zCeakI%h;c*C|;_iGUOuW!5pQU-!pv%+o>x>nrDurH-@%QZ&-qK{AaMeg1S%0wU?+q z`!O9e@EQ`KuVqso=oC__?q0y5S#K%MXaAM)T=B;7-LE#UjJ5v#I-I-s7(YlY;a?GY zJe*XT`50);+Ud7CSXYm#J}JqK!px8V=F~ZYb0_xd`c5VB);pk*ybjgO{^k)n|4?%R zSl&7_ouhdJWM03J-nKTe$L5FXw#-Aj>L-#minw)<#Wv4rwH8dLL~DtoBSH8=`1-E) z$R#ig3&}au(iGT)dIIz)gL8bK8DL*`2FyXG8x`;TaU_sjCZoojTW#8;wT}T>oo1*Xma*VcMepJ1Gwdo|V$ajG*CaUrj+nOo5Gj-fr)iYNl?kQl0EE3`2 z`1dREbe32-m#PLnU6icL-cU|sJ0eKk?t`BvPRiAf_@Z3~q4~F3<>_a)-!t_t>h3Lj zW%cL|WS%+ds`QU8?c!`M1X_^fW@z zUxf5=HpW@6qy0l~VVXYqR5#3h3%B4gi?FoDKCu_mAYWD^aGxpEy?;$Eo0@*|cmB4rCXbfOP?Fi? z)Uww%k<11qYRowC_;(f!l|K%!;j#*CII(=;BHtVLC*V5F!nec!NfOPIj}!5+2uJ+K zZy8xwSCV=$^(bfL*)B!Y{tZo$>-Ol)+tdcl;aQn-o(?~f*K16tpBw@2WUr}7x4yDy zT;1>z^UJ+}J!gyly=<j0ypBB8G>sS1B6$;AJz((@%s#KrJ7E_A&{V+bImK5haQij9MD6t4KfyLDg2;F{|;1)I!$QnEktT*Fl`=C|!h#cQH^lBFkTX@2P*mmdug3>{+ljWp^yt3HYX*)PSK-LVk`Y$PIU- zGZbMSbd$}n`99}q!4ubK!Gq;V&F+@GW1aDQE&+N3yQij`@XmL%| zruJQD1m2?xCxgash;N^fZ)FQL$|QL}e50m)*w~a2&&rS8ouo1Uycd$#@OtTzYBTo9#E3n<^a3;Hl%15#F~rx` z*E)4V|3VSX%yTfiY!=N~(^dm-mFoJLQf&yWqeKg%B6F>^Zd~BIy1p?ExJkDKW8~F8 zi{^FwluC3)?e0MM>pXcl*!FBB;=Af;?JbInradugb-iO8wB}(kwy|FM^4V7-eqSfW(Q5X#M?E;ahYJ#fre7! 
zw@<{$>V{5QIV>IO8~w`^WgzYlRtR60NNMt70|5M}0P5AUG^XES`QSjm7%NOtalEq5Y^h(yh5%2$H(YCa>Uw1M^Jec=B`PuNSjBiQE9#2eBc?BdgKo z_3rC_;M9}ZFACkJORjLr9KrHr=Y?3gP)!4`OwmIM1XY3bZgq&5i#oSw#+nisb+b){^`ubp5ea>v~+Z>)~n0gKk zrJ|)}?2P0Ivxf3?ZRfh=0(BZVJo(`o*?eyP@sDnKcY*eK1%M;rZ8cZNNk>6?Do=GqJ z2mT%Nb62L;$bwelV8$YqYH=G1u15~F-2W{;FYX;S^>5?JF5o}x&1w)kk|XLg)=RQj zG2r--6cK36f+UNFDL(`)GtlN!IZ-Q~nlBMm^E}P|0cJ&$!6YvKy)4USR$jp?d?0=_ z8az%tG?|UO z`Sj=3gtu_KA$S*N zeVLt`_!xunhD;<=EHEbHX_k9Rg`#SNhS}Xo3Mn4j6mDNSXhpp`3gTB@{O2TT0Ek)6 zA}I{EZ27)z|Fuf!%tj;f8{WhAW8CCdAro_EMSb%6`c?*5=rpfy{%| zNceAz=Au|m)C){woxq>{VoKCJi=s$fQ+-G1iOMV1DPc-1h+(J@8S)gO2QJ{{K(lA; z?I#Y~F4RMdKXs?*=_Qh#^Sy0usHzz{$L)AK<&XpwZbB${%sQ^muR3GU!8`EQ-GpZI zA#}3u5>ytD?zaQ$+7s(S?CtBjVrE=m(9mE_luY8bb}dNU>ecSm>(#zdbg*#(=1kk> zdv0Y;KMPR!9q?_*g@^T;Q(FHzeQ>MKeutm_I^u(i25~;XM4*Q`hm+N(!gFl8 z)D&({=a6j=LV*J1NarrVT@W`n6ErlFgW0iirf)O&qtobgVhj^a6(8 zRx>hmxb;G4qzMYCDZpCQY=&>Nu}9Gn$O^tHSWH>hTUjuMSWitv5NEtxi{9wCnZ(RGO8)LK{~MaZiLMo-xyF*FTBa;CYvM*Ja%&0% zrIsew7#6EjLr*=Vi&&n@f%be~;q>zlQP(@jI||Vw7zwq2Di96t+j4#ueRd4ofhx7A z8AW=#6OVz#hjb4@8S70RF7AiG>`zD3=sQ{S(Q=<{wo>qtDRW%hf8y_{cn>^&=^vP7 z81SjGcKVO3-2$cNQ6Lb}_vUiOj}Nly3YG`iHuIcKGXF>qq6zY<{NM6u=PGo7#+#{^ zLFNw)<$O7|sh1isdajOX{aYd=ro*{`MyQa zXB*x3LYJD3M+oA7W~Jnr6!7fA&Q}X2O9|L4de$5s&Xuaq@0?E6r(bUYxxt9ES&8iF z7vQds0_X+v7}NZ)V{Y;IOK2JYf-ea&_R$Rx6kb&_CTFpT@1H)WcI)Dw=j!D%P{3|T ze|Sx-g`arHMb3xe%UrF?GaSy*75A*J)e)JUI~{z0tIq`tde`DU#ED89Q0p?WYXX5< zg8KX%K&?_O@g&7_8#a?5PE?ha>zUGJ{krlW-%J~S{8-s7{wwK0pB_}@t*T)se$z>t zHWR->sYf&KKK(;myVQwyxJpOI!j9&Y@m!u=2`!`g&jOYdUd#Lrn4K8TS384?{>*bm zlf==7lHpAMrIV-a4@2N2*QJv0@2&PQ*0*zLI(Ameo3fB-kozcHr&-mLI$Qc(97a|| zr(xy9G5=?HcB=|+tMESG19EUpe*-kbDN4aSlBu|@mUFbmTVCA*((cc}4g)g_rZDCp z^_8J0_wIKXxyS7)ogF}?X6w!H4Y46ob({IjQyqNTKe|@Z5#C7W>c7gWy z+7q@};mO)L1{d2%c)%({_>trk-;M@Wie&9URpEW$iLGy_aJkeIzW7kB8-%G=LwVm| zxO8`}$VqzM0kWu$Ly-Z=fplFDytaYUmh_GNjE453lFH@1E+L;aOM3m3u3&4dXW3>F zT0!e>{%fA1yz^OG^dZ7^iqY`=LDPzxY?^}^yQ0ElQxp8A^5r9Val#+!dKe?u@-pfi zGZu%MuUrtS6_yHUVBUn~Ge%6n(EH>p3bD&OWK3&BAN? 
zWS=EiAkC^p<;bPqyh)e(j>%tyq_LDCxn_>@!XyJ>RKcHPG&02a%NR)@YNeRklqD?ZgSG8tMSCR-Tdg z`7_AQK}CZ`EGxqk7_L+v=U#U_0dZVY;7;fjxmlzoMYEr<80Dd!!v&i;(kV0;;Yzm| z%J{P~ynRdbPK$aVM|Sm2h%_-q{^}+7k3-*uVB7CDyH8VrCIxq}Z9O@M?#jc>tSXK! zFSJI&wlWCrfvkQc;2UZylp(ixf%pQtT+EfCWF?mZ7pb1u*dXXoQlf6#P%}qIhh!`7 zd)$|+j_}ysMcA6oSv2Q>-LZS;v2RnQfWoc{?RTjz04LX+U2VV1wtw>`=jVBmxvcr+ zb{^Rc&kRM@hss6%VA0L@(7Va>#w$VbWgtv7F+@*~&_7BBfk~OYFhbsunVK19)if*8 zoP(9M(3QTEztx1)5B4fzR@Bh5cw%S(zG^X2Jr&t;*iqw&5vQ7)FN`IBY!pKS+z##@|j>;jLuDTt~k7$bhK zff)dEf5T?^Tp2Wa|D5;p;iT})hpO(5@shQ~m2rS_k^$@4lnPMQYEmZ7#V+zocD+`z zw~G?)%H^8$ZI=lr;+Q%kSyS6ds49ouux-5Ua0D=SlU5J>4lx)dMdQO0Yr0JJHnG1q zp=eVN!%BhwA)=IB*29U^?1APH0_baE*WG^;ck!DBAu3|k$d%atnYE4+`|m|4Q%u(sUUN3_}CHv3nqJUOo@*)-L@AIonC zB|dzV_+CR{^&udj2I(+auS}PwB1o)KhEnD}*+{Fu7|eG~`16DQb-wL9`Kv6?v-~ka zu$Rs1g~SP1fD72Nh0w1Hcy41kS@{Jt-&LVFuNlUV6ylu@cf;v)C=oIy8LYQd*Hhmf z)&|=T@+P$FJNjNLm2^$F#4b1i138h8_BQ6(QG=MD@g=$vJ<7_t=IdkSo)7| zCEHW47**ckD4GE*s|i^2lu!T+EU}Ti39Z;8U<@~T88TTHmic=j%3zp6a(Iqn*Pvz5 zhwju=WpuD zy?=78Aa>EqwZ`0GIRcE6GH>26GMm{I1q)KBFU(BkP#D;l1|YZ{=fZEZmGJD zNp>EdgZa5=EO=R#v$Zh;g4p5AxCV%NpI&Prw)Qz7nK?n2d1&7v-+Wu=&Es@OqM2iK ziI9fp15nW|ZBAMlv$BR(;mQ6g1b+D?$}kF$_k0Y3Uu)Z5SqmUywgkB51^v)!GBi&Y zB)7GR-^kAdUhMTeL$KdzF0Bf$4s;`k<|u3M+;Ka)!R&B1mv06Lnlg!8!KQ>fw?EnQ z{BK(!&u93LIJdKgS}n*^7+IlL!(XaXZ=L{`v>N(D^>DE_XEhy{5)hU5hp)#`{UHjT z3nN;k_-7|t+&6mNE5BGFQAY5VhY4RckF`4rJL_og{bZY=CKCt{o3yfWiJF?V?vu5^ z#9l;*MZUBW#fvpC1Bix$kYgcrBpSmq;Mgw@(o`&ZYVSYiPD>yi`9vW<5g!c5>aIFS z0(hKT8+48Ko`}vmMRhocZ{U+aq)zVtcstB*ncv*PuTp9_WhW)+rKgvoILE(smWSUG z@^h=oliq7Syl1uGto$m!3QUD$sWsiyDt=`B4u07nfIR$HG!6fH4Q$b1qsX^Me0K>w z!4`~@8jq-DO1;Kw1Ic`s&N3v4{}EPd9Ed*Qm)8Aa{!SRcK6{22l?Og2)#oIFZnC=b z{usTku3Y*f<*!$blx#m=uB&{8`Cht6LB#0m`}SqMWTr%a+N%R%-yi9FX9{ zHAxN^9`eo;W#+G6)8`0p@b!6dg8ddKI(X?%dQOo@oxR}mz`SS1@@1p|FLec=v5KzW z$z91`E19NFCZL1BbmEj5%-ZeQb6p|R@q&u`qx@=Qg2(Ct`MTs50~PTW(Im@FjBwKC z){;pt+ta^8XcEzXA51Y*aQ}R%sWj9PC{h&d;67zMUTpGr$>D;F$xa%q#Tnw&?=!Lo zzs`N7d)r 
zYx@!Ec>h*JU!7%uKF6N}%hgAeTY>5~O^$Y+sDv>yQJ#Z+6M`)nuhAa3tAT4WJXauI z(_;I_gC=gh2mYkDop<5~9Qtqgnf!_*?;;h9MTwruwOP`RET^Z!GYv|6x?POVsXBpMk zC5q*nyG(cASi-aiXwgf3n+ofk%Sp06YI2=Ayfi+lr^U9?qimVz}38}Kishtv^y1*XW4z=x>hw2=t zpIf$F?hVqN8Pb_Bpbn5Ok11@a`qH%G>vaZ&<(3rqOz>HZIlF8h(RmOs+ZgA(`tv(; zB)E)RlT{%*cTrX0YpD{^J$6x;T5*tMOin^Q`@)dOkUX#Rio>pHto}hsmOdrC^zNR(drOULZNW1b+J?o@QhSpT5-XGxE~x2rmfdaHpz+Q+yHW#_htw#J z%jWipVQY6jWoM0n`D@+`=h?>fJ}C(ibAtx@#v{GK{&aojXX9(trn{041&ZGrMYE*k z8!XQLcd6TrcW+8D^n+0HI4-B+_N$mi1vadc`t(O7eCwJzn03dat)?-h%& zhmJaluwOkZqgoBd+V{i+?#u84wsY>$I^6rysBX!Z;CKBSI7K1A@-14D9 zEB1r-8YHVp16s{}^j-|%#(fP4OSSB!A&x-HYVBkKc;6IfOmL7?{4h2Aysg5myXY9P zqNxu$pL=_ox$=`;D`{n(ehVn-jEa`azoOf8`em)`Y_6Rt%_SmHui4#i(r4{wwpSgx zmqFfzmbdW3pGEYttNz=xlo9jMh;41sX>E3N$DP;r!PT(HR+qZ$DHpXzp06A$t;!Rx zcw{mR;7B+b=wGro=g+<6j z#O^WNQ%K+|cQh0x_Sr+;bi+i5XG365qmR9<#v8IM# zZqI|3Bp-vjD^tRcIrPPqF%ZZLyb#-E|6vMq>{#=8H>qE*8;{oTR+L zzUywdN{rG&TF+!p{xkgzn_M-Fdffik3`NRRcxf|V1(r%dynL%9fA^o7-0PwknvHvn z)~>A3k<@p}U)!iwDhQuCD{ZzAApS$%yc0ff{(-oNshX}XDun#Gs(bo^kn67wvM}Lc zsU&&M*M0k?#sb6tya0*3$x$eEls>TyS&DGBN3YwmQM`z0g~qlxr{~>X-AZfa;dP#k zC{C8h@}g#4Scvjnr|ZMeFw#)hFdcY}F;ZCRV%Vo=0v=Uj(#J7u$^RT*h>f|)E*Fb5OUCFon;RxOlj!ya!&5y!iZ zMEm#d(d~W6Eium5Gh>FZ1SB$esO<}EDUU?tKCL1)qt$OQAF3&uas1CVrD~$92#1MJ z{S08v34UwLN&O~0MMSO1ZpsoI6kgT(Rfj@i7l5IHm>UzfUOY~IG&mdE6q&5M{_05) z|EX81=hmw{BkIaGju@PpN$LgiL2i5K-EuXQqf*9X-4gj&nZ^6u+nUwZ6D%P#yb+Yv z&QT8(TtY2YnO2-0>EBU1S3JF77?O88BXQ_qh#Go>X~+<#SszkvTcfL3S0Y{Cpz=-Z zZ1n!U;wPP36&?LJb3LJ_ioMN0W1Ni(B{0~vcYdn77*9D!9y+fmc7|oF-9_#YepBu? zxti;u`I+fD&n@;`~QJUn5Y=jIMF35Zf)0_W)M<9L{_?mJ4 z^z9;r8J307%el~BK^dMAWxG&Pa0y)yQ(X#EU%G}r42d1p%ps93*&+PaUy(#Ar`A55 zQLRVWshte&^K14;v&=13XM1nEm$2~Zuu=XgQZAInqtop27uG?y)&%EHU`wg|ZpP)% z3@6!pE`tGg+s=bh@VgA|*FJCy(PgBYts?cc^Or~Ofs#)z)o!VBj+6q?-`Ka|RZQoTvc2hD z=hUidqdmk(ZGN5OA)A}3VA7j9BOXEVMr8u2)Cf*yOpV5V*OU7(Mf^1`gU6L+Hm%$$ z%(VG`=}kCeJE!Bvs;a)QBx2X^aYN^&Tioj@_amN%_C|$3HM_@`nmg(!`O&QmR=f2H-! 
zSC4l)@+N8RlHbpZ^Xg&z1=(ybnXE}WCkAApwJZ0 zFNZo(_DkS{UYooH5!qEVsH3IK7uENMH!t-^_3-moz|I8EeJ5jfj0Ld%h8{r1mGdhM zrhAq8R|4FY`U4}KlI-Foo_Ot~u9&pN30+BSGH-ikF+8T`>?)foY1m#J&rsg+1X}#cK_*}JGy*Du_4XH zF(Ihojh-n>?5hIT5G2%InA1dVr`*#(r&5k9d0&^`$ELoldtH2-<9u7>hhI*j zwH}BBiuG$YC7#(J!gNqm2Ql~RQaGg+lL19H#dGT=4~i4o>$H^1RJi2HuT0CZiBtGHWdttR^FoWgAS6w}(my`%_y(tARFK?TWf32m2BmP>J5-EVlL_$Lm z>Cc#0^wK@@qV^Dh?l!d#$m?_{eNIA+T>P6(ljrh`e~(RU%XjnMs6-_v1`=OAt#!*- zV~n$TPaSZZbp?kqkb75MsCyc2*kt&(82gEq_sJ`REuX*|1kneo>HRe4LCfuD?5}R= zD-tFn$VzS;Q^j2>QBQ?DDV_OzKuXmV(w6O91Q8ld&>s_+S`6}Fj-!wp9U1z*-Vn>& zi3%f^(PBXI7btqr9j^OSx}1h&FOaYpM6YZ%k8KTaY92&2be~cvVC))K0G{b=FBUT% zmHgSUa=(0V+irs}`rg0IR^Y0_^P&+!zHEkXjKXhVBHbR`02TuLJbOdznfojsXh8e$ zj!0Xojtu@(s7|SlW)1Wh*c~i*2FoIdb3--}4;4|GoUvZyYxk4u4byKI(Yh^lUQ=3B z+en^*h9@Bw1Phz;w)5?1^+7&|ZJx+SC!{8~9~88hEHO`tDk-W1T6U>qXQOBCP(-nE zz`B9?F%oyyf%wwM7Q%lpkhoaEY8!^C3qkfDq&6`tk+UI?y2YhDnO*^XI;I!T|)6n_YJR*W|umByF^DY;Ea%#&euwrE)TY$<_juo zZN^vsxM<`p33x4P->9x$KYzLA88{ED+2X}FkS6(a1np}Slo6s?w%rCBntkoBHk>oqz9Lt=(wHr6~sXnoD zgZgfhr6>|&)K_95?2&ejef>1;N7d7u?a5PWFsrF&&BezG{)Wy4!(OdX)6rA=o&79{ z9nUPO`ppAo=(Z8kIap`h^FhQN`pUWR+y?&3YDi?T(ky%+;OAt*>O;?5w*Cbhg6$Z}= z+`K57_!a>sNHSW^ObSQJ&#m`NM}px;-d6SQ6AFJh1h4V>Uq_3*=J)t~ zwtnVx4IL=`M5c`2Qn$Wp>FCCkOLh6;V&CAs;=#B=1lk@U~zCO zkNi&xmigi(XItCRE6hj%E|LmhIkdSc?W(MV`>V;v%&CahZgP}4*^g%k{XDdjqOxQ$ zTaNj4u0wue-)5xEt%Gw?Nq02KyxaqlA!4b!k@7xiVYGE4Y2+G8FA>yojK1!cF9mb5 z^6bqirkI!wJsX#f`UMn}A^FZ9)rDR>=&sx-ZeGmLWbUL@x}xA3YwLMvDgzhg-oz=- z&7e<(G-EugmCt)On=tq!(0R2nacz90J5UV1sB}U9WSj12ZkOD2EG#(l1**#4q?#nF zg(xQw9=j|-_4-eb%DY9%JGrrwyDRC_y1&NF^e+PxR79CMIfUPx0@6}2GVX4Z`a9bw z`0J_Bl}_z4&9=KbrVA>ogWB+Jl8hiySkpmet4^wIV&l^cnr&HS+o1s(YNw%x*QSzm z!Y+@TOI}g3=Fr;d;;#}@r5!1ZF%Q7RFf_;I=vNbyQi>cF02)aHOF!ic=FN0rF;!cI zjr~B1GOXDCNa6UteXy9f6u!*XPlx8D+*_IQG}N1{5V{uNE11KR|JM9x$Ig=qdfn(^ z5k9TuM&Ztq$iX6(7!Csd_Tj58e^#?uyL07mY9gfD$+?%VU}=>K{yJ$9I$I@g>FIb& zZ{!#zqs!EXYBS+jVumDZ8Fh4RXz0}-;M-&~+CuKd{5IR@$S+{o$GtgaF%6-Ar;k4n_^nyT=c2%yQZtSr}BG8O*pPFJNf{}OMMKzig` 
z|0=>FeyuO9ZrsT=CpTS!n(lNR)>pDXeO7-CKzoA0BZB-tfQA_nd89!0WJjWf+j6j} z9xnK`BGG^d8DGe{@4d)7b}$>7O~3v3qm}po)BVpsvmxYwS{SgbZj|rVW>4cII8j=A zbM&%JZM-@k?Z>*Ae5cpNBEu_8`y0NG5t1l{lR`D)DWWA(t*DV{o0EQBFsGKm;@an- ztM*Ngr}OgGo=00Lf*_cp%q>fN8! zB4!*&|MKw)wJ6N2Bv@W=Dq4*1gyWw3az{)r?Y(`6ndgEs=fLs5p#?kam^&nepR8urnq-8I~x1J%YMM+?y zA}Nt1S9v3VQ-s~H>A>H3N*&~(-lv3nkqIM8`7Hi3=U!A*`pOXA@ZHMLkET}qjqJd{ zz)n64_I|16>Sg9%g;f}NS;BSmi3(?%)N`-CVo~%+;|`%P-IdY%JH!`HxED`au8dqB zTK-@e^(+GlXb_Xt7gBL#F9_26YHZTxmJ_n0Ytr71RzS!ZF)2X#@jD`MCP% z4ntquonkS&Vheps{oT+!1{px_Ds&5~%!V-g>ktd%i{{Ph#HLi^$!Eo`UQtw?_y0H| z1X4np*TtqTn{0!LI~3m`C~|eQ9%eltY%+1~iU=CX7K+i7nHqb! z2v!L4oon~rq9(vd_T&rj-g(8&w8$qzh&8SfO4d~xRFlAi3oNHdn=-70Xez5D6H)SM zrdh$@mYc_S#gl<#o66NRN(KaGWjTcVuz{?ux-Bb7=SlaI^;;_S`kOA&%qrr~QaRc4 ztEvF6y(x-%+TwwY_4?cTV4Oeyc%w#!J*jCMjg?^E@~iLLc<`KiKK3*qlW-7zH*{{fcx8*nWcJdjkZRG$nva0s(1Sj~rZ1$|tW~sOg8JjAP z)DLPXZo>p0(CgOvNlB4$ktyM=(6M`C*vG__?S zJb7$mjAD^RyY+eKJ>ZY|H=eJ)3`nJJ`F?UhDrNj7F2dgzB$Imn-COy3blau+hP>N{ z91JHWk3K9dqkbmBqnTzn9<8VMZARz&`k|XB_5R`#GzGC5G4{c+}28%Lv zjSSaxMy2XKg-N^*2L^YJ7oJ~9tnqr~w|*LiyB9MCsIl6W)ZyVrh~t54`ka?91#y#! 
z=s1sAa{_Sykw!}Kb)v?m(n_aV2l3!7t&br)?Qp@BQzh%=T1$(N0=*sME$CcP6bD-* zsE-EDV@~imYM;}6l6Vof3c!s{#i0fYXXDB_qUgOOS1DrwsApQR6`ZJOxFIewL`3nI z$LY7G^z$DCV|Ai9Xte1E^Mb*Ur)8!@Z9SeOIJKT_$0^l#F3;kE3gwn9Ru1qFX}Ogh zuQ@R|r^AQlaybpVCaLeqqIyvyU2RTKlR-2LoS&s{Pp8Kzow$Puu1`D1Wc_Jt7Tw1vEtj%h+Vnk#~NXO+`lw(ZVxCnz}k&Q49eFWl&FwmizHKHP}1 z0J4H4==bUvR!M-ey^3Q+5(tFmd$9jE!Y*Q9Rb?erDZw9^}|5hn8{Z4l* z6_QW9@qXgdSf>$7kW6rkC5_#zkpR%j>c*F0UQurx^@i*vMeLKB&3S)0KRA8;k#I{< zlD1<`lKnQRzbQc247+a-?%~Aic~BHX;!;>hbK<3sjWzijg~g$Bdo59!mJ~QhvX;%eRi=#Mu~YcF5+^!GM5jGwi+dY4BR;88 z*{bkop?8HmWzNlM6SXoU6&EQMz4!^-gw9BO6|pe)M(^cRP!@Ok%69ffip#EY=g|s9 z_7aj(*{F0Tzd4Gca;AG{oWW#c$Iy4~OJy^ztmXU{9bqd_<;U{0Nh})F)GQ`5^D15s zX(YhTDo?S^y*gE7r<&0ap|_`{c8ykA<`dG3Y`JuhcWtoa0R`vMzXuR5QcXlZ^(M%YAjwKiKPR!lc0YkcK4`|vlbGQ49M zgJMga-A9923R!##9nE*wxFla_V%~a2pvI4DdQekt0TUVbo6_d*dbr5G9`=Gl4666wKJIX1RRd3p@+uVb?3Q=S^Ki@*%^4 z&n!ijr|EB#Ua+%VXaJ%@ONo#Yd5vDv*}4~V9i0>H26y>VnGi#S$RPO@-2h!*H;jnF8yP6>;0#)KUs41J%}NQU!8fLyw!`lh@xkuAmmhD!sj75^yD zZ#S0IrSnd+6mcF#i`Jyh>XS}rN0oC8jXXjf23T`~3$`%tychEHoJkku&kg<^JoPf!M|JQKTq)b-#~ABcQ_3l|MiH_fn1D2 zcqKAB_q)^}!)gYk6Hsk2fFDP&pwzzr$|JICB>NWMNb~x0i~5nKYu&3w9#425WH=Yy z+W8Dd)$x5s774!NaSo&R;x?=3WVjb&{d!%Fv!>+D#kiEy6-V%dyiqZ|JEMhE9r)ti z&h4oYViA8#Fw9}dLat}iR0`AKP!yE&{6!mBp+qq2K--L2zNZiS8a?#&sPckCpfN!v4?#Wc3{U8e2Z zyLj>aze&hn52Aq){4cCLYe9Bezozma@d2U{A_-!o6YDywJ_+Xh>mu6E?{BLbeZq3^ zkPsx@5KUvHT&YB0WsGEHfg*{u=vY`y_B0)ZR;+!0# zO{FXOS-jlHe6{(R*EiWE$~Dx-ISunE)oH3U9`6WruA9(X2B&xXSL4AQeeK%%rNpq| z=C#v!F>M2eL@wvIo;%Cs+4_pZm9Hu(PfmlgcJ?z>vW-7!zWYt@DmCQi{1?SY+nq~> zSKhoP-gwhT`IhMUhreEcl6YPU3(@$*bcgV=mQ%B7`G>5v({D~J#kJYl9PWny{Z7eT zgp>0V3!in{5TiJ^Yz{_k;NGLZoKg2JAA1@!q|ABj^LEV^4Q!Yinzn zS03^x8_JSj@<*?6j%^^AovG@*S{=f=dU6;Z7px+UqLyZj?Tu}FmT3@|-nVcH4L=gj zI&aJ`dQxW84fVze+!~%JbIa%zsc5eI9;v$a=Gno^Rzshy7-0uA=|2DU=OI`+*El*L zy?s;rkx*?3&C8}@Kr-)|EAls(cff*8a*`8u^GAxlvwEMZ(BnqLNKHG$@KVwmkFk9yo6L=EMIRQ z9d~0!QCaBxwTo3YV{GJJddHeK!jxamRVa9tnuDO zqr2X9)sg)jnA%^pYo?xs)o-(D*JiRNaY(4yJoWZ>!(`A(ndwW1; 
zzgtu_q0;KM)08EfWKL1?=z@+w&WMQ0h;4gF^J#_FgGrUEc8xuj%IGnpM$@_a;zZ|- z>7BB^MLPJGI`!g3N3L_=i(3~zJDT8JEKQ_MdE&H3ePln~ zdmyv$Kxk}Gna=X)bnsvi2+aPx3_#7XpL!2ZsA}@Y*t_Aj@9J!&*}eRq7vS5&!LE7r zG}nRpvI$Y@Z{sByxpU)@inU|bBh9vVyEyFWmKn9O`FPO@>u@ASmu6aVe5-9p7 zC-?Sk>DRBD8mpaeLk1{bDBlNPJAj|7@iH|7Y=Zp1_ll zf2AzOP>Fb}@`~VvTLhTN#Fl z9ImP@A+b4D&D$B854vGt;~w89@r+b~as5T5yU_9#vCLk24>{P>gv_pnb)n0XAfYB) z<})(|x`!-3DC-b4oVCWEDqc70-sEjtiNP~(F4>}NyE7~^-gVPiE}^l&xnjxxWV?@b zvHl315W@gO4;^$aNaE!&uY(DdBmOelvE?5WC(U(T2mtM|y!Zwu+;20= z+F$=u!UpXF+0uLs6Gw`pF;I;T{~=bBmO98zyZ;Q+LW!|S@u%lBjvH_>zD_(orb0Ii z%tGUn?H2*YqHj=2{>=RDt-24dVT>o%FL$sTk@elrEfyz-@c*ROcR5$Udp%~?q_djj z3az@sz(JnCKuxD&pW`QuUzq2QKiEiqyNL_2Ky8f=Sots|tp+nj3r`du=QCt1a(L=Vna5P|5dp!yD9?_kC2(S z^m!Q`5r~#%WH?n5SJ=*Y+uXU@Q8?3VpYbM2IUI4#Nd|B4S-6HrMN~zFq+NJ+mli>j`DGq{j{)BWfHE}?a`~<8UjBcKeR({T?H4!GC|enO$iA;5 zls%KR?E6m0o=REDh>(5Bmh3xOE7?M}P_pkuMXPNrk%kc7>&Emvzo)$K+dt;>JTv!w z-RC;zd(QcubFTXeDo|^ja#N;X3Pu?E3a3hr1G#;Kl2Za-M6CW>y@S-CMs1&HR@$;u zl1lH~gSj%-nMY3U23L36h;EEn%|>2=U*GXMpXGek?#^~$d|S9OeMq&rD4~>1MDuIG zRJ#_xE~pd|j6I6J<}dakKT7bjE)mR-0HRElFT7JSM3$VYuiB?yow4_z5cxkFiEuRr zqHg^t2HvP~o*QHovkxMVvHPLw3AzK&Aal!Kt1-%3*3|eiPL287&&zb69+r8B&v2}Z zqUMw0-L-3-)C?{Gt+QH`d`jU(WfP#oFK+DuA+u{I;3mx_-EG1hh1JE0Z{f&j3;w;f zc<;Y3JkXNQl{Tl9KM{H65^2c2ThVMk5mu>OU_cZEZ3PDi6%%bKm2Z%vShitMmsLdW zx0t>O-|Vv&p^(p9`ZJx=MDgWc(()vKI(|G@HOToZ65=}*+jdPbhMW+{A$poEty$}W zuYl-F(il)L_56AL!{Asshy+mzEUd6p@zVGsXeOlD>8oNca^K2tpIgsGTIjFFowq>cVGbG?^WgrwXd1g~o z+4U+8wDcW@|EYc`mh(7|?>erfrA4MlPXj2cbk~r&%ey@;3awem|LpKN9eiqfMjwsQ z3k{x#+Pt0P;F$JkDg&F*Kc$8G4#~PvGPfP9x73gQtn8g22iC&r`M{w+ryBQU9TNw` zryCS~XGC#j9*))h9#&@8zw7Vb4W8#)zK0siK`jPuue+7d=$}4rWM6QCm)>NtXjudt zu6)kYU$Rd|prRo1^oiz2-UVF?l>8im4|TpDyO;4tT!jbY<;Matooy#bB5+dm?qn+Vf_IPZ!g(I2?ApO^b{ z&@S!k=9S}Vg^6F|Q=1oei{+Wk*#nhoP-BR@%?u5UPz)hVg0VnP-<-YYEp+|Q+XMXL z5-#)hUSPVho5pVvo|U~ULJa!TjzF!CRupuwKV`LNqJB?+m@;B+67KQ=%+Y7`9Ilh7 zTTt=vE@aBp0Q4U9C(Sg7e}5=%4d^1BexQfQRqYiyaZ;oA)a#E9)^B%txV@P@uQ`?Wo`%uz}m+ycATQ 
zz=gKH)+o*4l1)0#;^dQ-ZhIpzJ>IA=eF#RKMSE|OpS$9Jp@gPmc=HO>DH3qVsbO>C z`b&>PUqMR@_geFe?4NRoX@L#Eu~dlzrWV9*BJMzQvVcg)H2D<%9Jbis%280_XVOxX z0hk7AB+)9FMWVG}w>Z~+4S2(KbyyHfUC0kUyn?3R_%6R&NDPNZ(*0=(OOdFN7FAxs z-ZxBIGm5evcjQ>Sg2oy@;%k`$QTinkP%sUbJbXAlaYS=?<$LbS(f_(XoEo(e{#4v` z@Uf=TiVbbu5wxaNyJQ53a^!PMsF@0_%kbxO@9%#jcvv5PRW<Id#0hbkcMbG|x(X~ekm;G;+njn97%Isa{! zWkQG;R@n+4&e$to6=mo0-Y-Ah8*7nZ*S>(djFx=n;<7kje$x6{a9=V0e@{LAN6`K? zgx$Oh{IO>;kG-+lL%i*^MD^Emp9ih^pu1DBd_1S!H^Uk7_U+pRA5bBM%ixRs_o!-| z3f{D6w*Az{BE#wANJc~SNL9S0xd0836UkwBc zUHZKO(2E&bXb`z9L(Q_xM$QL{!!-iz(9`a~=?&{$uQJ_yT)P|G%Z4itb|AFy5U>$o z>!??wZVpZ?nVfq5dOtPgNu7dNbQULo3!FXr13Cv@!4duozk|PfRY`_Ani89%Gs(oc zS559D90tJFE(-dV{xo^$zs2wXoePUAumG-w3*U$U`#^~JUZ~b#Zc3JeHyuc)PayM? z!zYlG$lkuPn7%v6bcIRUM=i2>_=P_zkCs(GT3(ux`@xOMzeOs$>ip5pf6#Ybw<@dr zIl@D8SWi6p;h5Lh-6yO-yFt-bExp!0@9&HDL#Yxc32u-Nt97)u+&{Yu!&lX~i`w*n zsAW#2lR^!fe<4&6=lJ~3ooxW}(KStD_7w0CPZBk&P~#(&g8Y*r0!S+NKuLXc!mm)0 z%gxFNB`tdRP;glcaVNrbH!NB&lzthNK@Dac$BOK%4p_3^9D&VI%=#WL>AHoQWdYJh z-0>?3K-)#YY60=d(0a3m`hbOl@N;3m4KstB=&USLk|V($o>gCXKMj>1C(_N7|D2rP zTTG3G6_`M&(|oceiayp6XYO7lRFawLM=p6heD5T-L0 z85wEb^g&KsJ{hVH1+QJg-jUCs+GAwm9ZC6|{mYg7--~5-r|X?Iv`>s8GK)sj1%R5` zUw0e`?k3^!pk>v~NwtxH&ci*$L#ravQ@^&R>E}F`XT&|Av`LegLkyg(w$A75?EGE+ z&@n+_DI%2Ffr4S8={H_tFP#3g9=+1Pw+n7-MQ1u7GaU;Y0H*Xe-9t5BgtYT?Ht7*C z#KZ>B3@Ixsi+HpK$9@(m&_B>j5MX$=y#k@K=Rv=WdK7EQgZ(});pF}oN40zBrO(1uu`q4b<*-&g0Tz5i_OUlal3{2c#0 z)-qUIf%#0y%T^faflFo40WzlG)OrSx%>{gnyZmwnN+kZ#(0_gMtA=1~cKTylVL{LN z=jgrI^3YurWKS|=2mA^T7Dkb@r+QBPxAt6>(B8bQ-JQp@QGAKrcO~ELm^&;8>0Hc% zl%e?;{tzwIyd%f%elunE8R_Wh#T%_%0-;pP{iifD*tV7pMp0<39Jwg@J7_kMyy|MtkVFogpJ(Ut z`S1G@e+PH2v3@~8X8>`S@{|KZ4ASKk0W$Xlz?gBxvi{wq0M>&BB;|FUW?|#0!)T`o4b?1U4Mdt3<`fc7fD2)6Ya3Iq2{Pu$y^~w(G0- z>wD4rA_J=|lyS^M<$-ASs)M9KSfh?>_0)ViI5czzKWQVeZ@Q$gsL1+>_K}kWUEs-| zpV#^r1IlTL%m;rI_V)I|pNjVQXoHWj|ECJ)fZLkd$QR6rU_jBoBQU2O<~O5yrLods zKJ{m#DNZ^3je($F{N*rJo*#*AIcdF_waYTytWB>*#|SqDM1{w4?+ zuI&iO-oJMJ>SxFw_nx6P<0?!HujoxWK(_cdpbo)ynSK(`A=rd^e?bU@J=-CEOb5ZL 
zB@`FEF!6Yk1akgMXX@Xl?gu0W7fYcg))(v6A!cI5<{-?tnH&JZSsYsYM@A9?h*`JU zJ^w5-%8)LSePwe$JiK`v;z``1_o zy6;jN(2+ikBK{*_f;(a8bB8AR@XFUp5%AuRzy5}W7w)cbGxl^c1L)%pxkfRp7VcCK z4}#!)(9zEGspdeZ-rNAR_Pw;YC5G;-EVAdpvvyD<8edWP_V@eX;Uh_)T(Y9;rPy(s zJ?GcA3IXV3fgjioKgg9>@OB=!F&xn-RPc}P)r1*O3EhvX;}^`#7}oj*JvPrv-T+*m z+nZZAYsLRx`J>Qe~RUfkJ`Dho}5CUCYQ|Uc4C1c>cFeqGy{SA z)9J(k@c584=dpQzY0%mO8+>udAX)`(Hj;sORqn+jI+nAtZ`27G4>k^mKJ@GX%ApFOF=SJuY$0{mlyWlCW81taxsKtoZqN*k}gQ^X;+Lu|F# zt85FZ>e_upblC?*Nl;_Sp__G+_sjvO_r5})NK|3BvxEb&#(ZUVc9xS(MB;tc(B1?C zKj!wsZJXTG=3Q3)x2XS(dw!b)r9z8zKKCv5jA|UIO4bUD>PxLFpq(=G2BGtUaJc@z zpR_Nq3{G8B%I)N=xY)b;8IHZ_2?0QCO_Moztaj4>^A#zVW{&(Rv3z&H`Sh2qe*qQE^Yp?f4A-s44(zj$w{AH?c|=d}l>1;W-kM##jp z^NulsJ4>nF`_Crf?gK}Jh5+chS*Q!zPHq}BCval`f?~eBtT_nS9H4mVOB&`Oh#Iim z&9%#v!*0DPP@(Ye5AR783RGrq)`Z|S5UbxCI0Ze#R33eI1~j9}+D9G~0=Q~6bL;BF zOm=7v@L6G?CgLT_SFP>C3`IsmE(1mf{1^}44JhuE@;w}IPDsK}uU znN+yuEf_c=1c^VxJIVoCE7RPABmZYL|17gcv03$Juc`^u6Jk{dJZm*cn`|MiQ>;UU^Ks6D2V6)B z0d5R9+UO!w)cKp}P`3!s_Ui(-SJPu-W9Qa3r-SEyr|7r5H5NLcUTHc9!SxMwZRGk? 
zB-23--IAPVArWan!pA^bupdu+>U#*s4o3}?&U(E_xCWs527y3RsZ|2oAloUBi^cu5 zt73P1`pHA>b)vH}?h4ppfE|8Gc7Sdd*&{@=EP3*OPsNWIt+^fshjDngM;1gXu$$3eHSbaYm|U{FcNdD<-LJeq zkU~K`gMM(d2p=5Drv|?3+#{a*ztpXapmtT6?xkCDg8lpj2FVe`3mA!I8$?h`-#Az6PI6R&vDtSBw1!{9T{d zZm`xC2QeqVHl7@?N}p}EFlQO!ph_BBzWHz>HHd_W-1(&LE+3M07rN^aZr~9li_+^Z z114g;*5NOmxw~>h5fKnjw-K&sxrxyA0w^YQ&>Ksd{jUU84q?1YJL|t<+}puWR*r+ z8i)A&=Q|g+)~}ka9h3e!X|_hR?Unb~+bnWGi9gHt!o!^wjXXTFtq&%1O%fDny0)SZ z1|GNH+Ca?=4;&pc_bQkQefs<4wF{A z0WdJb7d*^Sz$zSvYe3xmUdW2-UIf+Mp`%T;oZVBal3BMu5p^dc>%J1~q(S(nF4vyKOo)B+Po(2jxYg zcw%ncT+zC7@fgAPc|7gs4cd;67k4hJj1R}A!Ln)n(l*ko7QYo6`f73dC8qV!Hb8uC z4!N*5zV=hibXo!rOga1eq*5f~>)SdKDe3gw>c7E|H*yvEW8G!4;MCE&a))?3UCE|TnIZg}YYq*cy4t-Xhoey<#DSZ) znrA+2eA?d1DBWSwdbys{x4SVN>{;G%*K5d^Z(RD`Eh@W<8KId#Y(K|zMM0)x1U$-( z>DRHboNP%Bc2uoLQG%ZsUoEMs zfuM+j@JD+VTV}y>A1~L$abULTw!%dsY?tcIOcgd&uq8Sz*UZF#FUZmP!c+qOmWJyv z;a?>TBpDw0@U`?xax{iuZq>P5pKtojqMuHa`b@;9mKzh=wp5@)29DWbkREJGooix$ zE=?{Y2Sfnc?~)pftXC%##J;hYDbue@MohYW*M4Z9nHX|ue7>Q>b#2L80kc(a1^V-1 zAWl}-;Sn7#IPr}8J#xotGgwu_ldxu)(?R+r^<$~hUl+$Pa^WNL<~*ZT`8ys8 z2+SF2Ey41^=$rQq@FQ=UXHzEI6iL5r92@Wy*qWMX0;(7j84^DhlIB@W*K_2-lghr3 zhfikmCmJx{>ia)f`&^M2JB=srU9D4Bo+D{@QEAKN%XAcfL5Sv_xMIG;JhUbjOh>sV zWnBmW^zm~i&_VG9(KTFr;kO0D-tqK!^R)}l8eCH^r^MKUFZ>F z7_#~~PQ0m_cdFNaeCKNGcAfY!fXFc?gFPDb(Qr#s&DoL1Q6j{QEwIsTu!6`K^9tFyxN z((q0ig6l)K20tc;O-&L?ZzwPl<`tvHtt`fyEw&PK=bDZ)F2bedJ&QJ|PF{f{3i-9y z2)102pWgV4Wzb^00EyAu1&j5~B&99;VnK(LBZ};~43n~!A7{xn361&S@&5GY4}dlp z62%phN>F{-9l34aAdF@LwgizT1rLU{pD*Q9Y&{spjq9wE{Al=e&nn+Dy?Sl(L2u4> zdx8w?*F=hpo03*K^|`hO^u2e5z0AJ6=>D26jlK(NWqg$dG)pFVPlYtK`|9X!D1!JN ziI!d*1}>stT9FrC3Pg8C1K3rJhC` zjSQ#C!(ufXAEjNmRESvo5nLy}6z4=Nt4MmviD38}A7+h8b8IM69-OsccrOx+^jOXZ z4}%xHgP07Ce-S^IazP>}uwa)A$!he|c@?eq3!;8l(oo|c#1#s--|Wc7FEREVgAwMX z_?b9rd(FU)^k_2tdX|x<|3#x?IryDG*!qpWT!87aZfCW?FTFtU!r5moIXGl|2#uj# zy6O89f%X#rL2+fA_Lx^~f2>NN@?LCtjvgQ8eO}Z+=Z5~)qN!=x)%ETEke3!Do2P$j zi@*L5B6x#KQ+y_P!oBtKK;S50!3(GBMKt&!plb+jAl@w>P)vIEYzTMclnk<&*Bh;; zuC3J4bR%)^;C-_Tj^aS 
z-|otu5&L@Ge}lEY4qwLioBR4#lbH<7EFN7CnIkk=yx}MqL3d-Hhy0!5P5b9(eM8Dm zcVss?EF7x~IbB2AL$_jc*)oThQA7K2 z&2-Jq*uc-ILoSpw=}|I1cq&@fK8s%TQh&I;SnS(JO53f+eqAKZXwAE1doBLTaiNqC zQRAX+KcL!4sAV&cnDJA?bUf?v@AjGa$FULvP1A~xXD6gRTBklKXEC7OB;S78_mFu0 zT8rh^>7=X-@22*KQPBXjWtzc_sIaVzXH$130%?Vb-)?Pb{)`q*c+*_OvZaG;usW91+ zu6afO$j-|OK6S6hByAfNVqbGtL`MYW6cpqh4@-Z2-WNclFD2Czxc0dVWcH1X!=RgH z0;)gtcvXqpE3i!RMX{1zqCNMIm2*ux3A6yiL@f zy_6pR;ZpS-Jys~@^1BGL&*YSNP0m37z4L>Q=C5NVz8AtgpT{*@{tN-gbo4`;el}<~ z)JhEx(18eeio5UV31{lxU)Xuje|P1R9?n-yVKawh*t+YB^VQliso>lL4z0B|_{~!}96rV4tr3>RK*SDHvNB8O8F zyuqP5tCBf~0fi1C64|w$uv}T)of)XJ(^>SrTCbs&h|bX<8dH6VK7Qn7Cw#8Lm@ze9 zDgU!0K<|lCP$|Gkr~aa)t(5m#WDR@u=H@g0b@+`ZcYKjk;R+E!#W`a;?=I^< z)ono^%?B()&i4Cv&NHevGI&6LS4p4x{+S2k)ph$z z^@h$v7G$SOhFltI7PsIWJ!0!punF>yZbaX@?dnx1W&@>tFN`Fzt-9{1#R5XxdGC!y_+n1aXkcX$cBPU?#?QkxOr znH8d%6@ot@{5nz)c8#?x*Ke$gID60@DL7Mquld8pUKyW@J6pbG**AmN^sOHE?Py}7 z?Df+g(MFlNtjz@eXz{6`>G}5P6Wa7fD-qVim#OV^=KH;ZwO=B9{_QQcAI~Ber#(H6 zn%Mr5i@FKNDz+`HxFBWwB46z1$kjps&+ANz$H~!NuDv@g+w~u5o4A|j-5gI}ipZ2b z<$VF=z$lrQ^(@2ONW1#@-I_diGqdE@0i6o`G}=#3$d9 zTj`4`h7Nf5IaRWYkdRTFh%~&vse9h9s&dBuo$8FxaKMPSjmf)+EBd>cd7BZ7_1T8~ z?(6V#t?Y%gAHv>vb(mnY9QD2K+@3e_(cyWW`0)$>^3JFtv2?p)MrlDnK#XC^hmDN_D!GBD{#yFR{P|b|81h5>3F*AvmE}NF9Y)sC1Q;5 zQlS$ZD;=uv#8DL$#8bV^(_^#E%}Mxdh^C+ON>y>`Z)~b-ioQn}BRwJ%9mq^QzhQz< z$nDX<6yBEl;TNf~EO_{YC_I=oH{$lqmf7u7I%NgMGlpZi;e!|W+ciY`&bbF^Bj=kE zw%4Pz`{b&Pw^cRNnS8_BC6rxp^72T3jrURE$_ zE0`-VNI>=SyPqEv0`>CwG4`uQ?ggdq7QS#jb+~%EG8hle5(5}8tZu8McJ55*k2nj= z6Yl{!f6J_vyP}9t^8WKDYR}qn&wjN}EBsN#Z#a}Jo=E7E+j z4u7X=o{{G_=NlhPJrC;#tjawA!w8Tl5U)63R}{U&!sJV2b<%ikWCPL3KQ^fYYuScL zj( z?Gd?%_0<%iYH;?*?&bYZ&V+>x&Ts0;^@T>7FN23uf4gu;TZaz^ahH;i&t zh(ur0aUkB^FX91>DQqZ-=kpGuUL97el1g0ZGGyq@lc+gArK5w4Hw+hdQ!1Q3s#1EK z5>?oaadn%BRYeEX?fu>btw z>8aG!s9Ng+Q*E15jN>Z=$SKtq6iA*29}XL=3^E^*fffsd8W@HeBhK*Tzw(k_9P_p9 zy{q+OXN_-|FXi#4MY@#OhAXC2Nr3d;!VwrH=|<_EUQArYz#N|8*5^B}wUT#eDD{H& zE(E{lHx$tqwwWBjdx){;*pm!)mj~&ktJ%w~A_cUsN1M>hq5|^!LdsHUKA#|qEmMrL 
zFE|(Vz~_Z?Ce<37-U~D#_Qz}1hxM#}+I0(HLHPoJd+mu)F8UDlB6kI>M;p(iu1x%^Kd8aNVVQjG4nuB={7s zZ=9yUL?s*9j*@3vPHiPWk7UL3PbXp-?uwzd_V8+VjL+8d*J5?o^XF2KZx&vA2>=XhRRMk_FLTxeV>%Kqw(U!3yi}p#Ap~Wa+<&oU_;h5B#c+L z8sUshR{27@KtO3}cL*X*OyG3!W4n^0$KuBjSL9GHUrUYD)J{=>FH2 z;v3K&o}XM3CVUb%cSza&nDcN|xXhA;Dc|n5^JBTg71BDrwHMEJ{ z14BQ?F)f9%2Thi?Xie5b>$cU}m$sD~aQs^)nIlFf=HyT}} zE2=FwlbpSmRn@M%#HzN<1o`Ro93sz6bsVIHNmG_lG$u?^KR)j8Vn7vPR4q3xwqqFR zX-4;!=k#*;faf=AUhjFmEhpj{%DHv{+W_P5q~wjU-Mw(fk$)`39DUU%QtGvMue~B-1Mt3llXrwxbrN z*u+|7c0LE#?K_QoL_ZDev`$vI&DIKB6gpdQW2AO0k!wXps8>%;4<_5@H5+x+SXrE6 zopa+VCzZNO33!PEgY?C~pv$9M`nQ8?w$j4yEZkjiKk;-WR}P+Yt~uK0Qd;~->!&c8 zv4U+ic53+saH2}u8?j3|RBy!G0`TA>@jy8U+kG&TwZRF>aTeVJV}7n5OMjgLJYa`>0j=P1R#E5?K4PP29vo45>iwo-L zHz?H3UU^$}>1aZqs9}O3hcZM!zhfX!)z4Q$DXN194w6hr6#m;;YR+Q;|5Io1w2*Nc*UYk>DyhMu|22c?Jk5 ztlHb}-Y7Qh7>=#=tRr+_tacqZzArTVRCv9wyY8e{X)kA_xnik<=uM#%X|VMSLPGs| z2`j@YSQ*;0+5CGjj|aOCucujOAP8V}HVxa;Q>re^QWxt#Rd!{NDf?^PI!1`iq3Ut%20AVAnaUnXjtP=P z-(G(tR3VUb((-jF&u~VhR7j`OJJj{JwwKnjS#gU;xT-djJ}4@c9nxa_Mi3qk@`mza?bT&U)i!^bjIj!9~eN`-}~ z2wgmk+X2G7Ak^Z=bDsll5$2Hbu;m2(~N0ud4l?+r2C%5@7#QO1ubpH7vY7)ApIuwM)<-soBQ{hiYB)a zGwXD$E;z}43`E`fhVdjTskquRENT>p)->eufp#d=IDGOw%K5jhy08qpDZH+EJZ6s{ zn<3Zzgc#?;#0Y1?v1u|k!rnt49Z}z}(Rh~3>L~TKTUpWL%(0oWAGkTd1O!y-U^5CE zb%8-Fb%|d;#x@*be(sPsYarU6B&=FNiuTs$m&1}O&+qxatp;c6;!y7w$H#oDzTS*; zJ-nUKB9g;`G4y057_zfATddfUbG?o*n?mHKeY%(PDZ z{uL3HQQ)?ArO)yZKEG)9a8)zqm4w5^AVCWQ;T{5 zZ-vI$&%w>kD}LvY={mM9xaST~L`|2w4Sc_JxmMkv;d_mH@$3_os;X{BoH?U{Df-8n zxD+UBZFAJJGju$}0?~9E5&0)C2cqd?96yHxTrm2n``YBGDUrhoUJ{~1IMn$_iT^Ay zExy*;;|A}9O4!Tq?KD5d;%8DuCe7~qNwLdk(T;mO@5U(-;z>o~Dd}%rc|OG!&n<4H zk1d2~M}N=;Ig>M#GjZfd-Oe^)Y>PP&`O3zMes9b132(}0xthd&o$3>-vU}S3E8hembA+ z`C8&gXZ4EPdMb}hi!);^-de7l&E7}z%1V7hj7AULAb*_g(Mhyl(<#$ zHp-t?J*9WAN{xTrz|Q^F<^Hdxb@2){owwJ;zB&ju{J=jT#4!Je|Qx6AA7? 
zvWh9IE2Uhsl>H%Qb52DO{rnjB#P8kn^XnyW5Ur3h2&4QaE9rVNw{GU^L?=uKCYung zkd>&T7Sw^uLzl$yOSS|Iu#wD>6>kgJhrhfb1r&ojATZd70sm+JU0tHCl9q3de9C`K%LChaPuLJpb!(_tmnnbT$J6vTUX%a$YC_deJQE z0uq(yM{59e0I>RbI$;KG#h{NCP(|Wq$#2N#bPW2!bG%+$&Yk%BxwLi=hQo+%7OTpF zDyc(}!wN}_u2U(MGg5XKrGXu2&jpq9J`Mt1Dw+}rsJ3wc>tG{{a|`H z9m0cclfPn=(IZZzp*vc#q*kh|q?OJyc0W zKX~?7_lWe$(;uWW0$Vq__)K2SrEjiZ@)d`qq8#?wpSDpG-*5R(S4`h$+f#2hg z8$GWK1i17%h%q)}(L`s-K1>DoASh6wlY4>IA5mkXl-|^{d*1scx7QHA0e4M5R1Fcw zyP*_W0rF(9_!fIJ*R`jN@3zdR)>;}cOH2U6*#MZaMi-|#qs;@3c`bp7oqg!PM}bpy zS9R&Ihw2ZD9XJQ|T)mvmNIL%vr0nk>9>y;qGc;YODlAUC<&z{kVObrLZf-AT$X@Y}TJ#ce z5U+~V7H3?3KK|j=M|-O8c;+zeu?xC1 zY=?gXx;(s;FK))NrBg9G_6e%=Ro$__o)FBYYU8PZ*EPNv zpUN=CI~1yh`yxC8^*Q79yno=iuWv|mtSeQXrc?}g6@O}+0QRvIL39L=Hj(TxG})Gh zhOcwLzh1*ihlfIfRnGApa^h01VLP1m)x7-;iWwcZlBIE~iQrLdE@lh664y0*DaZP5 zXm3P@xh;uX;PQSxjpxro*-t&E^m#C~OOXjAijTiqZ3Js0ldgq@8&w!4+>H~cJvnx( zTWC@_N8Y(PpwR76w)vC`yOn<{BMwalRhz?%k$OnfyQ5TWvd`Ka$$rOgY2=|FmOF=4 zI1w00Gw~mgr~!<$U1}~BC{4Mqb4oTYkoB0ht@SAW&*}#r{Ra?5jdI z92c<=lH)Tz$&fO08D`^Ftl;~?M~!V^cou`ctBuS9k?%XeFfsFtyB(e6B}3%PS>D8X z8I&+lTS?3e+QIN4$B${6I)wV$*+rdN_9=W9^?w!)huM2o7cT0NW0v`$UjiEsMEBO@ zHzbac=#GC^*498SkhlDnU5>g{%U4d2dD*n`(x@}?NM0`2Ah!+I)>>22vP_TuewEbi zch(7}w3*VieS^=$hH4{ux&$x&LKP;S%+~BuOee<5?>KmwIwmsfufZxY9kt4+fMAB- z^1koK_=E{~qt?P_DoS~@qZbHH{Jv@%y!*S0&cp3Upadi|Z-7cy0<%vv{MCCzT& zW!)N_uBxg^b6+WYb7QG=BtQRCa}pZqfs}2R{es}ap0rWK-7rU^852;^yM1Z(!SmRl zyi@^wJa=Oy8Y;?vZt28gg`H7Mq6`DtU6v(s>l}z#8}V*soROE!#AisFR%e%ePe+aO z9^GtK3GAYEqYMR9Qxzlyv9D|Sm3w=5EjDm&5f5J$X213#=Vw;n0MjsLKr0Z9o(nSs zi+3l2n-+?Dd(5`z#%d0{_~v477^2frdL8j4^3*?`&5sVhf}~gSaz(yXCrZ!H@cb*` zp*QDgUQV}ntytd`Ys8@s!Vq%540RKNR|DO97??VXz&_iJ$}fn?H)w|-mwMTU)qFw` zv-ug4?6^27CK>L|HP zxIKg%r<6dRXDN$1+(({6O#swCOhshiMHV?#7v|nm5T8A1l%R%ym-1;%*!`rss=`*5?&8jl- z=!%_qAlgu8dfE~Sg-_|sb5bNV)VW(1ohz`_@4l|2^NcjE*uYwR=cUq&zEbctoFGF* z{1ck0$*~v8ul5`X67jHo^y!DF-G`D2ttzyPSm&kC+TZHpuTLrLJu2&*kV%foM|+%P z0`jKbmG}7y+T2dff}Pr0k*S=QtZAG=xV^{ocSV#H7dv=5bz$=RT@j;WYU#GtHe?hb 
z){!D-aNh!#vG*PW)+#Tl5s1zvH7iV4E~LdQ5lnamqKl4_lab}$7tj4?ArkoPPcjmK zjLD9Ug*6&1MyZ@!=-(wL_OQ)Wt689lrb?DiUUqGcfCOl?Jw#f!)Wq|UTKBvDuCbk? zgaFXh*IvGohZs9A!^WXxso41ugYfFrq}@MP9SDAbha?D z&d=DwS7?rCnyx6~C_8R-F?8_AHnic5yq1mrnDj!^lU7%qL%1&E^uy0j9De;s-@TUn z4(5i)K_=?#4DmSvez3$T@+E`9H(6)z=&)VCXi|#HxXnIgRMk}u7i(~BzsKNl^(q#9 zr#;huijw@K!*896(S_NEAonb>Id5B01Jnu1olFSrU3BSDhyMW(I`w8pB*SY+neCQr zG>3-4MiSi$CXLWWdicNYY2b}6G>)sQ5vIUBeX^3(z6Va{x{O8H8Z*-mMwO1-p_te3 zI6sJEhe5)s);#~n_}*$EA-3K^9O76-LQtk4hq$f>=sb$$WM4vT=a(mL>$UqjVwkl4 zn-N6#`KiHzy23fcZ$HSU&P^1^eb{>uD>+!dQ6AHS_=hb)Nf0GjxC$jnX#)~8 zG1YiD5bbH1X(Sw36X-_p@n2~%H-3ep&2i*2fOpeUq!n)nu`MSWK5pLC?%O$TV)098 z4nO+XIy0p*F5G6RAS(S;6;-(NIdep|NLkR0b!s(tlsCw3BiDQq&6{T49j-iK9)AKg9@ zJdz=}PpC=Ww67IsiU~EZNk-tV&dOoMHJsX5wv^eLZ@nT2H`+A8%_n)P0h6VN^&k`cd@@Poj#daaz#1@P@b-;PE8Hpo|rlN!`*N(i!;? zOg0PRRNE&nz^?7xsYR=~v^YbDpjMq4Lq|m0+C-d$j}se$kQu+?2zLwDDnQ#tEUjlW zs2|7%JYQwb&S+YGQrrzTr1<_%2C`Bd^5glJ!)aV*sY~Q-JdF6lkrf%|Pu}1i^irME(58{cjq`rmMb!^qx3xAT5gNaMbYE zWK!In5rjne72Y-&nI~RN?=F&Q1!5}%tJRVUp~bhKimXy-y9*uO3K% zI@`Hi8p+tR_U>|#K2Ems-HJGK3nlcQf89|sRrE|}O}$Qj>gp`r+vK-2?>M`MJud*C zZNV)8itrpzSY*rJs#;he)cam`mKZP_5&VldB?W@U#Z>9qBL1dSfoSt95xpCBTsVlb zs%Ew^?i7|*9zy)fzC9? 
z_!Sq6+jUsdaSn@mvxtu6W(oWgd*S*2AQ$d@^5R=)(7I*PCjloor~x>uI(93!Tm0D*9yg*Af89-WG0p$C+f9Un1A0%3CH@?= zdq9l6xiHi<5-j+886PwJObi_DWYrVL5t!NwWXAg}TV?eg+gA6`%k^9$)dEgM{7}M# zg}2_4F9fGZ)vRZA%m%qm z7j%GuEbkgf-mZbvC@z7G-Q^!_yMRWm?c*wID)n!L!t-|oBRhh;06203vvu^n7DP?w z(#L)qwlBcu?I>$c&(#+2zDUiE9Mf$1px#Sw9$8Rrhl(CT0_a$YZ|mrt-WY)C5TP66jTsi-$20cEZOyZn&uajBQ2BZ$M>12!iNMIP9@SF>QRyhO@3s zTj^?H*k$NhF8%9Sj_y7SJMhenr`xpnah;sHcKL8Ln+MM@e8 zkyQGIwVCcAe&N04w?_WOlGxP4f*00ujH8BtuLA7b< zfuHZ}DBdak?IGSk+s5sBkeNQ@Z`vNxm_G**_xSTzK?D!bF$S_1I}2wSDWKZcyt;WC3-S41_z|Kr(vzTrN-@6g(2@<-A zIPC~NZ8I_VdG$XR;AMg+u<80>A_V`bDLbFCg#PEhA4L1aG|D-h;?he(rFRgZqr%M} zNXA(L^#p||UeE$`T|ba@Y`oXrD+84(e=O0`_UCR<17=`IsF+PP8glXE<-5NZ>H&C0 zmn8I{l4+LvI^br4OyYe$vwLti3Y7v~ovHiF5v*=|m=FKw>~8-tA}VEvv6@}}0oxBb zGs!(hZlGZh{Jj&nsLx&79HvcWfv*1qIJ!y>z!K>xUE%`l3B0WXfr^Vk##mdiG_X>Xpq)AE_io{7{QinQ2%=Q<4*VpD0sWnhq;*Qs6PE%A7 z+f#YArZo$FNsh1ZPdjW~zH-oj5XQk|(5R z5jwA-{l4;vn3`m5#>q+%w67919M?|g1@=%t%*=fxh+@jp$HJvcc=+%mzr%~=#vHc_ zca{8C{(bM;L#Y;NP#{w(Uii!>?tZJwcOKtG(&M`Ahp-S^HfA_cP_Y^Z3j% z%UC|nHnk&8)kh}_w>{iQ+7*8dzNr5k)rWjf1Is>=JDA+F3aqiwn7|d7CW#2>-d$w3+rv9RZbFX*!xHCM)o4~F9R zg;MX(pcM4lX4Y*0l{{4T@u)D(D3W(1r7`_@g@zGA1-%>@FQAn}&A#Gv$pK8GR z?=#kf4zNe}rF$4w$D@V)K7wyWUjmE-Y4sGjld1nQo-iET4=UQlE$3%`?(@YM*v1X3 zLEL}N!y+$A#d|{MDJ%IS^JcvlL0+H^wiF6=)sXnapN5EpK;Em4&4urIesJPH6WzJu zdf;|2SLLu4sg<5-Ub?qCd^rhTMy!A?JEMM1!?hpE67Wf8v14Yv`+Y={j;*`dkqwwj*g98}6V`ra+vKew`r5Y-f& z18L;0UD`&VVO6vlen%U*#JwCZn#mSxXUSKua-jX3kk;m!;uFg!%DvPG@nVW zG2H55x7#^K{&l~*&edM{sdr*LF*oeR;BNnl-#}oOvJ_7&tX^&3gfvu)vJXvx{P1H# zjf39sBf*0od=4juh*m)%&p=U1Oieu(tT9O-SNZB1DZAXx)Q`{WsaqpMK8JA+070(wfQca0V>g4af=^k_i-tye#Mnj%3IBp%? zzhD;zNvwC!G~+8>N{?Ek$Y@*aO^-|<@Q!v;5US&eI`2#;PChNZK4Pfd?l3cH@7&5! 
z?}wEIzn#z>mbBc_Mr80=UoB~9o&MzAKiYq#3_)X7?~9)HP)i^lgSAapTE(#P*$_WX zKKQV;-jlkn!4J2pQOzI^?2z#2thA%$w%UGGBd|ktPZ~)E4cA>GqX@`Jy4^S*VPwaA z@#~f~zF27wU<*ys@D@o?@R9EXV%g+3i7^gPD#NVL*8Pv;H@lqTVhf+DIbVx7cfY0KcnT%(mt zg*aR$ro&loqGh_oJF;nA@Bto#-W!IShP#t*H*+M?dnuSSgf(ExuAw*EepIJAb_nQ< znh%4s9n*V<8JYU=0knYpqlBN7iv%c~nffE|;a#$~PQCAj>0ubNfD3aSF#p0SM_cVz zYLLKWTg>pgG!mT{o*+q-h_SQ3I~-LWS#?ss{BPBvK~)FpwGdaV-wk{ODJvGg|JyHh zcGX>+>|V5cMY;@j4fX2PwsZ8Z^Rqe%J5z#a^UKu#6)n*S&@lH?@4+{V$h3iKI0R%1 zcH`HQP=fNeOMjwQM2f#6AaXra+nB9w(pq{>vKqHGZmOcYt=Uq*%s{eGm>9C>!_U{3~@dgDpF@GC2c&#VZ%fqLpo`~NUd}7{c*lV zO?)xm#Lj5*3cZ4lc!&J$=PgMIevfl5MIW^@VAC zBhMGy^p6`Dh+p^b>goT0=o=`V0Nyw|^@WGudx7rTZdVl5kQtGP-DtK$JY2o%?Vyy^ zxbh8Lu$1_7>AA-4pd1?gP$C=DnVRcIam*T_N3=;~Xv07;E+MWH)IpWI9} z-m{rvD{Fz49jy3D_1UQ0vH}n-J;ETUQCQ!hwy3t&##+tdTajMg zCHiW0TuOY&ts8l5AC~;hkS*Fu;Z})sj8uSTFH$e}F4M*d|Qx})&%E$kV&iT6(k-S9*sR>`v2YUga+ZkrQ z#07aGu;dxkR?94`Z?t#CAHMh_Tv29W5R!FpStik(8evhly!!3V=z=?w67P02;32B+ zSvl`uO$YVZNcWE4!6iv`l=f|?TXF0a{#A8m%hOW+iOFq#RHLYyI4V(NgdJ^xH(+lx zuf7nW?!(M*lwn#Bu@Hu1RURQIaxPe8s3yl=x0av%PJpgs>sxVVrA=)!mt9@NcB7g6 zl+m%J0OaVbB3sS->eTggzhKW~wR54%gL$G?M>gbD&^`7hwL%8=i?)A2(*}gN@u<&# z*#25*_Rcc{4AGBwKmndlbdy~AzG@^XbpIJhQjbOAf+Tegn&Wfqcif1nYPvQbC zoF1mI9BZmDr5z-jvhF{oO+>~jBGtk384g~!T6``hqz_)57)}0%UVz};FAd0{RZ<$7 zM{njb#Y!e76el6@;{(UEVY>GOQCVB~?nK!H?IZo;ER%!uxs;ylnKSKb$C1T>bAnzy z)p%O&*Zp*QCS4y%dDm7<*QIcS0aHVM~f=J(jdE8XAVx2G;*MseVjy%hZqW`gA2<&M$Bm^sm(D-dW(#JmEd^9-IwRVz*qst6x5T zA0Fj+5dF0J4-S3niYzFH*tz1eKGri-Gu4-m>c?l>GyNA`QxGc-waZQIWpLG?4EG27 zKP}9p5tec9MS^a;fs$8Sj~+EQxedckn;cY*bCF2dZE}#ehY*|dcU_`Q6H5pWGm1kX zVXqzNz|4bqndvoFcdUTK`x#&tER6=V|F7ZzUUFMugDEyqV${4Od4xyjcJ3XyQ|*)T z6OT=&eXr@|>3qSw=t%U{S(u4VPy@2AsO#i~^+k^;ia1q6jUlim*M(~t1%cxwwYz3~ z3IK!keB_))mc*oLFAuk})Yn&cT6q*jJgY1D`-G(ED7_BSs`f{*D3GvC`^;`sl+G`2 z_I2{Mjva3&&jyC?N3bgZ-tYD@a&^y1X+6LInKGuy)URo{KMd4mr8xyS0l*FZ%(w#} zsx3iylP^fCKt@VBe%5>?b*Q-o;^F7sLB+lNa4#;rGXS*~%+I8VvM)=D3J@^CI`C3`Q z+f6!Enz{uV1=N>{Hgw!Djk3FAw&~+XH!#VtEh`fa`IeM6UA9{U_t*cS^`b$VM|t#T 
zOW&iS9_mRkCCbrrp?!+A3K2JWZ1k=#gQH3Jj?)1UI>ahGUq3)rLZIbRd_1$j%}md5 zD!cnf^1q;ooO5?lSw3C9ovj=)3K2N43wU(`m8gpj44FR>`{%C5=y;CEf;o7~+>FR|v*PzjlhRlw!-u@RNrf6U<097!yd>V`PLQA{dGb%iv4!pJdaHx( z@?8HYsNB`QdezKIgH6orGsvG+lsaE8DQDk>?kh(JKKA*-$%qmDqf>B8_rPbKc$f!D zcRj+F4!C*;7$;{XQ`_6y{y8~0Y0)nm^FM!n6N|sARWK$(a*B&dnBR+)iyfQFGxgwArv>T zyD6Lxy{pgTT!3Q>Hh4Y9VQ|H}Wuw8<TTcd8frGOOc3T`8gt(ZtB0!PDqE-dO|EDCbTzxEB+7^sF7s}=5tUWr_6Fk zS~s?-=QYwWp+2n#f6xR923myZsK0yg=b^?@r#;o8ec_5d3Q8%8sc%D{l=08_k7{dl z!*i8gY?2&0H$TC0a=xFtEDsLv`qkeJ!36dT|pI>B8v-tD4t7@my@6S@%*@k0(^dSM=AGG zrQLN z)T_Tffdo=;J``dc`qk=}-l=>oYgaU2^w`mAJ^DoAf>3>O3|?-3F65*rvgvf_TfH;A z^W%q+?CP0L7{v{;H`-Q2Uh+8grg~Kd=VLey1MLw#$)EQ0rha~^aDS{KL~5~4;=W4i zHjpirNv2~v4GWYX`RXKvy#BuAKTeB{!WbV_ev$0ys?amU4=aE?Xq zY#umMjkft~vBip*CgTrm;um}tsN}Jybu7nUuvLfY=6=zH%`iF;o<>d$>%3T{2-Bb=dS1N+%T{)q3d}q4>z@#(Di4dzXs!Hx=2hM zBjVNRVJ+po`u^q@GrO)@0VRqN;;Xwu3_8`a)}J}qj?@Tf}uM!w`w0TRd*K=Dw z@is+1YbEFAdOd?+kZxARcIEUdY9eVm*K$wglUHkR`REDgO4;a~hX$*wjI1PE?XHE} zT6;w0bis4@MH8GY&MT#&hZWi0Q{7~kksYntfXh#m5Q*;o`Hvn6hpM)kVW7o}E|pVv z_h)DgH2+GFyu9Rb>@&9uR$`A`YO?@^x*e3M9Kz2jOMUvMa_hdKDSSf&e_)0&DsXzU z?&c8zkoXNIvVQnk3Q1^o=&HBlmU7v>VQs@4jr|7LtO6U(G0iN3--|kueV*i z;r=+Q(;V!?!Qo)mQ;n!XKzN5L61=h!j0JkRpJi6$Kn}bbc|wFR{J>C7qfe^|ZQWUY zJ96MXOAKE(6_Ex|#KL{E?Hj-f!)hG5g~$D>Wz;u%B*1d@YN+#XPR?F|Qf8gt`Ga?i zqZPOwv;*zCT3hE7#R;bTQ~cEUbUYS6_6NTS!F(%i%SyOro2y6X8tjdzv&ERdkeaZYwgTeCq|n@(?=e zCP;aj9d9=3bbVHC99|+e5~C~!&npgx6(8iMXOa2okI%j|`!oi>_~mOFBcfQ@xs;Sk zJP$)YX0yIW!{;Za z0bY=t3cPd03X^lVY+aWRpweTVu%8q&E-wm-m+#)vMXD4W6KUqasTXNJ_T^oz_6Ws- zF~3g5?LIw440uq+Z*ZOu$CWBUzSgDjOgWsvlG9USKxht&;!Jg_rK#6$q9yS4B|^LHj3 zfCYQWFB|(6PxwvKc~QwVu-(5+ZP;q^@A=DnLzS*PCQyr}$i|H+++5abr{C2fp>b=m zw@mL*L+3xeRAI;CmRs3F;|4X-71&01>%FgzAS$&Ea9An#)>HMZ`%(;!&eQ8bn`Yhj z1NZEv^)R;wnm$wJLv2^SDMU-uhg1H}0)R`XHkT7kLr{Hj=ponebp14Lf34n%Z0dun z3SZTuWd%Gs;+6NHxPrS0`jQ$DiR>Qa4Wv>=@&Gc~lO+XBs9x>*0*DP*Qag$zde2D@ zXl0K`XmS3pP+fNTFBdd|&>{2=#W08SH_4R|_NJKKJjK^sYeZh!J(DCJ#7B(>SM3fH 
zv2@ke27HATx^D)QtIssKy|3aR4oh*4cT$AxHHw`!CntO4D7|glM|ZIb73jGy>dYh{ zC7kC26yFqzn+sF$4vNo1(<)!kD|oaH55Rd#A}t}zI@MVyXoVO^xKZxis;J_e&{H;H zZMCC41`8P<9Fwi1C_Yo?v7Kik;kjY~#f(q=@^EI)JKlNWVg-r|8~;Kd29g7GFRmHy zi9(O<9^^v_{PEMK`6x$aS@-0K?}BK}5KnJSl+{6m!2jfGbWoEhM>%_Ha#W$b-_L3} z3}bim{M=cSHzDcu9`G%PPOlP+q^rFuA>%LkMOD-M7HXdhH60GjXm{ML&!6xA$mUE- zbe>)BqpHvi86w5e&o#>0o_?}eIqtC5U6lPij48m>^+JnUoXtaQjs1$nB1J)pf)U1) zL6FhIEO^|*lyte`prv})XcWU6y3MwZ5E*{$!!Je&MMA*M6Gv8P;SVfSeUJp!NT&r= zN}a$3WCkB26Q-Y#5q9NL`@g{8LY}f7VXpd68GLrG%z4?8;&>{W45v!$F|6QTeIhok zf3CIRlUC6WO3sJkcbM7WEaHO*(35|Bohr}%AuBHOm3C5vy$nX671o6FP>;TTZ2vi} zF;~YZkzFtA@_I@Kq8uwc4ruL@!HpVW3p%w5NaH1^mN&ir#Zf_XiirTRD5b*7k+q`z z)9L+*UlVSlya`-YaH&gyu*-(rFoM<1`ZlJg1%iLqf)73hUpeDC&ek zm}VeBH<3$i5^pCvk%ilMK#nYa6Z$WJS})1=w7yZ@LxB3rvs#&SZfix8b!&cM+fhTcj%DT%Q8P1OCWc?St75z(_;6!dQT2-A2(t% zt`oht1>IePsH{?LW$w5+`d!{a0z>-cm-`X3F*pf2#r)4;YSMb`5>nyAn5aBR9H>pM zZLl`*v@<|58K}SNpv3OTb(tGjRk~7qYukvA$9MBat`(t71iy2OlTErY+1;(UP|MKM zjav`>36j3yvkl+BE57aN30dbMYsVc#h_rKjvMCYKiu(@jEL zO@Jczd#o~oVSV`$A6=R~?6#PJEn-vr~ycI-*BdcdekU04=Mes;SHw#1W74^ElPdwBfIMV zsn0`Hk(gpa_^eqyG@G9ZawU}R&LoP+XuJ560ax73b6QIq>$lKrSyg4E(fd-2hI^3B zYmc+A?`Y-l+3?Yi%k5<6v8w`Sk)EHIxjn#@)SY@XqXik_FWeLcP_ItKq8r_pS%7`wC|~HhhShj{D&i!i;84?CLYB#VB=WEQOem zVgf-2xZwA2K|b3o_T$5-e?5expUGvgN>zUrEkd!}rdn&Kdw3L&$s-^#5o_d2E4+iub@ zWW1Rau~GZ(0<6LPW-YtD*dO%=YxBd$*=yP*A#gdlnq>XPx~l9S?S}mH7BRkPv4oJI zgY=Z+GRC3Q6h%yPjRyC8mrwea#g&`FR6Eo{2-H&d{F;`g`ramlOAHVX$C`A@jHH5Q z-<@V?SRf{ShKtYE0I`Hm1w7PwQDJD)9} z-&;KVY%!i!u!}V|f1fTjY*nD6qcb%dj$fZ1(xvL_2?Z!_JkS8#wNuIQ1{vvpAsqco zb4%Ipk!_@3nTXux09KHADir-|i4c&L`3_x{I&DLo9T>V%&xWF-R4O{ya?7fjQ4bFc zJ8KdMdWqHtA0Hkc)RYVh82F<-{Tk$ZAL`Q-;Boz}0t*(?CewbL*)tsImm_Pt`6az%$Bpro5x9imO=oBp zqxOmdw%emeovh96ewlcAjhQCPhx4q)vrv6)8_vw30MPd*=5=Y1$}lTV5j$)0Sg)T& zLdr6-<<0DBnV;a6sxtm2IYqk}KDE+y>HJy+MH*qTCq?4*0Z3D7xIwEik+B5M=F^C= zy;AQs%lyRVNMN-NR>kew!Iw*4b8%QN$m0rDo9t>U69|Xui>mt^JMB4@9&QXv98FgQ z;PJY`RaI1oNe;&7GCJFm3`(wlW`q#KpZI+r-d?O5&VEgXOsKd(o=t=sQbwW`9u*mS 
zXucA~GjjmxU|-#IUssSwSQLMtjSfQZB>Y*ugZaX(KV0KAk&6fZ3v>@rN6zuPAqQ@* z+O>|UTFS`-cbR?W1Mn-6FPIX zSDbWT3o6X+@=XWTT|nmo*S8hik5Z3U;5z!H+ecbU5RR6Jg^XAG_AM$&kDY^D z{Dqp^L%j!y&hLG|`*t6m&l%Y;q9d#BUpPqXs3*GQRp5$O32zl}Is-Ch= z#(EcP7J^hqwI1?p+8(4HU7>+{jlWJdW;+3FWHYo9j1k13Wuhg!PBVEX01X|o59I@6u1T~l3) ztz72tn2K|M5cG|?FoZIsiz z0DW%wtD^DuL*7cfK-gYy#A+uRuP5~u?*%Ooz8CV}&HT$b^z94P3Ks{JyX>0n2;vS^ zf(jyprzmwm#X6eKs^hX&(n;o>lM1)+`KVT+&S$cu9CS`%>!G#AdN!KCksv6*rJkg) z?y;5=vGUz2ecS>L4Z}?%8#T96zAivZ3Z$qm83ow}8lC59$UK8)(2vc)!Fct+9i8HW zGp}_1Wt&>F>8)z-1pJ9w&cQe#mx48OZ1{dl%%B(Lb;@aVwWuncN(spjYLnY%y#*E1Mh&b;``01)l({uO^8PGtlYc4 zjA)}@xyt5=EtP8oc5Q+j@Z2nGT+-FewT=>G_AgujW&(rcfQm=lDHgx2((V?=nROce*}J&U z&D*4w!hMS0{}_2hU2*@RKai)(N7BM@A-E{;`GeV)kfy4sNUd%$$w4&x zQQQ!SRdu0bvB${7F`0*Drld|hIdY|G-*2;(&oLT6L82d*I(|zldd4PO3Eu8OtC|=+ zgg_A+Ez)q7TITbrI!-%oite>dv^wj_PZF2Mfzf0YKh<}b=CWVmP5TO*?|Fl=tWWxH zChYlz02@^}8zLt`j%aY>@t?guomiN9`FoBX1w#V7&UKzul?k46W++z$D;x+7RZn*h zl@mHLJz6z5`@9plb}?C?m~k0C%e9LyuCd^GMLktBZ|i$ar&^nLbNXuHo1$~^_0Q;D zue88jnuHP|$IPRf3Htoju>wp29cjGB@v0s?#}4KoNwSJYFQUeH-Dveprt|Uy(d*Ug zlj}qc-b^2e)^{XtI9EbEcWb0}9V9lF#Qx1Kg11^A#Bh6Vang5P5OtIsLoJk)*=bvl z^HBep_)p%wO>2uAKzaTxGfB^0*7>-^2~?^%d|u3X^TRa%r~0Kn7oU<+s-2?gw% zxK^qSyHcBgku5)dimyvc@?$7(K7sjq?=-s!N;`9o#TLf|Vd6jH9wVFZs*nulxA?SF zS~74NK{{!#xG-1l&H>^3(bu}(wG=oOar(qzv||5c7RU2g*6i(kCvObE+GMcLipf{G zv%)`1YTVT5O5y{@V+N#n;{mMC;xrlIS+Kp(nohHK9pTWyWoR9~w5@W3?uT?%jTo4n0^E7YKq)5?;stIF7BL7%etU=u=J$VF|ia!*9G+(4@q{ zXF(XxBTdpuOZ_I9j|?3#gvtponr1FS3PJ4c)bjG-spVXR${_-0f5vVmTf2%03Czpa z$+=&Z2o)rR9?uu#ZbRzbx35*-83-SPK3i-kOR1tfc$xg=oiK=AofVo}fFzWVN5ghC zk3c8of@uQ%&L7qJ?KRq`H$SI7BoFCxkDc=pXul~|uAg=52Cx6w<@**b$ow+kMcp7?9a41O6W7?i z>~q%cx<7VCVY2Mn+yy{SlwI?=7E&OE&Yqv$711hotZ|eb2Ln$OYGp+9sHW#*#*me- zj{#UP{D3PAnC3IO#U&BFUd5-bpqU2JK0^)JHiC1x~87B0m`(IzT zg`0XaM~z(TIlp6A&hF`|-}9l%OFJ)Jvd?&^zbV^ZRen<&!lbriz|=(mv61nzjULf2 zZ#)#~?2-gw!(b|0SYqSihX2zIfT4Q%5*`q~@-}}3Y>QFf;;$cJIeOkBj{Eyx6?LbS zzfXhMO$6qV<@jWK9BWm8{+`!GJAHC7j6TRbEeIR392U3D-pX+qBn$;j*0Szr! 
zE;h4?)zk^9j`0eDsE+BP>pqrMGZEl504h!Sk9@33IYz@pkj0G2+4CtH)6a{-trF%v z(9LgFWKCNOpMlaM0O*dMVPFfUi`?x%*CExnfGrwShb4mScpq=4_^mx-V zxa>oJ&xXxJ2JNK+pP)s>@1EErQ40X$nDIS(U(j>W+kL5WwrNv|NdzLyji~dtRk!&UJck}OFC^#!7wL@=>eb@Q%2F$ z3bYxX?WfR>T}Q1{3$Q&aKsG}YDJ%EEcF z$zegiGxR0$TMp&j4R#k zUv9)HcBkwWUv|bJ={&FQxIBmlPGBA10n1CzvE*9=?h3@k#u8rB{enjZ?|pxM19~Gt zU8H}9A)p-rJ#x^5aI9}v0}uZmU&;$0HVC`^I}~%(b1AXJ34z`+g}r~ zTFvp#0M**l`kc)dHkt2m?&4>|RP*Ev2pxdU`Q8ri(NQFFO?EPOeOqk+LtUR(u#OTs zhO+fzPjwzFxY|gWW1!$*Y6?9RHcg*5+9R)9nFiHx0)qqaFXP9)Uv47`aU+u+a#r{m zrrr{3l+(fQ`|V$W01l(BI&JMKXVqMY80rwC1(Y(|A;W$Ao0J?gPuhIFe<;Zq^Fxv} zxX1E!NevYKLgOl<_M;*lNd_`c(dXf0gNwKH`BDuc0-0l<%Jbt!xWuMH>;!*%lsVn+9eM`K()B6O>$DWz|uA7m3E$^ zDylAB8~53NQhT44zbvG7+t2&jQSDULOxTqDP71=uWxms42y$dVu{+h6pE_OPPwH$e zZqTM_;-xNaPv?+!+O|I#fmAH(C_ExbN1Bx7vIz-LwTtlB3u57Il2ytjOeBy(0c1RO z1i!QUmlih4Z8513L`U&iK!cc~G$Fs}?!qOD*MR%C8U*y8435n{P0xl4komCM9#ov` zY2`PfSU9~0!Uq=o@?I8MYlkZp6m1g8KM5Qlug(Ti+`BRLlFh>`@D-~fl+H1mn+h1b z*(&l#G-x2p0grrtCFkfCxRL;6S)aHaQmqGe!-m`Oo0-KiaGtxq(KIkuptA!5E!q8= ze~YdV7#V(&6=^0*``|(5Jctu^cfTihA{BHNK5F^e+XGQJw>@AVu0Gcb-v$DiYQ#s@ zM;A}>*~362=4&3C{fZese~~UC?2z>Foc)HH(2?8n&cRBH)B^V*%@j>qfye^m#MN=-6msNrX z-=&ICE`S@@t7TR>R(&uzi|=p>U9X%uA5yxUnlIP#>Yn$(nQFe;Ex&H6mj!w3R}nsN zPdj$@@XIC!Vc}^YX08NTytfWQsNoOU@mw87h^nk69>etXEXVEDbCf!d+jwL&Q&+ck zJnG4ZgfYNsASaw@pIIQdw(>*A zxxMOSXAk9w80SE;eNv(E!!d}K=(74aPkPAhNe4h0zMj=B`>^>8KYrw*e@(L*5H+Xw zdSvI~G$AE>E5Cz7&E=txmoGBO-?tJ@Mpp(8Sg<~kcEtd}XZMReZU&iinsU7vnFL=B z*G8sek;C}S9~lkKIm|QW& z51Hoqu7ugQEI@{f$4DO8n0N8B6I&Uz=zEDz?3^Ve&SvI1774+%>(#ldvZ7neQbk%z zdkm%qn56livP)qLWfyyDUAz~({2usZ`KL*IreisIf2envOGT>Cwz49#N=K)_dLPXS z$g6a~MCwjQGqx7M)hT?Z=YM-mDi6N*u0X1pGD4{0t9{qMimZ+8lF|(q4mz%x4$MM%t3cQ?ATZdysql+E377!bJ-IdIQ6NYpCgeI);Z8XM z{rbLMdGl{Fj&jlO?byvE<*o`k2acJoJckT^i?bAm^EQ74u_MI{KAbzzJgmnv-2o)^ z1`Qqv5uB!4hwVMVL0_qG4lYgya|R-p`61gtX)T|)dL-^>{Wm5;ZCQ8jTx)2r5%N_RBqo5F$L4#%N29Wc$=*>7G zr%bsCy*T_jA(DU~@s5U%F?Ev1RH&j1tl9h86pydl;1MJVKY+KL{x*De-G5dodQjgR 
zC0CM&{jdv=L&Pe5GH(F`Vr#s7#4E2FzBG$>CNG~Xs~T|(lkyA`u6np|ZJyR}b)*dr z&U!$E6%)q~>A7SXkQ`mzpdkUQtSl?|T9w_KanGw%VcZ+>-t%2Y7=Lo{JFwbspr9kg zu($EyY#;~xEREwYMd3tEq95ht*9{tx1shSAHnXBdOc$hNpYI+9PyhD$9&R*yJbKpe z14dN(Bgo8{zMXteE9h+`wlsSQabt8XeVx;BOA#A$wPg>pR`ohxSS>zqP6KV{2g}S+ z*4RP^uCJ-VRNL(Ln(-I&-=t>m6pU#EJsYI3&rY4WexZTNQ`F*Vn@z*$aXFP)HbIc) zt!@Opoe1Jk6V<2rob&NI9VypDMl;Z7fn{0lJw%Nv1ZPY&id^+L$(^QQa$et{n`j_$;Ugb9CBZb~%3ZtYgXH&_yTe`1h3b#U5bA~*XiX6${K47U6GoZnf1?JRQI5FZ9a zhUqC~n)^NHA6k44p6lY32gV}Sr}ApMikRY%LT8aYK3ctVL#uzaC7*vfoBuiAWxFQv z2_PMQW4Dm|iHU{H>)Hw#TT+g=&hI=uDyT^zjnvZ#@3x>tr`TASG-1eTGgNFWRVl7V z&>1+8Fd5|SyVMzU>UTt(!KWWv<~b0_yzqG0+L%4n;`|D=vNd^^rd}<7W&BX&(!gy; zmCvVbS$gMDR66b!-8nMlsr!1wQQNJo?MenMzb#Us83K|i78MIoo$KBoZNwnUYCgxc z-Og4x&s)C$;92AN0(cX)LEpiKl*m_rNvYhK?Y37Z$XMo4bw-))Cf)d-O@cvhhn7O7 zOY&Q{%MFGn-ie|0SvDsJd%24>ad>5r>;WPr<@I2$)UJ8btJ;`?A&hXl8a~DlMqv0e6x%202Qs1_&=6jqS z_CKP_ldz|IWE$_)INmCcD`<*()_VJn!PzKDqbGCRhDr(U5i+R^<^d+YlIMB9nH!W* z%8eG?Yw`>lh3?83XQ9bSk^9tG2&z?;dvv6}va&?F7(0ro=F9`HGb17X+%!_;X@W}i z?w`goeZCXQtYozbKfCUW4%>VYyYMfN_Bm~Cb$$w~dh8`uyi#fLab&hU3?`G$CZOpB z64si#4Yf7f+KvPxxh8l!CL}(e-(=_9Wi}XHFo5rIunW`at4n{U^EaU9P5>cx(*=<) zM_9c&-e-E+PIXzS{eTrptBti8XB59Y-}aX3lGh+j80`SMMFb|jvxo>AhRv+LL2>#? 
zt%+pgV?B4~M{xsg@)Zv^9l~nFp=Jq$JIFEO!kf(?ZE_H@ZQw%Cm-zC`)!;vmtzoS1 zGAGda_pPCf)CSM@YBde!1>>JkUq3ZnKo4PyS$51z4&LnvE|2}8h^*o*?dNm9xjWNgx(8-A}%42*5uC?02o8lu;D)s4*eg zU&WEP7dCyxD%hCtJBy$XQRu zqh3T-X0>LaS~?6~|Awti7YuYb?RxkO^T8IbL5D{;T`{f0271?lWIt*P!~}<)p#Ig* z1L(3~gisJcHI%5GF%jlA^q63&{fo>G&#SjC)45K5MbjNu-lg^+N|G%b*a2eWUV;YV zCSGp)wQ&xccRSvFH-;E3yfG`fQ@>`>**wDhP@mYtdSu{u%Wh_M;9Q@x2;0)c4vcRQ z37;ft(#bS|WxNm+ZDkB2d~>iI+{qyT3|1KJaLR(^B#FS)yH|)^5Hk4lQTBIKCk*?g zUqgYQV!aSRp=t4xNnc${DER;sPOmN}ww{Ql+`v16Mb5P?NNu?9jb{=KL%7Yt3-WX) z)55)L(?**!hmAHJ#VGwwZbzpR-sDhpNC z;Kp8>DS6bMMYC1&bqFLjwh`Jny6-)FVynikW2D(ALBczp*O}>nXxGagn6JO0!Bu&; zxW{uz4ZrnGD#P92yhW6-ES|s^*jy|#?IfvFHIP2Vk{IYTwmBD< zEDvl!9C4o5qxch&n&!za62GTaev)!mL|P%jgJn+1r+U`?Qd;{}Rmu1zBbmGWlgiV9 zUIN*f ze=`4V?XZ5}Ls06DCq?m<`e>c(5na90M>f&aCmYX@gZ#aQq5AKD)W|dl7||Y}-bV3% zD<(>HFAC#LwT|3mtC_U%bVX!K)P(XNM50j$o!!a}C*HU_HuarGbsm*b&B(i)qi);y zvT@p=rdA&1Z_}zz9#4+d+Lx9*I>p=G!H01rNnf1Qx=LD(RF*D1rFr%8`hJSVBO1lG zv+c1_&bqh{>sYAGdT<}$N<4e^qAO@?WfN~P`m=;WDSM;6j1g1noU zaETCT6~?BjOvbk$jo1uq!J}=R zxfaYxsmU1b7f{4Jo=Ec;WK{1TZy>fN0)LpY%7{KEe06_OGnS>&&o6lC znPak)s9#SGctps*@cW6~LQ#h(dKMte4W7v0A|w_HiWwP)a?c%>)-E&U^|o>K2U45G zxl;yto$f1=E2qHKdUL`gokgs}D^~JsS25#8Lc6vlsSdYR()uzxU7fNXbmJ0nkYyT) z>cVEbB8Xa7jc#&@1u#p*Iz7K5_ljPQV;@5A4?t>8zXxVl;AtpYc*J^$tHIXli||mHbn3;k&w#J+uMgiOEq_+ zJ5qFqPwe=)(`zo-H**kHVdBv~c_Z(9s>Wg{aTt7v<&-%);|H;Em|n#x4qVtgZS?T< zjZYbNd_4*0I)>YwigVd#MFbo>pHe<9_ywqVv&sm5n(sMfg(}ULqH`V6e%QH>h$JnV zom8Cg3ei+%eRG@~UbjZCqAt;RO_j|L4=R3$aP`1o{FGkDmg@hOQL#gyfrJRh>nWC0 zyq@c1uRS6)PUPRa^!q_Y;iD*{Je3^f^f8f=!&q~{-M7?y`}_`6Ov<}iA#8d-NZql6 zmogGxqMgLV^tGj{q71BpGg_#g`W;C1>OF$UfNYWR+N)}Xm?POlK3F=LjB0-kpFPd>zpxI1%q&NK_PdKeN z(PFrm)d??+|M*6crcXyC`^sQ_C97?Zz_`*41wyB#l6t3$LwTlDb~2Ox)UTUlG?HW= zIoi8En-zR;SNQxiP8xiJl+%wWM*nH`rU_qOui^Y6Sm-^L>JiH$3#&*R)x7XyF@CqBcw6Wo#fG>6 zZD>K5fa^~RSjR@&`(_W1hqU+H4tvgVDDox3Hv3>U<8cg#>2Px|{Upq`OnPQ$qUO_&o3T zo%fvI^9N?n+Uu&dcFBqUvM-ydRPU7g_3_I$Z=1EQU%6dhQ6XMStKxwjC@#4|_D;WxZJL 
zms8bz$XBFlEXCBxfk`96#{jj*`7A+snQ%iB8HQGwg4LYftkVuUJ(g(;66Ww0G4|ga zKkTU1_w$==S)lX!pdH&4y2%;A3PY~z4nAU*v1OebNts&o*9_qo+A9|U)>hm-cV_LB)OaXJ-B zsEGj;vQ&&K!-QNGaiT5dk2@_U~jbt__=_TDL%atRsH5HD93JC^c; zwB4n^GNIl66=Z04zlYbteyZNo!`(Tn}5K1Y$TF{A6c3=afbwP#CNvGqeV1q zc6F4!^ul_BMMuPBovybc<$rA z5DDOn$UWdG@54|9b>xcja^R9>#Q(Q!fJPsOacbR|Y#+U%&KSb3&IZP~3}->%9salx zPqUjDNN>nE@3_n6?y6P5012vhm~0~B5Q@T}iVMfwZ#a64Te#Hj4!$w?`{Q%r9~1tXg+i59ar)t)t!G6BT!C)aWsnwA@@5w=Pgj{_(#SGt zc{EUl`m$f1QDXUu=T$dxx99HGU}KAsFIy(KoiDK0U zr4kJi0e5j6f}xL?FPN^rPIeb(#vk3qx9FatPH3X65l6Vzmz_d98dfnn<(K8zafxC% zgsKMt zhp5feC`$uecq4&27?VPrOSdn;KoE_!AR*3-nn*a&x-# zgx2%yy&RUuH4bq|PgylB>L34~c5;qZc=dT+LSL*B`6@C5#keNUoeNWVXKsWxrv|1s zc3{(SK1dPfzEYoV5s;8rb8h|a74BqJ+bcp%<6xnJcgtD-e!Yynbp~fBP5ofuk35pM z-g^=L3MC(0E_0(3?QgZE&T*ou)pF_s$;Vbnx(eJ;6W*sDM=7C8N?U9~N_0_jn_yP& zWZP`$0m$x~m?VuWfGtF?6%&b8$K@%8Ls#%>K095~<=_lP;K%vMk>qJa8)Q(qI~3$x z!j#K#b(;NtH|0IXt8!$EYK`@=*{BUGG96*MZzWh!_bJ`@ha*ArKqV#LzTXQJ@+twk zqnHr;4r=N3KLzWrjM+c}yW$PCA4_sG_mV={ik@*81%Q!(Hp6)!Zcf<3z-delqe~C^ zubT}c^hFp1Z*w+16jw?yw8;GUyv^$}5lJnRkuu*8CisofY0)?u8Q^Fq>ArXH` zyRDYV+V>I1wtBafIR}lIQi?Dkx%0+^O6Mj9Qc)ptp``g=)4MTMmBgvkC>(zEsaejv zSF+qKtG-0k`)MLxS9~l5Z|t;Hve7P_d~vW`{Nbj7i45o5An8OU>f1qcFuq%mrRtDE zhai;`R$-psWR7ahw$t&$h9#6_g^NcfJRuW?o;mltMvI)>M(5tiN+TBdZzThN7jCEr z=yo3QU%soB{`08&p}tyzZIYpgJ8YnqUqpN!$5ChZ%eHS_a_f-W$HRtA?r|bFkbu(< zaaY;C;YGz_w}?;VjvpUwiNg-H&xyEYtQ+c?WT$F%)|BxunN?jd!7LKtZ#_lD#3E`3 z^bmXud*k3K$keGe|Femv&@CKF^X6%Sf~^ zd*Fsg>vWUwC=&JSePg!kTw2HbL4h89F%Yv_sgm0O;}N_!@$<4{r8A)dHQmD@0^t>$ zz-Dc!MxSkye^!O`JS~NWUw4K9b?8q@Xh;jLKL3=)!Nm@}$hUEV;cfQioVQe=2MX!! 
zif@%Gtc&Zs?9RIJX}`41X$LE8+K~u;$^Yng9Vw?Fy*cRhx(wsiA66=cSO+(Ef2$VU z9q{?nwQpy=f(yT2bI<9!I}fYRDP+4|d>>~7Q+cny;%|4)b+vB@mA{8Ekc6A1nIX*k zk^-CC^klE@ul56Hm1NOx0kQR=->GLRO(?!c#HpamND|``a=1k(^aWa@KX1$(^`4@2|wt=PIg=z1yZG3_?ZY;xsERg-*pq2w?n;;K$0+ zEY(xnyowuexLadW5@3{SXZAuOW#q~JXQ$wXFq|FVrfYY{b z>@bc~6p48TyTd;L?59iCuVk`#XvKIc!nc6WDm@2Q1da-z`cBRgot?EUtmhRt6vohZ zHI}`?T^LZ3UdaZrcNO~17+|!srWWpi()UV_|3m-(lF?UiCa3}o&`fuRkP+(}2D!~+ zgB&q%Wn}($?(#81PQ8}%E_+f^A)>{F2z47?G2pr~UZ_5t#CHLK+F(=aOSkjrC=R%d z1Vwv~=kD<8kE;Fl785#c0N5Z+tKJ(6APTG&TRz@o;Cp!h`+}Xb+348>IpG_|<+ov~%am|FW9a98)`=R7rQ<2r zno~|C3S`#zmpBV+(LVJ55g;0TtnZ~Z)qJWf7!b=lV@H1qU#(rlD22)L51rM+vZ{cx613l zcq@0NXe9g_#oAT)%p+W^3iy~m;mA8VU{&QdDO-O)P0Mw=(XzWuNlwd$ori6P+nkjj=Y5 zkzf+K3PT{|$l@r|s^OX~habC_S+inU>cMclP0N$$P77mF6wTDpV%$LdRl$Fk+v4id z_3XrR>R{)Mavo!%9lBpj9h1V1gZIHH|K(fZO9TDDEi0N=`_$2UXLgDjpx%b?`Kl(4 zt&x}ouxr-OEFK?8irg0M8D>lAn=ezH$833`xuYtEFO)eTMQm)2asnmlBFr&;6#Mno zx{#o&a**S_n8jvZ23y7V-LJqqB)Tt4RxM+u9yDt#B#y5GY^EKSVdhbk09p?d%dE$H zsP)v{t%`9;)V=^+aihrN1B!;k!BT?lnd+m+^g08rh}*?7@4pln4RTW1DbE{kyq3F> zHUS=uk?|P~kjoU7bDCJ$)zzB^*vK(R$hV-6R^vrqT_8`RohK^!5{}6(WAbR8_Y&|X zqR~iNP%aL6Tzeh^<&GnoO>o=KNyrca~sF#iFYzwsY8G!%}dKch&@Y zgT$+L?>pZbY+@UeBFp)I+G)OInN9oyxb1G0fJ1v zSLE9>%l=L{SDQT8&aHZEwR8!zv0&FBZnUqCFf)GUJta3Q@syvvTCeuqSn}e`?GU|* zSJq`ebs!qRU99-yspg;Q?wJR&LvBKg@V?w=-c{M9;2~a16u|=MF%{Nhcs6M48 zB@Xo8gP0S4K^$Pl!YemxCzLZ_`elq$D3@SQn5kW~ivUKwrb_Zod8*x8TAqU*5XgwL zGsLI(oDK;%K8;r5B6P>KkGPgG;AJ>y-!gAdN3$lXeYEPEJE}7*en9>FHh5UWx_bbj zB*&D2GpO`dLrN$VkEes-2<_q&50fy%~Lo_nO8FSI}iHRpa%---K zs0a$Ek6r30=FJ+NLl#sEwV6M8yJ}z)7YYAtvQzddwjzM$1q&E^U>?m)v8aAxV=vH7 za2%TgJ3IGSD{5`NHtt0&7KyOj8b3l_zBIcF9!vhz>Ks*1@?t&OZV^+J(uX(run!eC zrk#BKh9(rzQEcK}2sL~D&tI2y9+O(^@<=vsj@5h@I2nLP4b-?j&V$rrQfd4>)Je0p zZ*9I)m)PT$3d-{`3xsfzfYDxmRibNC2n(F(D98Ey_C+wF$;`ND_m0|C&=H`9LN(tH z&o3lU?SHe&+Z(wE3E~_i@z?A{U%?_*k0cz4CH(3w-)NA&xoO*eWa9lA!E&WcVc9Zhur@99jc=As3Q4JbK>n-wI_ z)X#ab_XYM~`9iXR9EQi;j4SlwQ=H7*3$j{31q#@KSF~qQAW{#LaW}b@>+7yvJDc>` 
zTY1>g#+VZyRFuMU5Y~kI45sw>jn>p5fmR|xdemHcxbVbPzmNL^1&v2xlUI+f4Ys(A z`&p!X1~7z$x$g(D!9DeFKNRvjB99=Z%0=G3EUm5mHFF1L&G5jDy2qKqZ$ULHRhuR z&q#MjL4m@jL}r+9vypG!ru8Y$rCptkWoW?EC!1%3P)e_fT&D-gG&{dGO?uuzgK9>i zZ9c=#Wc%qajKfbeK_qOePdf9ZcU^v6Nny=7D04DmS|oe04knM`w(ZqV<>j)zS(1tE47|NR#f0&+A9XFYg@kZe(nXpCm3GI> zact(@V@mk|gixZ@-Ih%qU5gR(nNJWVpd{u6}+e{&2?RUy)XG9VDDcUf|m2oA? zWAolu@It)1shf-+N)lTgg4KgZLW<6dLCctRaG8R_XR{(g0H{%pKdO(GX-&^Tku7hrOTzYX8fHF1>Y;z-mY__%MHa5;?mBz01aPPx^dl zA{mM*%x5Je#wnOz_Ge_9WH#9DJIZ~+!1%8M;*J-~omGh{{ zkm~|wc>w&{ZX;v*VjVna1(PA@n#LmsB3dHy^5wnKD<5RKvg3N)kdLz3AJ^-|W6y31 zJ_AegDFzXtO=q@~kEa!4Y;~-m@vJBO*0u;dar0Y3gk*HeCom|7-1DDbkmGaOz8*&A zr&mV)eNBKUB-nWA-oF%*ZIbX4vX~3W!iOoXE$h@ueYVCQNPQe%}eZ!_8t3Z<=nzuv5vc? z(r?K%?|{$4eDD*G(WOJtoUa>lo`oc&6!EEA*ADoT?HXru>7CU>j7w0nwEAi7&*?(? zt(v`5LPf3ZG@K2FziR2S{O<#6v9j(Xwg53eF`X~XJ$xpq$B|#j_nU8!=%ATtEYV2+ zuq9f!WP~AIw`2*$d7R^uxuLo0He&Czg$K^!DF=EuBJr>FXHedKjzTE>5S3FO^(U9g zW8oAw%E|V+BB@ugeSxyqf>Gj7^!_qSoEr9{c0hb^h<`hH&zI@zb9lrguC%*k$6;ea zIsK~(0TIlFu0ZrcbTcad=i=qp5j}toIYX~c3us2?6|u~YHf|5+bFiJQt_@t}a?jT` zXb6Oi(d4syq!!@l;U0ObEc1>tA^p%?Lc5hT*hM8~V#0^lk-ES-)tC97>u9fFZmQ)! 
z7k}PGT>=!CisKdM3_0mXX8jw!H}}rSlSx{6!m2n+)z>WLcQh?o_&DfS9?pzMb+9NM zFK5PXZFW-Dh?nNXT_=7oMEq23xeB-EM6=$NTlKz6Ku0P=n+c)Wxw?MT4OF&J5*3MO z1aSM!`J5>v(Kmc$1StXP3*?z<7M7EyRQPP`s9w72z=cO2 zhlhjvBo&{PhnoXt-WTQ19qr|o$p2DVKzcy%y||GKRh7cH3_zEryCA9YRWB3zKArKe z_E!-({?Pi5FN>LF6B_DFygZZ8A$^uVm0p&RSScj$9b|11sk5^hdcG}_yI8@8^2=HD zB5f?1KB2nPYy{G7y`17ib%8PIv~1aIF#%c7ou!5DPMgxjof_G(URj(mj!mu&c0jC$BosLHe zPOCtcOC*QBqqflGREh~g|8l_=G`|l$u^2B)qBX>3U2-q4^dvk{^=CLZY8|TWLnC8$ zJ6?&>D;{EWyWOhXqmKPa@l2uLr3{UPdcfyRC3Xxpnec|mM>~PtxQDqy@|PjJM1H#w z>}(zZKowmtq2rX-U0Ss=L#@*BT*D;bE!>{yJqmiK6K>1`0WVsG_)@NQH=gX70v%xh zt%l2heBN>*5Ih2B$V?~ zFJSkW2|T$hagTywd{IVGs#0=E@T>99`kp+kLb@g@Q$&&HyApBR zH|L-Gf{zTNyI?gk@&LQ;%_7rLjP!wSNnI z&&RY=*-F08^UnDHsf}EOJ|d3>>>(7du@SJV?R!D;4wV?kL4%K&u8-V?J`Uz+W?+#+ zv>BV`Xobx7d}K)-*q|bvWDHSaqtdzP<>JwX(yR)^CSSIh$ONuwSuY1C-Zo7`!sVX@ zPwI-h)EC>v`%|tQA6M1Yt40Ovl*I;;4s_j{FE*~a0^+FikVoH>j-bCZj;q;C#1v@G6434H2 z1^;2-x?)dHW9PeO_Tbc^PNh=>pW_n(Ytu1`4@)9FhHLPdC@?YHzrV(rjA>S7--cpSN|5GtYFhpg&Fjk&MCX>wCzJ=6; zpD+w<+BEGrfrhj($guyx{OoImz%)Zm>naaPh&OUAUf@_|oGaTf+P;>rS1R6GizFm?#e7vg$Tx+V@;6MVu7 zm(|FcDp9I|(>oqJQjqXK(N`?x%18jeDaS}G7+%gTom&T?y=>|aW% z7L3RhFgOu{W<|^qczuVj(8gO1@e>~P2Enbk;?7iG-8w*dP1DG2#lo3S&hM~#ynkP* z(CZtgimeuuMULa<*3<7~7w7JWijH4i8}wQi1`ZxphP7Ma2OA3W>vtU2#b@#JOez7O zp~!5D=ozOe8*9p-k#yw2;>-uTUsR;@jK&o{DW{PPBL3jWeKt2fO4{Xu1So|*KZGG% zmCc>$vVbX#n}nVmPk3;O*#ddCzV68v#YMthL}GR{XbRRC6Zgk=zUf2Q zlrH05Ke3C&sbyUYfX#{ed#e4~Pru#ZoVKkNCNRRL`;p^mg@Hjrjan&wS@J}C^fEL! 
zD$tTRUUQ<=5&zsqMK?*_6^y%u&GL?rR1Gp!b_2PRW=MMQ-43@S-8u$b#t*jJB%;S^ zQQ?HjaRf61<>pd(6I(GNN~i25VU;;IU&j`*hslfE|H$oi7A#xw%G`;(@38LawZ9xa z+!L3^B&2v7>JW|l6OabHN~P=#hIQ>B(WPwUq+FN#nxRVRocc=;io^0%mrSqMRmeZr zQzFe<8N}nCY9rx$l#@m*3&JRS(~v99N+KS1yYoGl=yF-jBSLwKKrtvw5RI+Z-mX;1oMgcDEe;9=X&a)HzsHp1pz7f@T(3l(r*kK|I`q zK3Hrh7LG2@2-qFK07!zf9JzpeKhPkQ!CbVjveCRrXpVvbAh2RdH3Ve`3qV1k{*Ax5sr3AR1*dn~=Hh_Ga;qY~`C47kNUp;Ak&)a%gDQa|}xDcY8!~ zBFRgmXoD_{)YeVN#akoRlq45x&17;V%ZJEV$e>-|Fg#QM&gdk z@z$iEgJx8}NsJ?P*j(Xh2RNsQ5zrdoa~Ej^Qw`l|;T`~5kusq6HtkD5Jh;QF+&}5a zpbJHkL03*66N<$t?xrn;zTAWGdGfrYUy5KlsRCvKqaScMyEr%xy@X`T`1n{4(Q53k z{+8B(u+s}|z&-7JHQ@hW(i{o!vH3e<_TRRrr;iVk^tz^@RYkwHyEv7bLja#s1`o5g zQHYv6vF;yLH1A-0E#{SBAJBZ5H!#YE4x~)tiHw|NNWYoTvc~eY>Gq#|`nGu*eC6(c zMbsvjaa%bv%~1?6VeQX-$-FJH_!*51KSNlVf!vz*`^vzaXp_)XZIpxI6hAqY!7%yt z%d)tn-h16;rKLm>THMfDIQ|G|;mZik@d#f0pj`g03+ zIE^scLf$G*_i}_n22=3Z=i8e`8)aq@+wYD;j68o;tWHo8Pjpxqwv^;26`Gs2AkxfN zC#{qZ6*?d4-}~MdO4?WY|6wMFI@=k_ET#k?02vjKpLe94H^=6YO@ z^%Bv05@c?vIl>4M(9coX6ArQ1h@H7Gc5w{lrv*m*H*vshHuB6yiVn>|~KeCwK6Iq|Nr;9Cc-4ZEk`;ru1? 
zTN2wxTG2t5rPdA?73@DLgvH?yV`gmz32-D!cxuMPF}>#g5C{AS2)D?%H2XlMni?T@ zetO0*VDN>UYqOuiQYFK(IhnWdm_I)>V%t8hdY6~$^Nb*qNO!DCtEch=C1Sv7LK%-9 z1#dsn%qzztP2SSO=ur=H%(sxQvgTZqHnV*%RKLQ373>nU|Aed6*P9Bs1S-0 zp%HWXVA2nG0Y&1uuoHrAIJ~9|L!o{C2?HinorrJpjbp)}n%HaWXaORMjX^Lm`b&ln z$-^IBDld+x5$B;v+}~V#TTXloIsJ;>x4d|L-y<DuebAM){@tji{9=|p)y zYqv6qM$-9Nz7P9Dz4DpX`^a9x-Zn$orAD zhQvxf`JxXD41+r!;F$Y8lj&vAC`n$CuToW&>9gI;DNm=>N|uw3JV|-MwtBk@9M(&y zsd#SjJm&KOS^Db{Js-boH-7?*&5Hy$d~{=z*kB1Po=C=f|6mr5?vgY#G`SS zMNA<3O)Oq*q4wW<97%{Z@R<3i({Xv_U){Kmuj9}lOtZTi71#7njLS}0o#@4X*;jw8 zLsV9K7h5$Jt1IW=Yj7!a6qOzD!xz~c^mVx3tK>9PRC;|KhxIUkQ3xG3mI2xW_ z7uRrf1Eeai>s_L!7HeNn-TFPHDd!ZSLy!=5srHvWlY;!RW*`?TfJ&Y!KZUhb_hJ_U zqt;iNe4@h-Q#z4=`TK^G3RX?!Xyx6+Urc{(OF|@r&PV!A;nxUoAqj!TySJ68P)|6^( z;2AT}e!Kf!W(}@s99hIRig0KzyN3D&;G>8m`fD`MIPE#A+E;F5nuk!_?b9;P{oc6@ zYCRyM#=bwCs!6`$myhM{$eqNA#EWSk`jUn4k{s$USqYv*U9WJV)ej{vFzf%<^*2k2 ztMM1j(Mhc5xVRVD5psHAf@p20H>1vrXJaL4C)tuwx*&`9X$#|W=C&~p|2Ntvc|oB(`^Mfe3aTDGG_0O);B3TFFjZyP^f ziNkmXbvH(Y9r(f7%S4zsnE-V7wT)n?vU_;rvl|xhnN}sp1YE+FkS2EGvbJW=kJ?|m z`Ky;Xo>LyPF-!+O51lpi1%sAFYcoC$-+oAk*6<&R@)f7di-Jb>lG43)W+iY`5+$Zu z760W=o04TC-#Q)Z?b*s3iicz-j-JSecg^w=f?oG|G0P+ZU-B^mmAntPGYSLtpRT#U6-`kR=qgA`_}Z1+Lz zH!9k06wI!d@Dh2=M){ygzIY?2xW%VqpqXWBBdd%&3Ja8L5o{QTx$|(%Tmn_x1E({W zUgwP_U!7WuCbbHxjW5wbqvpGhXb07%DQG=Wc#C6O{iU#F0!j2{h4`J*aW&rOQ97!H zJ)#A+fTZ8oDE~Q!Y1}!GL#VSeYAlij9v_y)tGkg80tpX$`*}BCqmT`o!EDY(dI{N$ zOYV%1lmlnT-BXG0hzq_Kw_>O=-b%Kt))^XH^cuQ+g>%%sw8(aX9yVGoBB(riJe@Y8 z41}9f+=%`U}W0EP0z^`}YKq~oawpY$5 zu*frm%5M_bCQJSD*LHCvG~zNLy2Uop~-g3QbcNmH7Eo7fLYJ$*=x!}y)|;7HhWHXNwK#@R{4eP3Uf zWP1|Oer>($TGe*t-84tqzKLA;z`z)I0O-y|p){R_?;ThHmuyBYUgHB|3Q>JuP-f?b zRxq?+zGF7)CmxP~1W}b9siL+3nfDCNmf;cp(ZA`lttiQ5j1ljeHS#Sk|0-ueJruVRM zJBNjcwO4^5=*yU?GsBX5litL~qy>bE;(=zcus|r2oCw=IgZqYz1hvV;V=daW}LR{$6&wJ7zyG;+-BNQX)nPp>~>=y*Kr` zz1YM0SbM)$I~yRcN7;Y8>?7iOEHKMExrbrh)nKC^Aj?~0j;5EWRDRfZGW%&SRYjTd zgTUuAr^`0GXl!AJuQVgn``@mg#f7M>Sy?K#=%Oy7PN`iIO5*Vro>FbH9#?cR7w@j$(K&v+>TS2ENbIEGtPHv-mT9(*r!sbkkPFLu5gVB1 
z&oZ_|C7FNR11KNLN?k~{r=8F?=~T0qf(@oJf%|cT z$x+CzHq3323rHhtY9N6W29-Tc3c~;=bCYmU0T1$Q-hCdQWpQFt96)Ke%I5zU_PluK z3yh1fVZZ&PA*~uM@v-!|8&-uHEQ}1NHsSus*2~Lk*)2=fS}g{iuo<_`WMwf5_+zXE z_q0W0ak^XnaVGC$`O<~T^|^MAVB#E~r%^fId11*^o5+_n?I-6OK{adc;8L0a9E1!| zRW}g(P7Xu`A7T)1=~CIPe#vGOmqcFP+k^D61s$8&crq3O=}UwM5tzI0$b}yYx^Ku? z=&*jf-JCOhWJ#r}38ZJ*c;>2^z$z-{WRI=$YM2S}m!awz1H&+GGySs0Un(lN;975? z?Y9xK+LKZ9#5C~wt!K%J59*UZ5{k4rw5kFJl(&HeXR01Y5F-X9Qo7g}0fmAfea;Vckk85j|_hY7f+fEe4D))>C zaQ8h>&EvC%_`ReKx-*8aBvD-{z1w*bcr~6`FmDvk(RON?{g+(Zt2{SPdAwI2`&B{p zZ&t&_#%xhSkmLb##NHT=(^^OoNUVO8fXG*#Q*{&)Bsf@XhQOU|G@V(iT9}cNF7UhN zwM5~c(GeoS)ILkdoY2TGxlkRe8EB_8Q%dF=Q9A!u3Bq*)DlV|DA5M56Zzg~Y0LSif z^Y^N1{?D3B=d|&edwr?Nu0ZBhte?2nj7-e7=8xs1ceDCU6io%`{x>0r?vZCXcJIq& zKxVKcE+G$tl+5mD1honw>vWYT9LgITEu$t!XAutS* zwkc{lSpk?@5(?{Y=KK!nDPR~B#(;sFWe}^oFl3wVR0U)aRgXQLp(GM%4*9Yu}@=mO6*(_@3d8Bb!BJL zNBFJ%_VY={`N%y&JR17hCB67O%l3; z`ZR`A)~X;F?YCfUVUkK~70B6o`hMvh3wf|^BbGnvmf!b<*`anweSskpU1uMp za|k9a@IAz3v9+Ew5`!7(PAFtOB2x_YDkfROkxjVa#Mv^y7}S1BjAxl`^ytRi->zg0 zNtvxje*}-Yew#4HuDgmKBawwMx%ei)z24Quv?3ajY@1=E`;*J2;MQoZ@$0g>aS1&xcU`hNF>MKRlZQv*>^qIDv zonL#d=h9hL|3Ocpzc|ZKD0V1Off;1mmQDWw$Tmp|7rcz%fGd(xt_|;InEFo^KpB-@ zpVGfL4FycJEyW?Azr`tM(mzz73YF@W;waq3WGtcS4aH&aM)Q#5Y}4xRChgo-t?PP1 zkY^M~kwoX;mu83~{NXj=pYG`{z6VcIbxLP#Z&yN+IZN`NNx-r9*j#8(kLq zSRuV8p|d_FB?&w>bRI1~=J@8pDn!Y*VG1#hNkpINucSt7v=-D4;`^~wHAl598I#~3 zn{T8M8wwu15=?54V#8 zF-yslv0X0Ut;~_Wga+Hi0HUu-_LHXZ9;h-g$fs z>?FAHJ_{O7oh))?-A};ig;0A)g_6j*5yrephrBy+E|V`@rFG?&>QF482Pl0DJ%lDV zj8bkwfU&>A48mwFS!~yGUFy=AJr45*Rn%YslS7vXRtv@OvXq{Y)L|c`Ymoz#pR;-T zf&>CZsrGF^+W6eHRoyvT@AV_@w$Ty15&)&o30I@nd*2vb;EHR4!>2Ph`%{8uUe{rc zNauCYAC3^{DKe=+Z{Jb{d=fK;|pnlj|f@PfsTVNkMZjPeMiLa zoi+#kb~X?diIeq5A5q3PC=8nsW+R4@_VX5OT-10-(C{;aK4!13& z1695vG;3>Tef7FXzf`=1Z%}J-Fo6k2{-GX~U5mp=l=X(mEN(aY!;L@_9~P3+;LEch zs@mR}=a9*G5gp&BEAqP@r=cz_&W{MO$_+-wdAfePQw3TdgOSa$F&%)3&?dpN@UGf! 
z2n3>772;36>?NbT&ayFCPZ(8aKGqSXcMW&@_tdx0RbRA)PX(qBq_HqLV&JXXzf%Q=q+MbjNcKz zC*!OIffTa1ieqN_hck6MuA%{Xpx~g9{ac$eSuq&>xm=d5p_LioYY#M3=6>wMbe*LF$g({}$yRhnnXfhgv;J z7jzV`%hZ;nYTpHF{Z$n3@uI_Q>JSGOBcuy3l$2Zb8HkZ!gs2bHhsCGqO$y23C-xq} zd(n;@b53v|wUWdUTZ!F$DCIhauy{8x8#BL5`N8oe>$>^=5(3at1v_Zjsy#*AgTMLw z`Gd2?V)sNiXVj*+!#0&K|2fz@#q?MIqz|3qJff~+(Fi;gK{gSHc09l@rwxV9LNb8S zH(6i=ZYl_BWV$VSzh5BX^_JLw&R(F3bNXmr4;Xg3LDl^36k?YkbajO~DL!W5<8JatVd$U4 ze>591=6=`lRz1k2Ecv$KQPfV5nsV6nxHAvO{#Spz!GY2dJ@st&jeJ+SDv zV)|K$%1-3oF4ecZG#!Z^s6yC5z1D#idXEe+@${(b|A_`i zQgl)5_mc#<@FSUukfFavfS{i1KtpXo0z-?tH>}Z)+OB9&z_iN8y_`DEjQ=1FOQ%&T=7^5QgcKuf4`f#J{!AR6>KQ54p%G$Gf>OL6fJ}T5Fx63D93Zfu zzMSY7W}wYC&3o{sg!ypD!Xq1%fJjgrdhbfB;^QD)TrvF(SaB`8%k%!z((&F!16`WqAFY-P} zM1X-?j+#l^nV#L;uk9z0$;uP+1bE}xVmwt_Tg>I#@jGuvLqZkaNiDBXhmNwgm=CPf z4C`&M4zY|WoGr}LW$|JI3?ug8DGfTBPPnt(Eba7|nQQzpd#Oc9MZrt^-Eq+);V-V$A?{feH5hrzM z_ti%Uxq%=-IFL}AxKRA1yUIc&=~q|XtMx-vehl!Z(U=?1l;(^X)CL;vJhou2i*7nV z%)|HgoZd54eCm4(v#v|{7yR1gV@&_}Dl0Y5<+RI@4t=-jkzYCmlksv~BIcw1rmG&m zAA!E}!2tx4Uh`T7uifv_`W)L83m?}7RRjVf17PBlz#+P?e_o4Pfcl<#rJi!I<+j;P1-qKJH_9#*x&MlQ4YyN~^8$x}hRqzWKU8OK%qtHn>2 zWS`}!ZKGo(-jo%D@xN|mBi_|~JR_1%<3;q2Bgde+CC^Muegp&{??0y@E1gfmb}L^< z(Nii`J;*aQgIFXXRwG^&U-KY-a2{~Mngg?0JxAEHjj~$%OD)MHwaP70yA6BQ=`Kf4 zIx_EMc6s*zAYEgkprFM@Jv%e4UR6M&usTJrvO&;$JAlMHnDd=T7RW4PQeT*Ot#$m= z#_6Mw)LwS|gXHe4Kfv*@v-SmuCmq`Jy1o9TXQ=pvP&b2Mc6hL!5O7-~Qfm%=JHEKa zX?a23e*iIz7z>=r%$A|0Zy>+{iZM(VP_bNnZZAS^YN9YGdNaqy#%d(a)Hk3cu>Y8a z0-ICeCeJ_@Kb~k33=;Txs*?L3w}J!V6qW7n!_&!&M$ABARdHdb=v6kmMO4SchseIr zu%cX&ZPS3FU?aQd{Rl3_wY?~m)J{zyv^@5k^c)9o`<$Z4T}!t;9lsy&+B20eIz*QG zLtQq4rm}(7C(%-CxQia3Ai^k8Eswol+Xw6z*?at+kT869ptNqyGFebu2}0|Pw1TNGHBfW5^w0^ zy~M&%1Q`DLKSL29taT%uI9tO#%y|q)a>^J-GFK!mWjOa!o&^>Djj!MAZoe(Yfq&)x zW%T$+Jft(8VLJph{CQ;wdlD6Y$jS+@xCLwF$Sg$e+Zr$o6=1bM3^$A9u*4{`f)b!Y z(oZzI49r|%@($`FdQ+z3p0DhFNGV)`v#rpfK4IWxu^>hnNjub?e`LP`DF449`<)R* z)uPFXVMPQ?NfDq0`vbft_uoYT+lrB8Q3R!b!{Fpd0CEna-K4s&!3@A@^N8LrqJhZ{ 
zq31>qMw612-Fuz_I{qPn&1K7o)lRE|;71}X7!H@KziHOwV6rBG4%G$js`KZ{(L6u>&)y@4Bi6)uZxNDZ3H$9ULuiyVl5hc z0)k$I_sIerTm31iHRd-#4kiylZ02%iGO%y@i&KYpusS|j!&}vTmL_cdfm*0|`$Y6r zc(O%Dsp-!c4>CL|d1;&5O;Ev*Kv(%@@IXpt{clTYd;#L_F^&ciLLcfgGhh(}^^GKG zO6Q;Pr1vO?jVFSWEo=kSUu|YlmUnm?=@m!O{Oty7v$t90~`ivI94C7r!A-d%_| zUN%EQu--jpOmnNditvVsuF~?OugC9@wKQwWvRNZms*cH8bA`bSr;DS3a(_0$MBSS6 zoY_Ov1x@)g{R;ffT2x~RQmSrrJXmkZYNw!n6=(3`?kmzkBLc+5U&7`Mp5^np5@SQc zV%IlU(Wh;X*Q<^3%r?rCNTcCt)ky^-0_e;Q0# zm#c&{Q1AbQ40xDERZB)G0=A@o;spFZ6;2B{hR<^Pz4P-JwQ<>By|ovk>jVj#v)fKC z;T?OVvFh7?zP>{(8kUHgW)Icmf>eJ|QX*I~8qxkq^J;81U zy6@ok_rIU}a36e8AI_OGv-g^{)?QPtyxx!$bG#!#8&7Gp*YE6BjVwV_=#BJ-PVZep zT}-iLtO3mGYZJfcxFnq?z8oF5j7eaVWHVYk!>OS4nz5^a$*A*y=l3afF-2O+IwAep z;&T_4<*kj`3acb_>18rdys}#m_gOiKHO#XEp^%a<=8%3*P3l~*N*>IfGd9`uvb^_EDF@Dlj# z@~|2?>BuG%k%_QZXy}IMX=ruez|WjOYOvcQs(MiK-xO$rezp9>H(IH%w3RhHxt+U6 z!?EN)Hz9U~%u|zwW%T5>v)V}dj(ZQcl zze$p0fOJB$y!%SE+7adUwj|JV#Cs;BY<1nuqOJ_S{XvA|j%7gG?O3p&^XnYpMbg+s zHrdToPy~`19;Yp@!qgTqCKWF;u9_u%(tr#*{U$0Eo;b;$_d-ME!rGkfZfFbPt)A;V zH+>AnQ(u;Ll`Hru^5f2L=%isQtV&wxEAtN`>G&6lO zdLjl7YZ~hL{`Bv(&R+u`5nfDEQLa9I{{Xb8{SEdOc`5gbxz9=VG~G#D3)P~udo$&^ zOLMD@tiVel5@ZnZN6UY3(Trk^Pz#Ad1LqyT1PQ4&+}A)H*}8T~n*9$~!H0=@qk=!l^=N_o%qrFYZ~(NJI73$@v| zIX5-W@5Bg2*;B)bz1%26Dr!_&q{9yK!$aT_fh^qAP%U}mIKZ(=ZZoMU4)ou=$AkGw z>FR9QwQMUEu`>+auNJkxWf~mT+UDUe>7oGZl^FClFULe0D}H^A+)KPXy;V zNbz}c$@MxB88Y?Lt&sFPWb&p{Zqt3*cc$hg+IIN4(&+R=Id7717F%m5rZ9FVp?ZMw zm@WsY9(DkFskR$G9v*vPibQVYyGbyRq7CA|rn)X{biR1H(LK!Bc5yr!I_B4*RL(~8 zyJY}%S!mt-$c1YLu(^uB7RL5hck*fIl;p{NquWKhX)8BL6bJF>FujCFarpba@f6K( z3(Fm-)SZvPE#i0B;r-dA@m^O!%E#1F#!LNx% zWd3Wnvq3{nmYwmx5x+=U%k=+^_cq*?AJz50dU3N1gGDW!-rT_+l^wS`MV05|`uU@0Vw{n3W^s_|hP3}vrPhWsIE3c5* zeq+5I0p~!Y-|{>0y|8Y0hzMrbnx(Wjs%o1|nqRyR2!Oi2{KYv{y={lEA@UnWdiPRr zY-cY@(v{O+$=^XLQ^wFJspyC*g@UJ`_KHr1+eZp3>)33ZGJvHURuIitkz+h@lg~A&yxsVBzp31^`D1EjpS9tYBpWg1C;JB`J5h6^jaT7 zfn_#V&0a2nj+yf57U{4}_VfZLkwu)$w$^~$>LV%5>$k0HbpFQOAqTkpUy(gHCgNMP_D*N>SI^_IE4QOn8)TAujekY7{LSr{f7WN+JZWd| 
zwc~wPUquay=RlkMjygQ_iBG3)+dWGL?O05q?Ehg*(Q{ozFg5!B%hDk$G4)q)V=;Uz zU9UYe|4-9ZdJVnte&3S`S#+{Gog77ZrZ6#}`0JQNGKJ5I7<**)!sq3-2O+KSXft_y zZk4B9xorG*pV}ti0C-(zs);*>3^-j*1!~>pmOuOZpH=Q$=k%h$atp4N9`p$(`=fYr zw$Cl6@*wLlRig{$S^CwD_YeB+c74r0@A9WgWpt%~7>y$D0iS26aB>dGtx%%+MpsV* z?E)1pGzwVf;=>=-vf6BGafXrJ=CZ{a4r6+y!#h0v5iklHuJuQ8tb2`KdbqlX>bO#l zifINv7en20l(R#`bur76|2c$w7Qf~aGb*h{h09m+t<0Z;Wk}=8?ewN37ak*&a z(^lH(zic};pI~Ml^b3>zU~quWk5t#6gTV^FJo|P}WvW6wbLv9S_w6L;s2Z68i5*-H z(?t!p|A}}M$!PN)TieT>a#Z{e2rIC9*)zdFp@08ddpj1FIdr+jLqQ?4bp~aO-~BrN zNU(GD9OwMc#D~MD_N$XT+Ei7NLT8KJteN=^mUXDrs@ejJ+k4Uc(jXgz!sy8&HRfV9 z?t+C_vz696p@Pw+e90IC%fRuqpvirKmQPTh>Iul^H5)zesm0&fCw#>~yHM=0@D`%y z44d)p0{7B8D_|rL^Fq<1I%XVdz+nm{yqj^EY~pFo`4fxUR&kPku`@t_OJ>LiBMl4p zLgyD1brV(Ae2TbY5b{QZ?wrcRn`9Nm;N!L#1br0o22f^e~0cy94kJ!3mwn%zm zi)yigsr==_xK+>@52AfGF^WbR4Eml@tUGL)g_9!*MHW{slsqlKJI1%ekI^%R?xl zbtj9I#s@Qw+J#MwZI~H5e#!R8xzQfj_75CPHuO;;JA8O)$=-tDuWGcZLf!r}i8Iu8 z#-P#mi+ZYN(os0PjY7LxuHqiqU4Te;CVyWi&PJ2+=-e-+7KIqMzErp4`N038V*B<* zB_^B-|Ch6#n;``g6vV%0+1Y@WX2z4?q{Fz#9dLCWPav}Tagx8z>JWu5(msvd=6N6j08#DLlteguE`IyqZx2_$ZJ0vKcffBp;XA zE{NRg7_Eus!aPTB<*^8r>r0parzdSOAaCsJz>ti+L@|*S zPW@lL1i8W~Y;5PtsCf}!>SV0tc$WtfZ&gya#1FQV#l#s5o&M9(h|G|?-o>(i{y#53 z@8J)CpAR3G9I-bCYeOJEZ2YGBJ%i4?C6V&ol|eiNV&(IENDa+SgW?-_jNps11N(B+ zE7>WO4}sg{Z|pAf*ox4R)~nPyxa&YbB|@Kv7j`b)rq8qY`%^me>ATzYHNj2?MNQyX z8T^_kh(RU_agz9@s`O2L$ix8!&EzMH$E3`NL9X**ud#^0F<@zHo@^I=iuLyUi(KtA z;s-IutY^FbUKA9K#YT~xH9bYIpgvB$WyOEfANv-Kw?Fan6>t52i34lBJ!D49Kq}=c zu8OG7#yH77ZPcg;XyszTmH%JgI z)wsmo(0K=Va-9!v zjhmb|1?oj)1L5y|qcw)Dgs)G_ySZR?QmVbjGHY!2xka)&Wq(oigU63jFGaOD>gw$T z6&WgSoI%{&R%{~%zH-l`mZ_YCC;!A&37JphEP6z!=kJd|@on~kKQx=5$tG$Jhy_7U zZk5^gV%5?9&bNy;7v=WVcN#%Cs2@rFWP%!pBF$lHY#QGH7rtpOG^+IISNyT@&L8XY zMQx#Qy6Ze1Mb@Xs=@O%@o)2g)yY0crxMIk!KW*e!0UK5z;+j}WEi<{IbGE_H16&3@ z?@45*z=!C+lF4Uvsqw3R%ZSO4I(bKo@Y7jqIf$SWPgOJ>KPKGIAandt#}4G&ZEn>UZX|Z zMD{$gi1t+d1U3 zZa=m`vu~eMNf=z1iWws|Vd?B5;_KZZvCosAI>Is7KqzjxrMi;N1Zz&Foz@#DJ`FO;~40SBx9w7K!~`rukdTH-sdL 
zO5^W-z1aj4zG!rd4I+xti1;7)e&ug&b<{QawltIZm|vr!wX3d;;}zo9(`qwKLn}p= zk?jG4z%?DIC&*qlH1=IP@Lq)ohw?X6Ry%(Y>dl;do_r<{h3XOac{f6}cc?p^(Khyd zk(nCp{oAGT(`Au~Nb+`r0@1$m-DsGwUSgJlAJ$s!6R(mtK+0I&SOu^~j}zlOPM|RM zh7$%0OOG<^oy)108Nse$4jDNcCwo)$$_0+SQo1s+K-a<|dEyfXorK-!q5m zn4;Z}eA2K6H2YgO2KuiH!knY=wfU@LE`Zw)9)F0WB)RiDH&(FjTf>{S`%N|9zvD^C z|LbXG_s#5lKeQU+N|$G#uNFEbRQdv&6b6&QZFim!L6J72GBk)=J=`tTy!?$H}>K+%Vkl%GK zwRLIqmz*g#XF;_;^YHiX2M;8R#H~Kds8k`Z$(Bxh)M$Wm-!wC-jpwDr(9D-jNji$(S-h|iy;INL zdTQTnY~|QD`(6JVgi*$+x(Qjo&+1eHwzCoEH3?*)8McHjYLp;{soAwr9KFCY_7A&@6? znm3~b{qhZgv^DJpIVTKPO)nj;nVnf&(O|(|Ax>k(NI!b(Kj0V#{|2j|@%hL#y3bc{ z4L`Rk0Y^^&^+m|34grBp5TG#*Fce|t;sG}@NMs|hT22S&`m zzpdIVBQMI8%#3iiRsHa0s&JsWGx?4_*C3aX>PvOJasFY})VA$H(5s$N9X2{Cl9pWA z@)(){dMc%Pa*P`VyG~sqBBdC$4W0Z{oFURtN7wVq@zDs-5|@cj`MYCu>k9vNWFSaJiSiwnBG#XFKqHC8XCK~ z@L7PqD93nGI?n6HU)4Sq?J`ZgnAt<7wfRoT6nZnwIO4`g8p%dD-+*`6B7MaauJLvp zKZaH*+dWahMmd>OB)$16Q)SMxx^rY;VvF#3D#TM9f}tNT^Blqxx`rTz@c$+S4^{o# zq$JL{_WYcYu>+sSvv*nnHBJiT<&<+4K~*mYpQ+Y#zlE7Q9=W#yDRjNGzK z;G8U9TH5*}e|mB>4W^aIotft_Kl@*X=&}=V#1jFKSMM1^6v8bN{k82#$s0v&C%frS zh|TJt%J(Vf%X}HtzS)DD0c*C5g_TrvZ!~~?m3qN}q0QD$Z2Ueg_RP49A%rf+c+I+m zb1cPAn`Qx3R!k4h-s12iO!4pjAuYQ!ptz2$KKK_VW@G+Oho>EfnB<}%G@KO44&&A= zEJNYNUlbl@O}1s1WDi5{{4@AMd99E)8JY7RgG8hW;Gmk^w0ct;4aFrDlue9?-ks&{ zU}Q-MH$H%*Hs$qZ4)q0#(l&fhg$PL#ncw~Xmpnd_it{R39Zni(f! 
zD|vUG#}$s!^t@$u1UilfOU!>a5VA3y(9CZBZPb6xDKymF$*6w|XU3sV6y7&W-%9OHd zqHr&{+YBr_@j&#BaKf>WcO9+Mf!5g%0D#^u> ziT~tgpxPTSroMvi5#whLhDbDtc-wizJ{1TH5~R$yNXFe4#Zj4lrjWsf^)>K~L`!ge zf(U4h{%Ci@z_kA+coJ8=j^JCcF&;$!Mksw;-ImYJK@C+hl)GD!r$d{guPaWMqozVN zJ-8V1J>hZ$DckNFaUcf|*Zj{zbhA4+3jw&@=}NeHb|#CjN1mp8!_0a@igXXnNfHZY zy(MA=5MN49NNtY&*If~{@J#!U%r9UZ#o+$N4_vjDkC4W5`?RXJ*tVUGzjFY z=12ROopCW?o}FQeo7w74?BoqA+er4oB(UU~C1Iv?aA@U37BAIF&I-#mx@e1p#i^ zGZ-krDg8e^kv97m*clLF<{M=R8MPwApjwT2&$Y;%%Z-Obc9x_y$rrv0b60AB?Eq$p z$h^$@4o=KsOfmG_v2XtFQ(hq*F9|;pC?$SYs7)-~wrCg+deUYX#^4b#$~sO=z1OCF z92IuUv|aJ`$FI*=n~y4J@Oq*IIpW5*R|h>_pCA8@nD8ifpM<`p{8EG~E;jWP6YB1e zO=g>B5sFVuXVe$|rJ*v1RDGCO+^*Bg>3LfOlhl|PRG8b8WEw7>EuveTb` zRix5=ccpONI+WA8q5ua|yDy%3_|TLHgZ8ao&Mo_2hPJL|!dN!?MPOdJf zudjFej!&G6Fi3nBcYdtBJ5Ns#s6NN^q~}|g5F5Q z(d+fMHFK~OIekQb*GXhHh(qX|zf)0?hS;7PQ;A{AmLIbO%IcIBsJgtpooH~gxyxMq zL2R!&J-EOL)i<`NbZ9gIQ^ag)^13?)U+zM)2s)8O#S71$u3Iq=*BqYLi;aXe!*q$siwryGV&sP{1P{(h_&7Z&~ys;lpX z+n5XU`N9^YOO|-XQ$O5Ux=1?2`1hHD;*rJ|m6#5A#^;z^G8Db27`2t?s%E9L@4j5P zeX$xA!dc(>!i%^$kS!MG7mC)oes^{3e)q_l(e54tj$Sh3gpxq)qY_$ApTcqqW6D&J z(}zIDi2BR^ko=29H1gY+4+rlC(YTyWiCVm4b}ttnc$^ZNjx6ThGfn1(S1i)fTPbe` z1EDcFGz)QET+-(%5!=PDvZ`5~RRL$!>#0%8A~QTg`BVkln1nUiH6Rpqiu^RHh+J;A9;iw+3mkNu8 z+LafSJ$I)lI?_&EVrPWff1QU8soA_&8oM6^>bZzS(a2`wp1^to9<)TqDeBL?5zC{( z0&me!6g|GaJ94>8n2*`|n$}?*Q!QtX1jr4vNPG{W$^3P76svxbpYMnw+qyVG*0@s@ zhxF>7-e{>fwrDavNo5{ao_Pj7p4URV_NxzYY1}X%ji(ENEhD$~|fB6LNv2 z?hZMmuBC;HqSzA{238B&ReNkNBIoJSs7_3LSDTb-bHTuHYN*$zz{{R50~fKZR_N?q z6fOhb2BWr0swBO>E#r;Sf%2*ZOV^p>YHR*_Wpzzi1d{W~>!JCpMD{haJ-GfZB755M z6gFXB!+m@5q42N}&up$V5))3-RQi)aj_2u0V_TeiAlnJLrjp&>@N%KPA3ZA1l+U&% zF0_USiCgkOC(0Vw_<89al`Jr$5l|QxKKeRTd)&wR_39?tQf}J4nnS*IR@QLe3(X%c zlpLW`!aSeKVD;W*RQ`D%8u+6C%*!B`;IK_iQ7pg+eVaSxcMn3+Ao`Br>0>pfDO)!M z`M%YEt4{_eTN1G3@lf!7X~*D^Dxx$HR7s+ag@}`Lh?!9Pur5!I$@+ z{@SKBI~&a9K21%g(zx#Yma93{&f9x=H`mH9o2zV&$(>C^ogG=bCll)F=_yOdsxe~s ziAH!@6TwDtF*PRA*( z!FwBdGrnHCDkup&%Swi|URMUIqX8NXPREo02^f;?XwAcqX*(!=3p91&D>YrFJeZs( 
z=EVwRXfFSFI1|xS;WBpFGcib>StL}bGZ{({SrHrEcIxP$TXx4EW_i|9U-`=CSHsVX5XP)CbFQuC^{a24WBPJj89`6S z^Uakj?9Z>J@xDLD!ml&vEDe%Gb-!z5G@LkIi4U5}wr^AIzm_zzWy?Y;?qvms_soi{ zF8~3rhwvh@zF>|V{6^z1p{e$E19FbkHw1n(pW%?U! z5s%?SF{{a;25l-LE5S}*j8&%_Zcv2Ir&!FJcAKp7#q)WbZncshFr=j^T#Eh z(bM5bDDn=39lz&N^+GAS@a0EMMa6MN!XM@s@?ZS0C)1OdZKZa$G;CrMH&oq3mJ4w{&N(?b4NbTDStQ&Uu=V{YeLqx=-Qd>fl8mN(6KQ>xE7e zP;LT73ubLvL%-sW?wfu8yXVdOjm>2o0Y|<-EG5v*nYK=GbGIrTMz1Wkgy3|@t`0o0 z!vs)CtPYd;yL)T7jpRb=CGcQrm}kv4qn>vtC?z4gZ8E<2%k_LIGegEBWdn7fF$qz^ z80Q}`yX3G^Pt%+Ig2b__YO@9NJgDb4M^TS1{{rtVoPY;N)S@S@4v{S}zBJ7!3BS4` z7>FgaBv9jcr89DmZh_<3V{X#=IJ^O4z1+H~(Q1kD8 zywUw|BnL)3rj1KqUVGq5 zRilkuV&Pkp_K1Nef%A6Y-z5sM@E>EL2&-M(9Vq5Fc+j)LemwOfy6)Z99iyr>K}Tzf z^=BFt0&6!zSMH~Ku%Rd7Vr8J%kjj@9hT?}jii>`6pO76v)+B(1rlWH__9Z+BIBM+x znmQV@2$xti0W{*e2Yxw$VnEW(ZE&8=_PNIpTha18dqm zuVXa)p!i=hG*|pY!c!P*-E?}J^HG1h(Y=fCq9G^aOJAHCU)(Z26_BY;43&V7&V z8Sy{USq-ofJ&dd{pz@s%!-zq0&KS9?B>NGl_8cVneSCZ*{_DTJOYHwRf?2%*cngiA zLdl@Ez1qv#Pt>jBC9>(Tbrlu?7D{^6e=yL>VC3PGi!Z0N)-O5=W_%G2xB{xv-z=cX zz`g8(T*OPIx4(+x@~oi`CiJQ>1dX?tNBI7el~Dbp#crI!m9BO_-_VR^L}mH^c>&Ux zH!l+ruy&SKS6&RncdKU~jq@mr=hFNPz~|+0X`7h9R){W~tu!S)Kk6PKODw0yc>cbV zt(~6WD}A(v%M~Cl@;VDNREr{n^6L9{dXNDr!ukdkVLvi-qhmD=E1&eXlu+p#u-exkxd)z9|=r^HBuVdYR=@@*yy~^&Kqoq6eZP>37WADA$zklIkdnSN9 zZora=+L%ha47|kfZfHs;1QRV3DD$j zr!7>$2Pz%v0ISX^qKN4B)rHuBrjqOupL}NfOX4StA0jE*fjsqcZ}ne4qZQMJjqika z<-e-?WO`h~7Mni_t5C?}s+{6VsvbOSn_XA%(l@t7Y|(DI3ZP;!tdLaCr->E4Q&h8+IDCs4%f<~eOHF#5UgT1f)8N~aQzRDbxAa!#0Le&w-V>%?E?tvmyKP* z=rbcx>b^^o|FsUv9DqbfFy@SlqETiM!=rLb@d}3ry(@=@RV;k8&!{M^i3YueE1JuZ?FoNlGpa?t(WUk_H#=bpkGD5lm7XuKIokH?2NPJ?qp7^JPK8S0B|pVAej-$ zAHns8cW>??%k1Sd3yTA{bq)87=FM!Fl@xG7=Mccs{l)`$G1&l$YH!ZN`0x31U2=mf z^)%`eDd39v4qF*2IVFIUk{LIc^H7s%$o%T&)UYi@)l4+s^8Scs1;okNpL1jd zw8!hzNIc&{oR8{J7Oox{tc)%j=6~>y7~#PPJW?}XR~)Jv=s000u7zhb^1pog^>shS z;p*|jRJCM0l_{Ue(0%i6a*mse&lM)N%VP~P<@e=2Roa`KFr9|0KUhbLzKEX3lRZx> z=sjYc*AsHbGmexjBm;I^OiiJmHS4yJZ=-)36-5;HswlVkdC8!1GkI;T$OZCg$e^+U 
z{>9ZWJEOQS^y_nT{)hlxC57gek;n9`b&&orWln?ymi#?TbhuwN#sx%u4P29Rj!f=u z%KnwU2eOUi!?5Sl7bAdiF>K6ddL%5H<}EY?U0u&uI*)8Ej& zLM_^oXI>_|+}y`%^y9?e1oxAj+ALk_HN7TP#8^6U-CU`+IAQ2wA*reO+2T=w*MOSw zX|c#JH$7bwGj>eRQa=(j&vh(5Pph+1A1~cLUiR$;)%v|_ufTL*>(=YPMjx*xLuo`I zs<2ui0#>RRnyu6`;*IFOg~`2H&UvAp=Lh-%zlXBc99+#eLp?DCu>=|81=*Kd9!v%e zmK$y`pB6s{x7+dc0v7kF^}`9hIw{5OZ)S4x-Q!B7b_w~A1}4LSlzo7-oMQAUiZ$1Q z^?K=UGWNoI77wd%|DA2k$c~f|oN@2*7uBwNsw2^eT@4GRJtWotiUR$=ZYhXyanQuz z;Rwpad#LBX=^(q`>$F`NFe}k?l5EY4Q=X)?nBPe$RM!_|nFZX+wMzcp_A>h+@*HhH z!D_kcg!6a479K#Zlig6D{;JsbyD*fJSDg+<|0y;+u2XoXM(1{x9wvhF^WtE+p#BNc z6D4I)V_wKph)!xWoQmVv0lHKL^9?w z{#DDJEBbHBuIL*3h2Sw@Iro~Ygq|Qw2zF~~{W)&10_rxb2vwe~73))RUjIu;<>v@k zY(`!~sc*Y2`Vh2k*4czg7XKW(Yr@2_=nYA%uZL#xUEBhv?kPuu2 z)TN_B^TO8yqt6!(GdL82*KF*F@2P9)%)4?|r`Z;^{Qn^gTSEx%B^XwqA>oBPSdTy81j)t0p7-Xk`9_@hrS+~Z>x zjtL!~dT7@d+6%=d3(qBoDhP?o-|WGJE~*1n+3`CU`Qlry26HV6g|~gmZpecA{h${2>U$;~4Bn??>l1p^)< zfa-gEJ!$$GaFMJ?|MXg0E^0--(h@O z<|(m|)q5rRkNR3C|3#z?3l>=mb~@9BMd{d_ukg2C(CzXMGGoW=ya22@CN1Nn13X3- zkn0a1%$JUx7mJAF4Wo&a?7Jt|!tRjt0-1Eh#em^uXkDih%_|I?c1>cSAnb z`K4W6sfSOhShI(`TCLVHEkM5V2zv@PAw(rf}mqozfRwjprNfW#aIVhEMU& z5@@aQc#6bKyut}!`mPh(aC~;b*86>EqMJG>FNYpbLKGr)`=4Hg0mP z@iG^k&1__wg?-%IVhK@oN`aHM$g>(qd-NO_;U@RnVje-r`}nI{vjwa=hmkatfOV$pyoIF ztD~*o+)u9AJ428{G?{Zq^wSEZPz8C=yYEh8ng%IlP(?N3?N`ur0t2P;;0!6fz7>~5 zr?lXM*88<^Xs`6l5~Uc*>h+xW36Hj`URK2J|4_NKPDD~>2kJFgNx)nJFwy zBlO){Q)i3}(WgB!j0@zS7ARX^{!w5kFOZr`$YHWSC)31WdhSi|z7s5%Va60qG0I*Z z8zQ5-zTRJx`yXRblMm>pU!~$(NG(Ae4Q_J0xy+i&GAbQU_%+A6FLb8(7yL7OO-j zABFMx++r(aw8Af$gH|53>kB$E?FX^?;QD8+TumuN4xBaPt4g_l`!A7PsE1@Hqkuar z`we6@_5VXyZ)qqXDjj>hm*E)Ls2hu$&Gddg!xBP)eygFL%2*UJ8`- z63ild2<5FQ$|+B85TPO$(Q95V8KHu^Z%wt>>vdi`AcR;p{ErxK57)t1a+*!xoTTVgCWJk{ACfdUq(LnEl@_d+RZ)>|p9+h|RD6=uU!8YL z*72m;7!_IgF12%IHcE<}(i9N*I@h6|XN1If7QLeZEw|USZ#1M8^&aS%gZTqWP+mAm zt}~R|4bNnRIO-3iC}&z33ak(2-3Jb}dT_5%62Os9Lj%?Bp}rW1N4NZ`}W2$Pw^`!o`50F76TTJEMI;7%e9?YjBx^jHxPJeU}yq``~B~d z(mjtt`=d#2bSq;F~J#8mZ0Tb$`k! 
z=Sa7qxGB-Uf()?OSgY2Y(E#a^ta2t;R)KRu)&%6GwYC1UiK$wwHr?|hfAQlTF5GKO)?=5n2Qz{9H0r7Tk z>=74kGulKlqr@O8DVj4?eqdy~xBn5V7TG0~dFiFiGk^?iBU)L855wrKk$DHsl*A#n zz>ne|%2jg}Uuh@IO&Xte*v*Co-${F_?g^qz!_+y~H6)m?qxf*)+D{$Scv7H*1xh%< zE&G(SvGsVWp1LfpVPm(ti7IgPQA6eAAE@)SzwPnS`;etq{iTys_)0%p-^zegE5^u1 zk+p!fNN!^gybX$E1D-A*WM@|&RL4jl-w#L8zi-9Q@$(w=6~#ciC1{*%-7&()xFGP5 ztUd;X5A{pOf?H;6-j2JpP~DB!o;G{N#J^MAk%W+FU5W3%4TC8+KzQ-~9Sw~|A=yvO z0ZPmP(~UZ^8WPWMT+}E!j_1ZUjzj6IHHfXz=82*cQB!K?eR{53GJD9(fge!Ov8iqH?Q^#M;bF9XI_S?ib1N zrxjKTHAEpFmEkskw}H9E4xXVrxtGuL?~CbF#>q03G;Yby z=4@h>Uo6?~a9QjR&Rf4%{t4}I%|#x+OI=vnWCv3TE+{lw*OC{TZZ52^WQ!jVP+lbZPN~=|XT6Zp`+PLjNWoJ>W8%QKU#Bak0iRB4X<(x_ zGx#Zpivgwk7qV1YnMNP_&SLRTIdaFIZTa$(QU+{4o|60z2_0gKR7U#68>fmbAK!{a z;x1!PE!ad4IHA1a-Jal&G7xt*-|O^p8yA#BKPudoD(Pus0XGD6xD;32;0H2WiqYzc|Jyxw3gF$#vv z(*6R2kl$;YkGdmq@zwL7)4Wjt>fwBSiU96SIo%e^h-I{oMf{(O0@s{}cRK%JW z^6zK2#?XYTo#)Y{FOwMSZ4;gbv<)hxW_R!G@>9ZlA@fPcl~o`)dNqxYPU=U(fdEA# z_seRfX^fU`a8+J9j3IC#vIRWAJ`8)uPM1WW_nMePPz}759At12*hwB-361+UZ$j?x zQ%(`Az3dsd7CMv7OZ+l-`Kd3%eRnpwHQb{6MY?%<&BEMPCw#OHpi}IEme3CLR~mDT zgfMn6ZFEhhd>6OWgHa?M*IjK%E#S<91y*e1!X}!0&l;qak$X^APguwrEoW>V6V=b` zwL-5208&?eD5KWsv5v-f&}cu1x!81uGl!Iwx(Hcxu~N2x{Y>ICmfRRgBtu*8*ajbR$iDZK zJykNMyaRw6NYv{6*5`yp<4rqtm9mTrbsS3MWeUev$b*psn-CtZn5n^aBB|8*udxio zgZX%B>sea*@)tnR;2&nX8-=&aTM49T0T#$cc<Q(`a2655;#g42@rMYU6zQ&S=dVz!?55FqBMEi(WXMwYpF7b_EDqxVw>L?%H@+4Lz)~X|^v2NtJiE1V-A5c|8hu(^5D9O}y&yC+2|5xia@A3aY3i@lq z5Lx!MOWoZVl+~BFFa`1~a`%-Q_!*M@7=c+_JP0+v8x;gWOMf!}pLL>MN5IvWpVSc5 zn;O$#6ciN9P0?;}ZfC=lz{fNP&G)CU9l(U&jT-$;%wnG3sd$~SNHpJ(>-h2Y>L8NV zA3Z7?r^OiNVqMV?M^~5)P_i6nzZ(>5$=p862rqEoZu!L^iH}d^u+oZEsyqClKNhmR zQ#vpUE)OD6t7!ceI6hXX?ckoJ$sXV~+kQ4KruTX8T=jxL@M`G?fz)pA1$f^q1+|MV zTJTan9{oxXn^Od>>Z9l<{xv~LyZ2gLDU7#5aUsXkZJ~7yu{UX%8tc8nMec=vpG1#J zBKnfm_I$fQ97ANDTWF6rTsx_$x90RZkdP)S$`w3&3S81uGJ@8}NNua&ymkMUAkSr*k* zDQITlq)wltSx>O*Hm_U!_zuRE4@5_4jZ&c!xEKNRQ7+b!P zPt2@+$C|`A)LhSZ)=@aqj$zHjiFp1(ML`IsH@=~=DJd1&yfu$2N^UlCtEgtlYA({hF@q&C7lmjaCblG=t}!(|O;*rM^Y?J7;61+18) 
z*n}EGs^`8Yk-Be}hUoi(vegw_-#lsJ{B8qpn!N6tCRVDm)eRW5O?pVxtp7WCSK?Hy zT_S5kh;?*gc7SE9B~XOv1#88V-?z8CS#DEt#dpUSEVqomlgInO4#Zr**N4BLk}kk!DH%NaB$0@( zo!SZjxo|nR-U%0k&EJ0v3wbplOmV$mTjft;sEsA%PwwHYj(SP*wotXI6<=t>+FD@M za$;?w*~Tj&pc*3-Ip(!*#c|{mfbKqX{hFM0q$wzY)DPERCc`61(+j3B?Oh0PNUCsV0C zU>zs9k#(Hux&B}M(GXz^wrD)IXhd;&shoRutn=jzWF)Cc z^jyK^J-^E{oCMv=(;Hz6ndk^Ot}D^HU3jNV#12HhXh(=&PgH>^0}-U00Epj2W%q6p z)f^0hgsT0?w;8ly20PGA#h0Bx6~T8e*RCuwKp4hq zqfjbPLO@wbu~WggQr^>08wq{{xF_ijs;(jt?^tN4Wc}H-dB_P=(0(*_C33EVVWdpw zYb-23r-Bx<_gh{5MXq(i7B1-zClr{7!9=oeerpdt?=6N?5iri$OVp;PMV`8|X3)6< z^Xu{5-1ZryI_D1nTl;3WEt8Yz{xOY4cQ&73)S_h-ALgUzQA_eY@39E!*4*BBLenF7 zsSJK0^{()aZ|ie!CtaC}qDO41aL$wMn3Y*c`QmhokTM{i|Ok3u|Jx1y8w7D}d zlo5OKQsrOqG_VqCVFTD_BS`!n;J6=@er9BVe?A{? z(bYr9+MxVDNkOx?p{EMJ#`0QubhzVf@;8kkpU6;Ks9roy>2pC3Iecka(Ha`r>vn?i zN3k^0YGh{s3ySGj)0#)GvHbkzE6=Crg@w6KK5CgaTOj?hM%kBNNJ|P|{Q9c?D23C( zO=QGGz6rW4+iC~)nO5g=SDIZkOBy^`MXlnz08M-EpMiloIc z)yG_0#U?G;e}d!*_{lxAnm%(cCak_Q4{?VR$ITKpvoU4h{YUr2e&sV)CzT3x5K~kb z^IYqYORe-ys}P>-)+`KT+3)6*+`*@^3vhqm&J}~sqa5}>eBy){;thQ&@!_zWP|u0{ zk0c=&$vHDCIruF@1M|D0u!qIegC4`!{CCRo13L79Ef2Sh>EPZCbaIUl?m=RGsSTmL zaR90;)R#tg!$oWtN|j2L>G<83 z4ZbQb;G)Yr7Sh=QKndiT71+eJQ-Hx!LwH($C-@|$Xwg!CQf@>+pVJHKE@j?eKSF#= zUM;;H4@x41&}h(;|J?gkq`9p`Kz$UEw9>a|ViN7qfMu0Sx>?t@0h)@uCmk>S`hnR!rTq?50 zV=v?e`=11M8(Z_}RI>BX_V=-#OFpZj#C z6)P((oal|hGNbaWz0>m1La;Ds6{5L@|IN)9E}5_&X`c!TwMF;uZ0`Oz=}D&NTbK@4 zofG0bJB>;L#{VIM^&B_(?8#Fc7Ev0di~+Hs-hfP!j{Zt2Zj4buJqyV&_@-OW=k!oi zHjJb;@EZu-2@tzV(6!U4ZeMH(e60RQdvKpbAHYfOS?NNDG4_te@?>c@2JbUD6`@h6 zO>g}kj5{&Y3)F!lt*FEQy~CxoWC zsfyDAB~)hBx5WAKS`^NVPUkftHh9d6KD6fD9)yb0t(@5`r4(}@zAc#cGCr9%*f$H9*7yP!^aQ52w7RbS=&aog*s-M-(O%>^2;kxx}X`9E;wCI3q{ zkq;e(`-$W$b(zA!XBAXs)`}=iqhVk$B?HWu4m&u-U3RO?Q*>MTFo%RtZEuM)P?=;` zx@bt!DnK-a@_7E6#suqpG4P0>G?;mrGen1*NM9ih6-D2eEkOuBla>1kHh%Xu5?@)q z&Q^TbCX}ePs2=rr88rEDmA+0nzdl~L8Db=0WlDP6bb(i3H-^2SHT619E_~YS7{Dnx zqL5bdh41RGd37K=6DU3xnUT~vw9z^2OHP( zCSg%Ao-4DYh9}nRBnA}m5Y>4mNzJGjxjCdqiuU)`gQPu5iXk`<67ax1XJ5<(gKxhT 
z@lwPQv}`c#u~h+t&qe>b#Sd^`1S`UvB4yEXz#Vcl&F;*p9-Wn)^-u8)Hup4!7l>{3 z*0~fM7L#mmuEGb9qmt6HA+}md%Cz2i($CahlN9@;$n9xcYQW@UVFo!oTjp&He?qJV zk?LR#l3b>iB~NF!53EX5y(2qZZmOB0AvpCHPgwmi$=XJsF7a{m>$Yj(Y$WQ%wTgg& zr^|Ax_s12ClZ#72-Be>rW{z9`(L(iF^5mir%SYjFw84wd$CXn3tx%bYC70^Y=Y=|V zlF^z+#MN(l6!Dg=lK#?yeBy72CHZRXL&GePxj9Q?nFzYa@6 zhS*43|bC{XdX(6%f~uirst0-68w`l(e961V43~D-~&kf zXn9@r`XxZ(AE}_JvtNpFj~9IV&e0o2wITMU`cj_UKG3KpN8Z>oV%z%JWD4rq-(<>c z+jW#K{Li$nA#=n`$N35^f%7h9{n&+j68&WVhBFKmXq1pu`0Oj__ReegSk{8t5|~R- zu$gugo72;mm&pID%fxHC_IG^QhD8Ge3Hwq7x%}f*8y;@=Gn7@(nJ{YUEA`U8Wux4q z6OP1FKU`d<2*njHEsw+s`fuD^b1Bt2m|GC85=H%xuu32Z0M`Qxre7L?0HVxXCX1dh zp1;u{M}Mo2SzpFTlkBi^#lLWDL?`St$0x1cDT;EmsY0#q;Pvfv2whsahn+fXe`#2%AO;R0hc89T$CSNI=X-U@ISyRa|+JzbN%uEJP8KA>zt$S z!AIp^q7;DF#`i+MQg|_p+&Gp|;_)u)htZ@0R2v@c%=CfJ-V)p&UnX8HLxUcbJjjWK zh8w-Y>*{}}ZdeVpI_b~qU(;18!n4hm6W-k2ayRTXhfCiwSfPn{V+Z32+Pt7^@^uAZ zD93u0fzC;III)hcfNG66WK->|+_8*VmSBws-z);aacjL{g+8OsVBq+BuX>XCTuRLborq8ZR-99c%f91bjpu4AY!tsG z0i3rv9r(6rv<&kHYocE-FBpdYEXi1kq4IZ=*POojJCy7c$0_eevGYeiU$#8TLP|Av z`9N3u&-Tz^6pj!uSaK-#M8U;rbmVPU9;vS#w6a=NpHMVqXVuITwLu{a#%f}Sqbk$5 zOK5+|4x@SIF;EG`ql$~_#tUzBKZ*V`i<2S-)4;;wq87SbTh&3w6RpetHDB;i7xLUc zo@V;b2b~>G^6o7bF>QhHCl}F9yB|=oyGrWpeNHsD4sroqzuFDd38Yxny8fz+m&PXyt+WuVPGV9ZQ27Y#xYs^S=_ zVTuhO84$|)mED#D{SfdTrcAiDSa1~wJ3c5Hd%z0uB&O-6cX#{?Q!|>t1IMTIe$tv} z8c8neJhhL)=2>^OT7eNg*!51j)*E%{)OBj`6SsWHS6zKF2Iq8_@Qd2ah1!7h2ewS< z@!B@7$BDgVMa#hT5c?&)F^>Y9Fs!p50)JiMdQbqU)&B20EDIGs=-(t}a>b&VLaf?^W5Thw}_jIxu+UahoJU3xGTd{5rA8?)Gp@Von^ zHI#UcG}SjfUGa(99lzipcQ%0l^RGpyt{{l`#Cpqk`}`X}#!`#hI0k2K8N=5M0iQ?i z25dTb9_vC)%62cK>}8E#H)AeP&4;Num=BMusB=y=XSUb$@JmHgYnaFSmSD5Iv~0cm zjdB&Ws3x*(?TwVcnBN*&geng4$3d7*=TKLjXt>e^;5Sg=mZHqz-X&*bWbeBs;@jO( zbmM}tBQWY$AlUW@6qpA_Iyl?`ZYlpZP;(fJA;`k7K67}_{M7>m_auQ;4&c3TcT~<% zV~)03mBACM-o0XcXJ*tBQNJLH2y!FhX>HXCqo=;%T<*UZDP}A6C60GXMVNBGe?|#6 z^e~pVT3$nc4<&-E#Z&|e?WoI&8ivy)(mCuD(YcNolf8?LFTnl$k78IGBuyH=+x-rR zFAnor{{bXwI`WfzA(T?D%BSQlXwj$sanu^g+=pTAyxLfPz`x;=lwOP~S*p?E>EI*X 
z?BYL`q4o%L!7=r8x+prc?_^MTEeB(mz|zt8**@$++%JI?)9=_Fy2|Tqbz|-lxRHyZ z@nT?DX4bR&y*03J#|x$ZA!EpL=4KWFuu(C~H-Anz>;k{oz#pn;_sbX*_r2CEwZU45Gu3HyI_#lLu$gSavE4EDJ!! zK#o?$e4J|pg&ta@?IstVy$nc{z#f)-mw(-T4RjQGZC`nN01^es>?fhS13{9=_C12; z`*H7zo{N`;9=KW>(=lu&eNl9(pYS$QTmLcNtOmhOVAABW#x{H=>=>I7^A#@3fG~of zK#mHOXM>f%r9zGt{sZCUBQizbhUCszqwl-l(<1}7WEk9! z{8SPvgmu+d4M~))THV*L8BLF(vp6m;PlQ42c7R|z!gtr-lmmK6qqc#N3r)jThWtz< z2&IwGxFYWBDT_X`@pr&o0YzDSYb;}ZaJ%=;Fv%ExGohdw{S)frr$y?jY2aFPCrR1; zLpFPS|1y$;H!|X8zyy5u?gl<{y7|m63}l`J)_$;|p7YhPM-CTux<{pb0H9}?xL+|; z0I)g7-0*eRD==DB0|dGDvU(pM~C>JJ$S`GEXx(Wy`DQ7J{#`uos} zv)ofqpEO_m@R7G_Xdv?g&AzV!)p^V{XJzorm(Q;*(0X2YRyo?iu$UNJ^l++UN;3!L z&m>4czx<0L$ZLG3vLszHR7~P^xktV@q>@BtA>X6M+pd?(qU*k(GnPxo`Q4|S4%)`0 z+??kY?|Aw|;_tQ6^weYWr*KjCe|LK40T@Lb_Td0k;J|Q#n*3jXMg7&h$Xx;)pv|4H zWNagt7+>cRwCV6&ICr$v}{T44o{1mB{5_~ZUBhVIUb4j zaKS~qan2z|rLs7oZ5~mWzd$eB%qMmWCGPr3te@u2NhBPfbAc_1S-yu4fe| zSQ;Z=0zh*TO@hq0?fpzm{)jf<#I< zSn-P!ak{<(cdX^maSP;a!BOrAZ0l>5-ylm3ZZ6brI{zEj0GZ9J0ArDl*>-8UaT%VB zk92D1Ax2HNdfP+cUdl0Ip6J@`shSkr?oWZ$nz>GD$=HUnx}Sz>uth!+4|~qz9;!*$ zxA7V)ulfq4@fX9~|K7sv!^&fF9a`P!W0`42?ENjOv0?bZ@J<1F!@uRmc68z(H0&VI zE9y)(b?+CcGvlfx711{lt|m-vH2$)C-SQ^~e1K%8SpUP5%i|)r!i@OnYr4Q1l9Apr zY5XrT)b*x3x{+t^O$Mmnz87KQ%r>;K9J62i)&M+{6Zz|0)OZF$NR{`jd9bp9+^X?^ z%~c%Tgx?+r2N*B^>yl$^c&W5RTJkZE)#{$EeqPI3R5xVw9vnLPN+<>{dmE#baztXm zGrkFm7Qss8b!!LbDOa2h%ha{E8X8_{9YyxfT<{|YI#Yv2fNejeoYfWGg^+VNQhi~- zG-%dyM+Ors~OlA zll9Q;43KG&BdEycp!R4!*7=9n_r(rgl@)*V3fB72zI3F472L zZZ004Pod#>CR4QkBx_vDK4w-820J-fl#(d#r1T2|Sndj#lJZ-8%qOm=6&39I1t3?( z1pdT=N>ayGqo|MQiGf+RUJy5gZJffZ`8U@g_h+^w~LGzOrH1qT(Xv%uY!je=0(^G2bA#Wp3m@83bp ziabs5$r(t!;Q7stnzw-e`niWn6jl7?n3+7fxw0Soq()j8W%9LqF2z0;fcmNd1u;Le zCY(ndj;};hX8knCPu4M8pMjIuAyikt&OSl4+HQKLL@zR>E5JFHT0ueGWl+xL{G64? 
zmf?U;ZmE%R)du5_u@^evg>hB1+bthXeQh_f=!RCBRNvTc?am#_rKx5{@Y;$?znLC& zz~>JYhZ-{`hECd+xfqL$9quw2?r4ryG0~Lf1wWow#>2&~v914&K<-}>izmNsc6a%t zW=4?zu=T?ezgbk}=zG{ufkAR3E@*mb|K&J_h|jHxGOFa#6IS#b&!V_ed2h?#f2O|l z8#;#PQ!;qCN#`QZ(xkx<5g(K&l4>o*m-?0_p8R0TQM_{(!_0%#w&;A30#2YC=F(Yh zN=9N)E8AjGcmR!pK+hp-$#SrN@T=XEko(F@kkjHTAGi!nS})&qX%{N~ z;%1_(2J`z+%R*?o??tfAl8)Kkgm_Lf_MEJ|y^GMl3jcYWZG&&JdS4CtNx^`bf(yL{ zRFqjM4F^|0lLs3ne3C^^ugIX-|Hz{Nhq&?HoR#c!m0@MHjkqqdQ`kTP*is zV$J+5c@pB~1iUAOR9I;2j~npYBpdtshEqZEDp!;Oj!r#=i0&sBqgvvl!_lNmU-v0_LEe0kd#Y7ejLbeE{d|mtvb^rlh5n}-qx+d+-Gf*_kQB#$cD`)8n z>()ttTrci-|CQ3{SZZk^Zd#KevNHVhuFmu@rtci(I1dr-&uk*^Iccyp4Og4SovSU!e|9u{>$}U(N8=586PiEg{?l8{QHIt4V?rONsG=8SVLs(e zWs6Hc=nQ@$m-4Jq*$5)3_f=sEo^-J43u>k0ks&$k6A;uI*)THgev>dH8X{m=CvH1T zq^a2O^X2x>gl?rvt;gKjE}OgKR-Hn?;5zUu%GQ<7rAaXry1#@>NBeTHf?kzG_Rlih zsuQ|R&tU6^S?vj6ooR>tE2rmK&iP}Hn6t}uDV)Fzxsa?*uHn31GAW{CAgKTC9VKFJ zD`_lCN#1tV+Tjur%A#iQ#bFe5T9NDz0^74ghlY1oRqsov_l^oJ;6)<%F^n(eny-7V zo&1O{bR1|CPJ-6Zrmt`>2vx(qB%qU7Fe3Gw{>%R!2+kL6JtXkM{X1&EiP~pc5unS- z0*hkFH>Ie4>;)2IyhAXgfy6}$vDew%0Ypw=E|Z2wyt@LkofS`+DzRM6qVv_GHyREH zOcaLnPZej=tr}|njaY$famU_0P3?I0byu$$R%bagI&LQ!f;r7v`DV4QS!+kLW?5#E znAeIQm>}hvp|s2j3p>OUmMTCZQ_KBa1HG-Dk|>@tSS*M>eY{W3 zM11dU;?C-3GU31kIj_ZNQT2vwxj#BjQhO(}!L4dO+^G<^Uh<9BI08$)FG;L1AljnDLity3iM!-z$K3jKuYNsPz41cDQ~`k==VznbBhoLHxi?Y z0ZF)L;MG15u1lMrDD3mgw(RBMtz9|{>*iz1KUT?f9k;g_nRsIE-y`KF5fkRE;Q3^? 
z2@w%&^5{wLK`4AuRuG;q^|h4=n^ys$|8dsO6^u|7c_H3m)m^zCL{u`gjG|86l*uyJGZx$E~Dk`ZH-f7j17_HIrUZ`YW z$l18q<&7r82;uXRD+2db4G4p$hjiCDXF~CU5(7M32{HauZ>vvr1>)4iRP%?wWcLX~ z34b7&1e%XM02GaJyKQ?UejzJ06rri2T0f?o)+d{u_H=OVh%1aO z`7LH?#6wsH zm$x$>(j!8l(|1hhe*a?ue6H!xJlaavumk`EL!A_s@&ETzKTVORe=U&hM|~oz`nss| zGs^q=-F+GK0A8@Sh#w$TDD5f#HvH*y`r!+UNnoMW^vjGSCVJ?E`R|R@na5L*8xU{U zulAdLz_@W9Opyn*wDJyQr%~h;fWMwfU-|gDJ+qV<$>3@tr`rHO8l}r-S_iGJ+f+S~ zuZAfR)%*M$vzXa*e2K8p?<}Z95x>oi`!<#cU?oZY89waQ=2uq3dc{%MvaWm;TT!ciER30%vu!|Sko2R2YL6xz3jN+^&V^`Tn9vfToye7SD~u?)6!)O~gP334Ng3)6VKjiOllg()k+tB`@y94C;N5oDbd@wE83W|Xhk3^DI|U); zel)7jz1H%M4|$sYDS2XXQ9g4~Tolj)ry9o6y%O%-M57VZee{v{;jV&ho&z#= z;&Rk8-y8%A@nM)o6+?T-N{aImur88Z2z~{kT4|YSmPDj9KdL1lPt;=IyFD*SmpbdhPD!-RCVb}yY1$lkywzum)Gnh&;fjGZL~5h{SJ+o-B131 z)yfPB{$_tFKFiC?jDw`{={?Vc2Y?YFlu?^$d!TwwYj!xo#nX8}GmK}C;J|}%{`>^d zQMrD9KfqmlYn-zhv>$L{6`+v0HrUiw)vIYoCLsT?)WMYykW2S$Pe1~x#E@7!i#wdL zM1<%~O^343xV-yxo@Hms(h|0y26CSGR*XN-KBIEujK=HOcW;39(Xf9*YWKw`##bg^ zoF?w)+76~x#UEfCipkr64%7?WD*%KHLBeN-b`XMa%gaxx#RhD(xhJ=v6zEl1VfoP% zbNV+X(`w>sY$VR3D6%rlcC*LzP}l`&M2MAeA-mdAN?{{T@816KI7}Z~K!|C;*cL4% zSS|}}*;B92a&u9SB{uK@ZCFqFkD%vBE#JWCS-r_YYwK}1-mh#2tI!8f^z3ToAjtW5aDdh)!=@jY2-5#g?y~ z5t&nP@qn(;y0|QUSDy*Co2*)V)xOq(-uJ~*Vl9vpe?&8H9+KYVJXX$zU$|IbS^*Ju zktV?>Rxg2`J=wl}Px22RFlG_796etsR%6a1spXp^=e7D`sp2ux_0oAPcKDaSAS_U6 zI^IO^`6JKij1m8PHRRtfA=4u|P# zfyfrduSTKFn9WDh+gt1_c^(*)aOaP$CwxFN0^5`rwU>90Jm=aX)1J#?_R)X{eeM8e zjRo(DIFWM($sYO+o$r1>g^HyT4Lb~iFxROPby>1_P?YkXFY*dpBdSoP475vTaL~& zLyVO?j69KxYh9{sxr020@D!tgCb9Fps+RQlzn(kTrfkT1G>W?qo_aGH4PIet+>-}@ zQokMxTtKadr~j(&67C8=MKl2K^Yv)M42+2yHgUn4u`X2hKSlSk)5(XI{qa-hmLC>a z{Q`R~YhGSs0&1efq*toXu`91FtMs)DEo=b)v1rSekde`n5)9oc^E54U+Y%R$VPH0T zE(9`DOXncZ+^5$506;7tJX>MlKe469Rh}4x&a$Msn?${f|3bI*CsU8Gj!(m7YgrMm zL`bUG#l>2eHiCUcTbXSIimR3^4D%?G26-0Y9&I`-6hgosqv zs7+KVN*g^4dtttNG+VmDD%H2&*=5BL=>N(uYnR}Yq?C)GiJ{0xbYpi^UR_$4)khEX zV`KNpbfh@DjlC6Za*KeP`2QN4`#;xD-zPK`9EJx!48I>_J%XPHMo1u_Os;Id81Qo9 
zZUcfFM|`*^-A{i3`+yQO3(O|8#oC~$*-upQ6<&&G8V!8YipqD?8q4fJ8hQQq)OE4- zFBvqh(JV_64O2tFQT_m-C7n5YB^8khj({2;mJdGy) z2<8@RJ6zXRqs<`ItYX{@)L)VjLKUgRM`Qat??bV_4K-QBcd*Eld@s#QGV0M%u$_1> z^wA1+T-vVL*+HfG4S@s*IaGcNkJtr^5dy$ctF^D|b8pVKF+s#H5kPQkyllN zX)d*)3H=MrbOgl$m`^F%2tGh`S^xY;U?pVel9OfjVm~v{j1$p{``qFCnQ(i_3fb%C7flU*R`x<7z59{`a%8Ju zyj@g~P+7ar-Tbi^7PpLL_k`Y`#PlNfq7%Y_P=9>!=PcNVbbTBaxu04afqimgDA6d4 zDP+WUJgFUq>S-9L2nBrR7&HIn@Fu*OfBFNc$7x|M^3)W-hv8oVE?Xp;CP%vjF;cO7 z^kVL~Sn1MTyjZ8uz&x2yd##31vKhe;uw01>*eYxbYnmPF=`ra3B0u7D6Cbo}_>+Qa z1f0yDy@#oB#~0W$mmgGH$W+g#pQaXr<_4&txf{n7B%9yNs%msx|A2`$eHTea&_EcD z#Uh+XEBURLQ-Y!i%W<1HcF~lGDqrZZv#E0^tO%vpd>O8jX45R2tu>8hm+xCdX5KkF zS~E5J2kn9To7zORpNFi;a)JC>2UG?i5G%Kpc4NA;Z? zRlDy^uuIRK8fbdkh#ryv?L5rr_{=c^@xc7!=JVS)ccX=%rDz1xd*S@KC6hyP>T5H!JI{pyF>WhC2MG_>HIEkMTxuyl_x|(5?;fi zq%*4I7)w6o{4|Yge+JCq(nm!NsKRXzaaU`Kp^4Dc5TR(SuRRM8&rd3Uh3%{v?Ou>B zJdYg$(K8W=q+QQDw5b35_;UE>dM`seGjZHs4bf(}bCCY1k`AK%HF?j`bd@P3AD;LG zAN$a)$5rq!ty4?aG!|N37IaXevpo8^mJ=1U>t{&il24fr70e~yfk(^$$dAYFC@W1> zz4??({@gKrn=lxU+`IEXuWdf2LEZq3-S-L_Sk)*tQ)Iu5L6DHZq_s>gGAEXpBTwfD zCgpSA-j*}d-v>7gY8t7i?}5bK^$~=W?`>XjTm_sSf{;g^4pM&mjtBr)pi5C9?w;pp zTS+C*7{+H=d!C8s%k@%)3_Z-3lt|!sEZ^=bD~fMc!%Xv&7FF%c9+t~W2PXUCqvi7H zv=%=CIadw^Zpk_{1ybTxEE@hKSl9f!tn#_w?;D^eTfIM+s3dA`U}7$_W=CP-BFT`}nBeDy%wft~+Rc0b zu&uoePLmOeJ83>yL_M7VQNM